In [15]:
import numpy as np
import tensorflow as tf
import random as rn
import os
import matplotlib.pyplot as plt
%matplotlib inline
os.environ['PYTHONHASHSEED'] = '0'
import sys 
import scipy
import math
import sys
import pandas as pd
from scipy.ndimage.filters import gaussian_filter1d
from sklearn.metrics import mean_squared_error
from scipy.stats import linregress
from scipy import interpolate
from scipy import signal
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.stats import linregress
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge,Lasso
from sklearn.svm import SVR
from video_process_utils import *

In [16]:
#target_col = 'KneeFlex_maxExtension'
target_col = 'SEMLS_dev_residual'

In [17]:
alldata_processed =\
    pd.read_csv("data/processed/alldata_processed_with_dev_residual.csv" )
alldata_processed['videoid'] = alldata_processed['videoid'].apply(lambda x: int(x))
alldata_processed['target_count'] = alldata_processed.groupby('videoid')[target_col].transform(lambda x: x.count())

In [18]:
HOME_DIR = "./"

In [19]:
datasplit_df = pd.read_csv('%sdata/processed/train_test_valid_id_split.csv' % (HOME_DIR))
datasplit_df['videoid'] = datasplit_df['videoid'].apply(lambda x: int(x))
all_ids = set(datasplit_df['videoid'])
train_ids = set(datasplit_df[datasplit_df['dataset'] == 'train']['videoid'])
validation_ids = set(datasplit_df[datasplit_df['dataset'] == 'validation']['videoid'])
test_ids = set(datasplit_df[datasplit_df['dataset'] == 'test']['videoid'])

In [21]:
with open('./data/processed/all_processed_videos.pickle', 'rb') as handle:
    processed_videos = pickle.load(handle)

In [22]:
processed_video_ids = [x[0] for x in processed_videos if x[0] in all_ids]

In [23]:
videos = [x[1][:500,:] for x in processed_videos if x[0] in all_ids]

In [24]:
LANGLE_ANK_KNE_HIP = 50
RANGLE_ANK_KNE_HIP = 51
LANGLE_BTO_ANK_KNE = 52
RANGLE_BTO_ANK_KNE = 53
LDIST_BTO_ANK = 54
RDIST_BTO_ANK = 55
XDIST_LANK_RANK = 56
XDIST_RANK_LANK = 57

In [25]:
features_df = pd.DataFrame(processed_video_ids,columns=['videoid'])

In [26]:
def add_percentiles_xy(df,videos,column_left,column_right,column_name,percentile):
    df = df.copy()
    name_base_L = 'p%s_L%s' % (percentile,column_name)
    name_base_R = 'p%s_R%s' % (percentile,column_name)
    df[name_base_L + '_x'] = [np.percentile(v[:,2*column_left],percentile) for v in videos]
    df[name_base_R + '_x'] = [np.percentile(v[:,2*column_right],percentile) for v in videos]
    df[name_base_L + '_y'] = [np.percentile(v[:,2*column_left+1],percentile) for v in videos]
    df[name_base_R + '_y'] = [np.percentile(v[:,2*column_right+1],percentile) for v in videos]
    return df

In [27]:
def add_percentiles(df,videos,column_idx,column_name,percentile):
    df[column_name] = [np.percentile(v[:,column_idx],percentile) for v in videos]

In [28]:
def apply_transform(df,videos,col_name,col_idx,fn):
    df[col_name] = [fn(v[:,col_idx]) for v in videos]

In [29]:
for percentile in [10,25,50,75,90]:
    fn = lambda x: np.percentile(x,percentile)
    for keypoint,idx in [('LANK',LANK),('RANK',RANK),('LKNE',LKNE),('RKNE',RKNE),
                         ('LHIP',LHIP),('RHIP',RHIP),('LBTO',LBTO),('RBTO',RBTO)]:
        apply_transform(features_df,videos,'p%s_%s_x' % (percentile,keypoint),2*idx,fn)
        apply_transform(features_df,videos,'p%s_%s_y' % (percentile,keypoint),2*idx+1,fn)
        
    for keypoint,idx in [('LANGLE_ANK_KNE_HIP',LANGLE_ANK_KNE_HIP),('RANGLE_ANK_KNE_HIP',RANGLE_ANK_KNE_HIP),
                         ('LANGLE_BTO_ANK_KNE',LANGLE_BTO_ANK_KNE),('RANGLE_BTO_ANK_KNE',RANGLE_BTO_ANK_KNE),
                         ('LDIST_BTO_ANK',LDIST_BTO_ANK),('RDIST_BTO_ANK',RDIST_BTO_ANK),
                         ('XDIST_LANK_RANK',XDIST_LANK_RANK),('XDIST_RANK_LANK',XDIST_RANK_LANK)]:
        apply_transform(features_df,videos,'p%s_%s' % (percentile,keypoint),idx,fn)

In [30]:
fn = np.std
for keypoint,idx in [('LANK',LANK),('RANK',RANK),('LKNE',LKNE),('RKNE',RKNE),
                     ('LHIP',LHIP),('RHIP',RHIP),('LBTO',LBTO),('RBTO',RBTO)]:
    apply_transform(features_df,videos,'std_%s_x' % (keypoint),2*idx,fn)
    apply_transform(features_df,videos,'std_%s_y' % (keypoint),2*idx+1,fn)

for keypoint,idx in [('LANGLE_ANK_KNE_HIP',LANGLE_ANK_KNE_HIP),('RANGLE_ANK_KNE_HIP',RANGLE_ANK_KNE_HIP),
                     ('LANGLE_BTO_ANK_KNE',LANGLE_BTO_ANK_KNE),('RANGLE_BTO_ANK_KNE',RANGLE_BTO_ANK_KNE),
                     ('LDIST_BTO_ANK',LDIST_BTO_ANK),('RDIST_BTO_ANK',RDIST_BTO_ANK),
                     ('XDIST_LANK_RANK',XDIST_LANK_RANK),('XDIST_RANK_LANK',XDIST_RANK_LANK)]:
    apply_transform(features_df,videos,'std_%s' % (keypoint),idx,fn)

In [31]:
def orient_columns(df,left_col_name,right_col_name,col_name):
    df[col_name] = df.apply(lambda row: row[left_col_name] if row.side == 'L' else
                                           row[right_col_name],axis=1)

In [32]:
final_df = features_df.merge(right=alldata_processed[['side','videoid',target_col]],on=['videoid'],how='inner')
final_df = final_df.merge(right=datasplit_df[['videoid','dataset']],on=['videoid'],how='inner')

In [33]:
Xcols = []
for percentile in [10,25,50,75,90]:
    for keypoint in ['ANK','HIP','KNE','BTO']:
        orient_columns(final_df,'p%s_L%s_x' % (percentile,keypoint),
                       'p%s_R%s_x' % (percentile,keypoint),
                       'p%s_%s_x' % (percentile,keypoint))
        orient_columns(final_df,'p%s_L%s_y' % (percentile,keypoint),
                       'p%s_R%s_y' % (percentile,keypoint),
                       'p%s_%s_y' % (percentile,keypoint))
        Xcols.append('p%s_%s_x' % (percentile,keypoint))
        Xcols.append('p%s_%s_y' % (percentile,keypoint))
        
    for keypoint in ['ANGLE_ANK_KNE_HIP','ANGLE_BTO_ANK_KNE','DIST_BTO_ANK']:
        orient_columns(final_df,'p%s_L%s' % (percentile,keypoint),
                       'p%s_R%s' % (percentile,keypoint),
                       'p%s_%s' % (percentile,keypoint))
        Xcols.append('p%s_%s' % (percentile,keypoint))  
        
    orient_columns(final_df,'p%s_XDIST_LANK_RANK' % (percentile),
                            'p%s_XDIST_RANK_LANK' % (percentile),
                            'p%s_XDIST_LANK_RANK' %(percentile))
    Xcols.append('p%s_XDIST_LANK_RANK' %(percentile))

In [34]:
for keypoint in ['ANK','HIP','KNE','BTO']:
    orient_columns(final_df,'std_L%s_x' % (keypoint),
                   'std_R%s_x' % (keypoint),
                   'std_%s_x' % (keypoint))
    orient_columns(final_df,'std_L%s_y' % (keypoint),
                   'std_R%s_y' % (keypoint),
                   'std_%s_y' % (keypoint))
    Xcols.append('std_%s_x' % (keypoint))
    Xcols.append('std_%s_y' % (keypoint))

for keypoint in ['ANGLE_ANK_KNE_HIP','ANGLE_BTO_ANK_KNE','DIST_BTO_ANK']:
    orient_columns(final_df,'std_L%s' % (keypoint),
                   'std_R%s' % (keypoint),
                   'std_%s' % (keypoint))
    Xcols.append('std_%s' % (keypoint))  

orient_columns(final_df,'std_XDIST_LANK_RANK' ,
                        'std_XDIST_RANK_LANK' ,
                        'std_XDIST_LANK_RANK')
Xcols.append('std_XDIST_LANK_RANK')

In [35]:
X_train = final_df[final_df['dataset'] == 'train'][Xcols].values
y_train = final_df[final_df['dataset'] == 'train'][target_col].values

X = final_df[Xcols].values

In [36]:
from sklearn.ensemble import RandomForestRegressor

In [37]:
sc = StandardScaler()
rr = Ridge()
rf = RandomForestRegressor()

In [38]:
pipe_rr = Pipeline([('sc', sc), ('rr', rr)])

In [39]:
def evaluate_model(df):
    metrics = {}
    for dataset in ['train','validation','test']:
        tmp = df[df['dataset'] == dataset]
        c = tmp.corr()['%s' % (target_col)]['%s_pred' % (target_col)]
        rmse =  np.sqrt(mean_squared_error(tmp['%s_pred' % (target_col)],
                                           tmp['%s' % (target_col)]))
        metrics[dataset] = (c,rmse)
    return metrics

In [40]:
results_rr = []
for alpha in [10**-4,10**-3,10**-2,10**-1,1,10*1,10**2,10**3,10**4]:
    print(alpha)
    pipe_rr.set_params(rr__alpha=alpha).fit(X_train,y_train)
    final_df['%s_pred' % (target_col)] = pipe_rr.predict(X)
    metrics = evaluate_model(final_df)
    results_rr.append((alpha,metrics['validation'][1]))

0.0001
0.001
0.01
0.1
1
10
100
1000
10000


In [41]:
best_alpha = results_rr[np.argmin([x[1] for x in results_rr])][0]

In [42]:
pipe_rr.set_params(rr__alpha=best_alpha).fit(X_train,y_train)
final_df['%s_pred' % (target_col)] = pipe_rr.predict(X)
final_df[['videoid','side','dataset','%s_pred' % (target_col)]].to_csv("./data/predictions/rr_%s_predictions.csv" % (target_col),index=False)

In [43]:
metrics = evaluate_model(final_df)
metrics

{'train': (0.23245101188633166, 0.8889508743888394),
 'validation': (0.2024362860992509, 0.9383964688558275),
 'test': (0.18859273691067144, 0.9048128089751101)}

In [47]:
#get rr feature importances
feature_importances = pd.DataFrame(list(zip(pipe_rr.named_steps['rr'].coef_,Xcols)),columns=['coef','feature'])
feature_importances['abs_coef'] = np.abs(feature_importances['coef'])
feature_importances.sort_values(by='abs_coef',ascending=False)
feature_importances.to_csv("./figures/feature_importances_%s_rr.csv" % (target_col),index=False)

In [48]:
n_estimators = [100]
max_features = ['auto', 'sqrt']
max_depth = list(range(10,110,10))
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5]
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
def random_search_rf_estimator(param_grid):
    rf = RandomForestRegressor()
    selected_params = {}
    for k in param_grid.keys():
        selected_params[k] = np.random.choice(param_grid[k])
    rf.set_params(**selected_params)
    return rf

In [49]:
rf_results = []
np.random.seed(1)
n_iters = 20
for i in range(n_iters):
    print(i)
    rf = random_search_rf_estimator(param_grid)
    rf.fit(X_train,y_train)   
    final_df['%s_pred' % (target_col)] = rf.predict(X)
    metrics = evaluate_model(final_df)
    rf_results.append((rf.get_params(),metrics['validation'][1]))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [55]:
optimal_rf_params = rf_results[np.argmin([x[1] for x in rf_results])][0]

In [56]:
optimal_rf_params

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 30,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [57]:
rf.set_params(**optimal_rf_params)
final_df['%s_pred' % (target_col)] = rf.predict(X)
final_df[['videoid','side','dataset','%s_pred' % (target_col)]].to_csv("./data/predictions/rf_%s_predictions.csv" % (target_col),index=False)

In [58]:
metrics = evaluate_model(final_df)
metrics

{'train': (0.9075179411394165, 0.5584497473912653),
 'validation': (0.1748334320037454, 0.9436267841810423),
 'test': (0.17266436949104574, 0.9149140405680459)}

In [61]:
feature_importances = pd.DataFrame(list(zip(Xcols,rf.feature_importances_)),columns=['feature','feature_importance'])
feature_importances.sort_values(by='feature_importance',ascending=False)
feature_importances.to_csv("./figures/feature_importances_%s_rf.csv" % (target_col),index=False)