In [1]:
import numpy as np
import tensorflow as tf
import random as rn
import os
import matplotlib.pyplot as plt
%matplotlib inline
os.environ['PYTHONHASHSEED'] = '0'
import sys 
import scipy
import math
import sys
import pandas as pd
from scipy.ndimage.filters import gaussian_filter1d
from sklearn.metrics import mean_squared_error
from scipy.stats import linregress
from scipy import interpolate
from scipy import signal
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.stats import linregress
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge,Lasso
from sklearn.svm import SVR
from video_process_utils import *

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [2]:
target_col = 'GDI'

In [3]:
alldata_processed =\
    pd.read_csv("./data/processed/alldata_processed_with_dev_residual.csv" )
alldata_processed['videoid'] = alldata_processed['videoid'].apply(lambda x: int(x))
alldata_processed['target_count'] = alldata_processed.groupby('videoid')[target_col].transform(lambda x: x.count())

In [4]:
HOME_DIR = "./"

In [5]:
datasplit_df = pd.read_csv('%sdata/processed/train_test_valid_id_split.csv' % (HOME_DIR))
datasplit_df['videoid'] = datasplit_df['videoid'].apply(lambda x: int(x))
all_ids = set(datasplit_df['videoid'])
train_ids = set(datasplit_df[datasplit_df['dataset'] == 'train']['videoid'])
validation_ids = set(datasplit_df[datasplit_df['dataset'] == 'validation']['videoid'])
test_ids = set(datasplit_df[datasplit_df['dataset'] == 'test']['videoid'])

In [6]:
with open('./data/processed/all_processed_videos.pickle', 'rb') as handle:
    processed_videos = pickle.load(handle)

In [7]:
processed_video_ids = [x[0] for x in processed_videos if x[0] in all_ids]

In [8]:
videos = [x[1][:500,:] for x in processed_videos if x[0] in all_ids]

In [9]:
LANGLE_ANK_KNE_HIP = 50
RANGLE_ANK_KNE_HIP = 51
LANGLE_BTO_ANK_KNE = 52
RANGLE_BTO_ANK_KNE = 53
LDIST_BTO_ANK = 54
RDIST_BTO_ANK = 55
XDIST_LANK_RANK = 56
XDIST_RANK_LANK = 57

In [10]:
features_df = pd.DataFrame(processed_video_ids,columns=['videoid'])

In [11]:
def add_percentiles_xy(df,videos,column_left,column_right,column_name,percentile):
    df = df.copy()
    name_base_L = 'p%s_L%s' % (percentile,column_name)
    name_base_R = 'p%s_R%s' % (percentile,column_name)
    df[name_base_L + '_x'] = [np.percentile(v[:,2*column_left],percentile) for v in videos]
    df[name_base_R + '_x'] = [np.percentile(v[:,2*column_right],percentile) for v in videos]
    df[name_base_L + '_y'] = [np.percentile(v[:,2*column_left+1],percentile) for v in videos]
    df[name_base_R + '_y'] = [np.percentile(v[:,2*column_right+1],percentile) for v in videos]
    return df

In [12]:
def add_percentiles(df,videos,column_idx,column_name,percentile):
    df[column_name] = [np.percentile(v[:,column_idx],percentile) for v in videos]

In [13]:
def apply_transform(df,videos,col_name,col_idx,fn):
    df[col_name] = [fn(v[:,col_idx]) for v in videos]

In [14]:
for percentile in [10,25,50,75,90]:
    fn = lambda x: np.percentile(x,percentile)
    for keypoint,idx in [('LANK',LANK),('RANK',RANK),('LKNE',LKNE),('RKNE',RKNE),
                         ('LHIP',LHIP),('RHIP',RHIP),('LBTO',LBTO),('RBTO',RBTO)]:
        apply_transform(features_df,videos,'p%s_%s_x' % (percentile,keypoint),2*idx,fn)
        apply_transform(features_df,videos,'p%s_%s_y' % (percentile,keypoint),2*idx+1,fn)
        
    for keypoint,idx in [('LANGLE_ANK_KNE_HIP',LANGLE_ANK_KNE_HIP),('RANGLE_ANK_KNE_HIP',RANGLE_ANK_KNE_HIP),
                         ('LANGLE_BTO_ANK_KNE',LANGLE_BTO_ANK_KNE),('RANGLE_BTO_ANK_KNE',RANGLE_BTO_ANK_KNE),
                         ('LDIST_BTO_ANK',LDIST_BTO_ANK),('RDIST_BTO_ANK',RDIST_BTO_ANK),
                         ('XDIST_LANK_RANK',XDIST_LANK_RANK),('XDIST_RANK_LANK',XDIST_RANK_LANK)]:
        apply_transform(features_df,videos,'p%s_%s' % (percentile,keypoint),idx,fn)

In [15]:
fn = np.std
for keypoint,idx in [('LANK',LANK),('RANK',RANK),('LKNE',LKNE),('RKNE',RKNE),
                     ('LHIP',LHIP),('RHIP',RHIP),('LBTO',LBTO),('RBTO',RBTO)]:
    apply_transform(features_df,videos,'std_%s_x' % (keypoint),2*idx,fn)
    apply_transform(features_df,videos,'std_%s_y' % (keypoint),2*idx+1,fn)

for keypoint,idx in [('LANGLE_ANK_KNE_HIP',LANGLE_ANK_KNE_HIP),('RANGLE_ANK_KNE_HIP',RANGLE_ANK_KNE_HIP),
                     ('LANGLE_BTO_ANK_KNE',LANGLE_BTO_ANK_KNE),('RANGLE_BTO_ANK_KNE',RANGLE_BTO_ANK_KNE),
                     ('LDIST_BTO_ANK',LDIST_BTO_ANK),('RDIST_BTO_ANK',RDIST_BTO_ANK),
                     ('XDIST_LANK_RANK',XDIST_LANK_RANK),('XDIST_RANK_LANK',XDIST_RANK_LANK)]:
    apply_transform(features_df,videos,'std_%s' % (keypoint),idx,fn)

In [16]:
def orient_columns(df,left_col_name,right_col_name,col_name):
    df[col_name] = df.apply(lambda row: row[left_col_name] if row.side == 'L' else
                                           row[right_col_name],axis=1)

In [17]:
final_df = features_df.merge(right=alldata_processed[['side','videoid',target_col,"cadence","speed","height"]],on=['videoid'],how='inner')
final_df = final_df.merge(right=datasplit_df[['videoid','dataset']],on=['videoid'],how='inner')

In [18]:
Xcols = []
for percentile in [10,25,50,75,90]:
    for keypoint in ['ANK','HIP','KNE','BTO']:
        orient_columns(final_df,'p%s_L%s_x' % (percentile,keypoint),
                       'p%s_R%s_x' % (percentile,keypoint),
                       'p%s_%s_x' % (percentile,keypoint))
        orient_columns(final_df,'p%s_L%s_y' % (percentile,keypoint),
                       'p%s_R%s_y' % (percentile,keypoint),
                       'p%s_%s_y' % (percentile,keypoint))
        Xcols.append('p%s_%s_x' % (percentile,keypoint))
        Xcols.append('p%s_%s_y' % (percentile,keypoint))
        
    for keypoint in ['ANGLE_ANK_KNE_HIP','ANGLE_BTO_ANK_KNE','DIST_BTO_ANK']:
        orient_columns(final_df,'p%s_L%s' % (percentile,keypoint),
                       'p%s_R%s' % (percentile,keypoint),
                       'p%s_%s' % (percentile,keypoint))
        Xcols.append('p%s_%s' % (percentile,keypoint))  
        
    orient_columns(final_df,'p%s_XDIST_LANK_RANK' % (percentile),
                            'p%s_XDIST_RANK_LANK' % (percentile),
                            'p%s_XDIST_LANK_RANK' %(percentile))
    Xcols.append('p%s_XDIST_LANK_RANK' %(percentile))

In [19]:
for keypoint in ['ANK','HIP','KNE','BTO']:
    orient_columns(final_df,'std_L%s_x' % (keypoint),
                   'std_R%s_x' % (keypoint),
                   'std_%s_x' % (keypoint))
    orient_columns(final_df,'std_L%s_y' % (keypoint),
                   'std_R%s_y' % (keypoint),
                   'std_%s_y' % (keypoint))
    Xcols.append('std_%s_x' % (keypoint))
    Xcols.append('std_%s_y' % (keypoint))

for keypoint in ['ANGLE_ANK_KNE_HIP','ANGLE_BTO_ANK_KNE','DIST_BTO_ANK']:
    orient_columns(final_df,'std_L%s' % (keypoint),
                   'std_R%s' % (keypoint),
                   'std_%s' % (keypoint))
    Xcols.append('std_%s' % (keypoint))  

orient_columns(final_df,'std_XDIST_LANK_RANK' ,
                        'std_XDIST_RANK_LANK' ,
                        'std_XDIST_LANK_RANK')
Xcols.append('std_XDIST_LANK_RANK')

In [20]:
X_train = final_df[final_df['dataset'] == 'train'][Xcols].values
y_train = final_df[final_df['dataset'] == 'train'][target_col].values

X = final_df[Xcols].values

In [21]:
from sklearn.ensemble import RandomForestRegressor

In [22]:
sc = StandardScaler()
rr = Ridge()
rf = RandomForestRegressor()

In [23]:
pipe_rr = Pipeline([('sc', sc), ('rr', rr)])

In [24]:
def evaluate_model(mod,df):
    df['%s_pred' % (target_col)] = mod.predict(X)
    metrics = {}
    for dataset in ['train','validation','test']:
        tmp = df[df['dataset'] == dataset]
        c = tmp.corr()['%s' % (target_col)]['%s_pred' % (target_col)]
        rmse =  np.sqrt(mean_squared_error(tmp['%s_pred' % (target_col)],
                                           tmp['%s' % (target_col)]))
        metrics[dataset] = (c,rmse)
    return metrics

In [25]:
results_rr = []
for alpha in [0.001,0.01,0.1,1.0,10,100,1000,10000]:
    print(alpha)
    pipe_rr.set_params(rr__alpha=alpha).fit(X_train,y_train)
    metrics = evaluate_model(pipe_rr,final_df)
    results_rr.append((alpha,metrics['validation'][1]))

0.001
0.01
0.1
1.0
10
100
1000
10000


In [26]:
best_alpha = results_rr[np.argmin([x[1] for x in results_rr])][0]

In [27]:
pipe_rr.set_params(rr__alpha=best_alpha).fit(X_train,y_train)
metrics = evaluate_model(pipe_rr,final_df)
metrics

{'train': (0.699949645457093, 8.747196345194654),
 'validation': (0.6228067326542078, 9.068658404904632),
 'test': (0.6763942368391738, 8.638937618810562)}

In [38]:
final_df['%s_pred' % (target_col)] = pipe_rr.predict(X)
final_df[['videoid','side','dataset','%s_pred' % (target_col)]].to_csv("./data/predictions/rr_%s_predictions.csv" % (target_col),index=False)

In [29]:
#get rr feature importances
feature_importances = pd.DataFrame(list(zip(pipe_rr.named_steps['rr'].coef_,Xcols)),columns=['coef','feature'])
feature_importances['abs_coef'] = np.abs(feature_importances['coef'])
feature_importances.sort_values(by='abs_coef',ascending=False)

Unnamed: 0,coef,feature,abs_coef
32,3.081757,p50_ANGLE_ANK_KNE_HIP,3.081757
44,2.717357,p75_ANGLE_ANK_KNE_HIP,2.717357
16,-2.596436,p25_KNE_x,2.596436
39,-2.225145,p75_HIP_y,2.225145
15,2.164033,p25_HIP_y,2.164033
10,-2.131013,p10_DIST_BTO_ANK,2.131013
21,2.111405,p25_ANGLE_BTO_ANK_KNE,2.111405
40,-1.945093,p75_KNE_x,1.945093
4,1.779733,p10_KNE_x,1.779733
22,1.759181,p25_DIST_BTO_ANK,1.759181


In [30]:
n_estimators = [100]
max_features = ['auto', 'sqrt']
max_depth = list(range(10,110,10))
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5]
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
def random_search_rf_estimator(param_grid):
    rf = RandomForestRegressor()
    selected_params = {}
    for k in param_grid.keys():
        selected_params[k] = np.random.choice(param_grid[k])
    rf.set_params(**selected_params)
    return rf

In [31]:
rf_results = []
np.random.seed(1)
n_iters = 5
for i in range(n_iters):
    print(i)
    rf = random_search_rf_estimator(param_grid)
    rf.fit(X_train,y_train)   
    metrics = evaluate_model(rf,final_df)
    rf_results.append((rf.get_params(),metrics['validation'][1]))

0
1
2
3
4


In [32]:
optimal_rf_params = rf_results[np.argmin([x[1] for x in rf_results])][0]

In [33]:
rf.set_params(**optimal_rf_params)
metrics = evaluate_model(rf,final_df)

In [34]:
metrics

{'train': (0.9704782251659252, 3.6645531688927604),
 'validation': (0.6576822555198193, 8.697031258814377),
 'test': (0.7028402024146129, 8.372115069545575)}

In [36]:
feature_importances = pd.DataFrame(list(zip(Xcols,rf.feature_importances_)),columns=['feature','feature_importance'])
feature_importances.sort_values(by='feature_importance',ascending=False)

Unnamed: 0,feature,feature_importance
63,std_HIP_y,0.043427
44,p75_ANGLE_ANK_KNE_HIP,0.043135
6,p10_BTO_x,0.040769
18,p25_BTO_x,0.036187
61,std_ANK_y,0.033104
71,std_XDIST_LANK_RANK,0.032984
32,p50_ANGLE_ANK_KNE_HIP,0.032548
59,p90_XDIST_LANK_RANK,0.030224
45,p75_ANGLE_BTO_ANK_KNE,0.025717
11,p10_XDIST_LANK_RANK,0.024676
