In [131]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
%matplotlib inline
pd.set_option('display.max_colwidth',999)

### Make movement dictionary to use in vectorizor and a relationship dictionary to relate movement with movement class

In [3]:
movements = pd.read_csv('movements.csv', index_col='Unnamed: 0')
add_moves = [{'movement':'double-under','frequency':153,'move_class':'Monostructural'},{'movement':'squat','frequency':63,'move_class':'Gymnastics'}]
am = pd.DataFrame(add_moves)
new_move = pd.concat([movements,am], ignore_index=True)
new_move_dict = new_move[['movement','move_class']].to_dict(orient='records')
move_dict = new_move.movement.to_dict()
res = dict((v,k) for k,v in move_dict.iteritems())
relate_dict = {}
for row in new_move_dict:
    relate_dict[row['movement']] = row['move_class']

### Read in initial data from Beyond the Whiteboard and locate the rows that represent workouts for time or for reps

In [117]:
df = pd.read_csv('data.csv')
df1 = df.loc[(df['Work performed'] > 0) & (df['Work time'] > 0),:].copy()

In [118]:
df1.reset_index(drop = True)

Unnamed: 0,Date,Workout,Result,Prescribed,Pukie,Work performed,Work time,Formatted Result,Notes,Description
0,2017-04-04,"5 RFT: Power Cleans, Walking Lunges, and Thrusters",3.990000e+05,True,False,42039,399000.0,6 mins 39 secs,Went out too hot,"5 rounds of:\n12 Power Cleans, 65 lbs\n6 Walking Lunges, 65 lbs\n3 Thrusters, 65 lbs"
1,2017-04-02,Cindy,1.113300e+01,True,False,88204,1200000.0,11 rounds + 2 Pull-ups | 332 reps,,20:00 AMRAP:\n5 Pull-ups\n10 Push-ups\n15 Air Squats
2,2017-03-28,21-15-9: Push Press and Box Jumps,4.190000e+05,True,False,21197,419000.0,6 mins 59 secs,"Step ups, got a set of 13 pp on the first round","21-15-9 reps of:\nPush Press, 95 lbs\nBox Jump, 20 in"
3,2017-03-26,"Row : 3x 1 km, rest 3 mins",7.715000e+05,True,False,93741,771500.0,12 mins 51.5 secs,,"Intervals : rest 3 mins\nRow, 1 km | 4:11.9\nRow, 1 km | 4:19.3\nRow, 1 km | 4:20.3"
4,2017-03-24,CrossFit Games Open 17.5,9.820000e+05,True,False,66793,982000.0,16 mins 22 secs,"All thrusters unbroken, 5 of the rounds of dubs unbroken","10 rounds of:\n9 Thrusters, 65 lbs\n35 Double Unders"
5,2017-03-22,Every 1:30 for 10:30: Front Squat,8.572896e+02,True,False,5848,630000.0,1890 lbs,,Every 1:30 for 10:30:\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs\n2 Front Squats | 135 lbs
6,2017-03-22,AMRAP 5 mins: Squat Cleans and AbMat Sit-ups,3.810000e+00,True,False,29387,300000.0,3 rounds + 7 Squat Cleans + 13 AbMat Sit-ups | 104 reps,,"5:00 AMRAP:\n7 Squat Cleans, 75 lbs\n21 AbMat Sit-ups"
7,2017-03-22,AMRAP 5 mins: Hanging Knee Raises and Push Press,3.000000e+00,True,False,2780,300000.0,3 rounds | 84 reps,,"5:00 AMRAP:\n21 Hanging Knee Raises\n7 Push Press, 75 lbs"
8,2017-03-22,AMRAP 5 mins: Wall Balls and Pull-ups,1.600000e+00,True,False,10193,300000.0,1 round + 10 Wall Balls + 3 Pull-ups | 38 reps,Singles on pull ups,"5:00 AMRAP:\n10 Wall Balls, 14 lbs\n15 Pull-ups"
9,2017-03-21,AMRAP 10 mins: Lateral Burpee (Over Barbell)s and Power Snatches,2.833000e+00,True,False,44381,600000.0,2 rounds + 15 Lateral Burpee (Over Barbell)s + 10 Power Snatches | 85 reps,,"10:00 AMRAP:\n15 Lateral Burpee (Over Barbell)s\n15 Power Snatches, 75 lbs"


In [173]:
df1.shape

(742, 10)

### Create functions to feed into Feature Union and Pipeline

In [153]:
def base_features(df):
    return df[['Work performed']]

base_features_tf = FunctionTransformer(base_features, validate=False)

In [80]:
def get_movements(df):
    move_in_work = []
    for w in [x for x in df['Description']]:
        moves = []
        row = {}
        for m in new_move.movement:
            if m in w.lower():
                moves.append(m)
        row['move'] = ', '.join(moves)
        move_in_work.append(row)
    return pd.DataFrame(move_in_work)['move']

get_movements_tf = FunctionTransformer(get_movements, validate=False)
        

In [92]:
def mgw_breakdown(cell):
    mgw = []
    move_list = cell.split(', ')
    for x in move_list:
        if x in relate_dict:
            mgw.append(relate_dict[x][0])
    return ''.join(sorted(set(mgw)))

In [99]:
def get_move_class(df):
    breakdown = pd.DataFrame(get_movements(df).apply(mgw_breakdown)) 
    for let in ['G','M','W']:
        breakdown[let] = breakdown['move'].apply(lambda x: 1 if re.search(let,x) else 0)
    return breakdown[['G','M','W']]

get_move_class_tf = FunctionTransformer(get_move_class, validate=False)

In [122]:
def time_or_reps(df):
    time_or_reps = pd.DataFrame(df['Formatted Result'].apply(lambda x: 'Time' if re.search(r'(min|sec)', x) else 'Reps'))
    return pd.get_dummies(time_or_reps, prefix='For')

time_or_reps_tf = FunctionTransformer(time_or_reps, validate=False)

In [127]:
def is_prescribed(df):
    return df['Prescribed'].apply(int).to_frame()

is_prescribed_tf = FunctionTransformer(is_prescribed, validate=False)    

In [179]:
def get_year(df):
    years = pd.DataFrame(pd.to_datetime(df['Date']).dt.year)
    return pd.get_dummies(years, columns=['Date'], prefix='Year')

get_year_tf = FunctionTransformer(get_year, validate=False)

In [181]:
def get_month(df):
    months = pd.DataFrame(pd.to_datetime(df['Date']).dt.month)
    return pd.get_dummies(months, columns=['Date'], prefix='Month')

get_month_tf = FunctionTransformer(get_month, validate=False)

In [182]:
vector_pipeline = Pipeline([
    ('get_movements', get_movements_tf),
    ('vect', CountVectorizer(vocabulary=res)),
    ('nonsparse',FunctionTransformer(lambda X: X.toarray(), validate=False)) 
])

In [199]:
features = FeatureUnion([
    #('get_month', get_month_tf),
    #('get_year', get_year_tf),
    ('is_prescribed', is_prescribed_tf),
    ('time_or_reps', time_or_reps_tf),
    #('get_move_class', get_move_class_tf),
    ('vector_pipeline', vector_pipeline),
    ('base_features', base_features_tf)
])

In [202]:
pipe = Pipeline([
    ('features', features),
    ('ss', StandardScaler()),
    ('lr', LinearRegression())
])

kf = KFold(n_splits=3, shuffle= True ,random_state=2003)
cross_val_score(pipe, df1, y=df1['Work time'], cv=10, verbose=True).mean()

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.7s finished


0.56102909753159957