In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import xgboost
%matplotlib inline
pd.set_option('display.max_colwidth',999)



### Make a movement dictionary to use in the vectorizer and a relationship dictionary to relate each movement to its movement class

In [2]:
def movements_data(csv_file):
    """Load the movement catalogue and add two alternate-spelling rows.

    csv_file is read with its 'Unnamed: 0' column as the index. Rows for
    'double-under' and 'squat' are then appended and the combined frame is
    re-indexed from 0.
    """
    catalogue = pd.read_csv(csv_file, index_col='Unnamed: 0')

    # movements that might show up in more than one format
    double_under = {
        'movement': 'double-under',
        'frequency': 153,
        'move_class': 'Monostructural',
        'Equipment': 'jump rope',
    }
    squat = {
        'movement': 'squat',
        'frequency': 63,
        'move_class': 'Gymnastics',
        'Equipment': 'body',
    }
    extra_rows = pd.DataFrame([double_under, squat])

    return pd.concat([catalogue, extra_rows], ignore_index=True)

In [3]:
def make_dicts(use, csv_file='movements.csv'):
    """Build one of the lookup dictionaries derived from the movement catalogue.

    Parameters
    ----------
    use : str
        Which dictionary to build:
        'CV'    -> {movement: row index}
        'mgw'   -> {movement: move_class}
        'equip' -> {movement: Equipment}
        'other' -> {Equipment: position}, positions numbered 0..n-1 with no gaps
    csv_file : str, optional
        Catalogue file handed to movements_data(); defaults to 'movements.csv'.

    Returns
    -------
    dict or None
        The requested dictionary. Any other value of `use` returns None; this
        is kept for backward compatibility because the equipment pipeline in
        this notebook calls make_dicts('eq').
    """
    new_move = movements_data(csv_file)

    if use == 'CV':
        # Invert {row index: movement}. .items() exists on Python 2 and 3,
        # unlike .iteritems().
        return dict((name, idx) for idx, name in new_move.movement.to_dict().items())

    if use == 'mgw':
        return dict(zip(new_move['movement'], new_move['move_class']))

    if use == 'equip':
        return dict(zip(new_move['movement'], new_move['Equipment']))

    if use == 'other':
        # Inverting collapses repeated equipment names onto a single key; the
        # keys are then renumbered so the values are contiguous.
        by_equipment = dict((equip, idx) for idx, equip in new_move.Equipment.to_dict().items())
        return dict((equip, pos) for pos, equip in enumerate(by_equipment))

    return None

### Read in initial data from Beyond the Whiteboard and locate the rows that represent workouts for time or for reps

In [4]:
def model_df(data_csv):
    """Read the Beyond the Whiteboard export and keep the rows usable for modelling.

    A row is kept only when both 'Work performed' and 'Work time' are greater
    than zero. The result is a copy, re-indexed from 0.
    """
    raw = pd.read_csv(data_csv)
    did_work = raw['Work performed'] > 0
    took_time = raw['Work time'] > 0
    scored = raw[did_work & took_time].copy()
    return scored.reset_index(drop=True)

### Create functions to feed into Feature Union and Pipeline

In [5]:
def base_features_wt(df):
    """Return the 'Work time' column of df as a one-column DataFrame."""
    wanted = ['Work time']
    return df[wanted]


# Transformer wrapper around base_features_wt; validate=False so the DataFrame
# reaches the function without being checked or converted first.
base_features_wt_tf = FunctionTransformer(
    func=base_features_wt,
    validate=False
)

In [6]:
def base_features_wp(df):
    """Return the 'Work performed' column of df as a one-column DataFrame."""
    wanted = ['Work performed']
    return df[wanted]

# Transformer wrapper around base_features_wp; validate=False so the DataFrame
# reaches the function without being checked or converted first.
base_features_wp_tf = FunctionTransformer(
    func=base_features_wp,
    validate=False
)

In [7]:
def get_movements(df):
    """List the catalogue movements mentioned in each workout description.

    Each entry of df['Description'] is lower-cased and every movement name from
    movements_data('movements.csv') that occurs in it as a substring is kept,
    in catalogue order.

    Returns
    -------
    pandas.Series
        Named 'move', indexed 0..n-1 (the index of df is not carried over).
        Each value is the matched movement names joined with ', ', or '' when
        nothing matched. An empty df gives an empty Series; it previously
        raised KeyError.
    """
    known_moves = list(movements_data('movements.csv').movement)

    found = []
    for description in df['Description']:
        # lower-case once per description instead of once per catalogue entry
        text = description.lower()
        found.append(', '.join(move for move in known_moves if move in text))

    return pd.Series(found, name='move', dtype=object)

# Transformer wrapper around get_movements; validate=False so the DataFrame
# reaches the function without being checked or converted first.
get_movements_tf = FunctionTransformer(
    func=get_movements,
    validate=False
)
        

In [8]:
def equipment_used(cell):
    """Map a ', '-separated movement string to the equipment it needs.

    Movements missing from make_dicts('equip') are skipped. The distinct
    equipment names are returned sorted and joined with ', '.
    """
    names = cell.split(', ')
    lookup = make_dicts('equip')
    gear = set(lookup[name] for name in names if name in lookup)
    return ', '.join(sorted(gear))


In [9]:
def get_equip(df):
    """Return, per workout, the equipment string for the movements found in it.

    The result is the 'move' Series from get_movements(df) with equipment_used
    applied to every element.
    """
    moves = get_movements(df)
    equipment = moves.apply(equipment_used)
    return equipment.to_frame()['move']
    
# Transformer wrapper around get_equip; validate=False so the DataFrame
# reaches the function without being checked or converted first.
get_equip_tf = FunctionTransformer(
    func=get_equip,
    validate=False
)

In [10]:
def mgw_breakdown(cell):
    """Reduce a ', '-separated movement string to its movement-class initials.

    Every movement found in make_dicts('mgw') contributes the first character
    of its class name. The distinct initials are returned sorted and
    concatenated, e.g. 'GW'.
    """
    names = cell.split(', ')
    class_of = make_dicts('mgw')
    initials = set(class_of[name][0] for name in names if name in class_of)
    return ''.join(sorted(initials))

In [11]:
def get_move_class(df):
    """Return 0/1 indicator columns 'G', 'M' and 'W' for each workout.

    A column is 1 when the corresponding letter appears in the class-initial
    string that mgw_breakdown produces for the workout's movements.
    """
    flags = get_movements(df).apply(mgw_breakdown).to_frame()
    letters = ['G', 'M', 'W']
    for letter in letters:
        # a single capital letter has no regex meaning, so a plain
        # containment test gives the same answer as re.search did
        flags[letter] = flags['move'].apply(lambda code, letter=letter: 1 if letter in code else 0)
    return flags[letters]

# Transformer wrapper around get_move_class; validate=False so the DataFrame
# reaches the function without being checked or converted first.
get_move_class_tf = FunctionTransformer(
    func=get_move_class,
    validate=False
)

In [12]:
def time_or_reps(df):
    """One-hot encode whether each result was scored for time or for reps.

    A 'Formatted Result' containing 'min' or 'sec' is labelled 'Time';
    anything else is labelled 'Reps'.

    Returns
    -------
    pandas.DataFrame
        Columns 'For_Reps' and 'For_Time', indexed like df. Both columns are
        always present, even when only one label occurs. Without that, a
        single-row frame at predict time would produce fewer columns than the
        frame the model was fitted on.
    """
    labels = df['Formatted Result'].apply(
        lambda result: 'Time' if re.search(r'(min|sec)', result) else 'Reps')
    # Fixing the categories keeps the dummy columns stable across inputs,
    # the same approach get_year takes for years.
    fixed = pd.Series(pd.Categorical(labels, categories=['Reps', 'Time']), index=df.index)
    return pd.get_dummies(fixed, prefix='For')

# Transformer wrapper around time_or_reps; validate=False so the DataFrame
# reaches the function without being checked or converted first.
time_or_reps_tf = FunctionTransformer(
    func=time_or_reps,
    validate=False
)

In [13]:
def is_prescribed(df):
    """Return the 'Prescribed' column converted with int(), as a one-column DataFrame."""
    as_int = df['Prescribed'].apply(int)
    return pd.DataFrame(as_int)

# Transformer wrapper around is_prescribed; validate=False so the DataFrame
# reaches the function without being checked or converted first.
is_prescribed_tf = FunctionTransformer(
    func=is_prescribed,
    validate=False
)

In [14]:
def get_year(df):
    """One-hot encode the year of each workout's 'Date'.

    The years are made categorical over 2011..2017 before encoding, which
    gives the columns 'Year_2011' .. 'Year_2017'.
    """
    workout_years = pd.to_datetime(df['Date']).dt.year
    year_levels = pd.Categorical(workout_years, categories=list(range(2011, 2018)))
    return pd.get_dummies(year_levels, columns=['Date'], prefix='Year')

# Transformer wrapper around get_year; validate=False so the DataFrame
# reaches the function without being checked or converted first.
get_year_tf = FunctionTransformer(
    func=get_year,
    validate=False
)

In [15]:
def get_month(df):
    """One-hot encode the calendar month of each workout's 'Date'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Month_1' .. 'Month_12', indexed like df. All twelve columns
        are always present. Previously only the months that occurred in df
        were emitted, so a single-row frame at predict time had a different
        width than the frame the model was fitted on.
    """
    month_numbers = pd.to_datetime(df['Date']).dt.month
    # Fixing the categories keeps the dummy columns stable across inputs,
    # the same approach get_year takes for years.
    months = pd.Series(
        pd.Categorical(month_numbers, categories=list(range(1, 13))),
        index=df.index)
    return pd.get_dummies(months, prefix='Month')

# Transformer wrapper around get_month; validate=False so the DataFrame
# reaches the function without being checked or converted first.
get_month_tf = FunctionTransformer(
    func=get_month,
    validate=False
)

In [16]:
# Equipment features: workout -> equipment string -> token counts -> dense array.
# NOTE(review): make_dicts() only handles 'CV', 'mgw', 'equip' and 'other'.
# 'eq' falls through and returns None, so this CountVectorizer gets
# vocabulary=None and builds its vocabulary from the data it is fitted on.
# If a fixed equipment vocabulary was intended, 'other' looks like the
# matching key - confirm before changing it.
equip_pipeline = Pipeline(steps=[
    ('get_equip', get_equip_tf),
    ('eq_vect', CountVectorizer(vocabulary=make_dicts('eq'))),
    ('eq_nonsparse', FunctionTransformer(func=lambda X: X.toarray(), validate=False)),
])

In [17]:
# Movement features: workout -> movement string -> token counts -> dense array.
# The vocabulary is fixed to the {movement: index} mapping from make_dicts('CV').
vector_pipeline = Pipeline(steps=[
    ('get_movements', get_movements_tf),
    ('vect', CountVectorizer(vocabulary=make_dicts('CV'))),
    ('nonsparse', FunctionTransformer(func=lambda X: X.toarray(), validate=False)),
])

In [18]:
# Feature set for the model that takes 'Work time' as an input column.
# The commented-out entries are alternative features that are currently disabled.
features_wt = FeatureUnion(transformer_list=[
    # ('get_month', get_month_tf),
    # ('get_year', get_year_tf),
    ('equip_pipeline', equip_pipeline),
    # ('is_prescribed', is_prescribed_tf),
    # ('time_or_reps', time_or_reps_tf),
    ('get_move_class', get_move_class_tf),
    ('vector_pipeline', vector_pipeline),
    ('base_features_wt', base_features_wt_tf),
])

In [19]:
# Feature set for the model that takes 'Work performed' as an input column.
# The commented-out entries are alternative features that are currently disabled.
features_wp = FeatureUnion(transformer_list=[
    # ('get_month', get_month_tf),
    # ('get_year', get_year_tf),
    ('equip_pipeline', equip_pipeline),
    # ('is_prescribed', is_prescribed_tf),
    # ('time_or_reps', time_or_reps_tf),
    ('get_move_class', get_move_class_tf),
    ('vector_pipeline', vector_pipeline),
    ('base_features_wp', base_features_wp_tf),
])

In [126]:
# Model fed by features_wt: the feature union followed by a gradient-boosting
# regressor. The commented-out steps are alternatives that are currently disabled.
pipe_wt = Pipeline(steps=[
    ('features_wt', features_wt),
    # ('ss', StandardScaler()),
    # ('lr', LinearRegression(fit_intercept=False, normalize=True))
    # ('rf', RandomForestRegressor())
    ('gb', GradientBoostingRegressor()),
    # ('dt', DecisionTreeRegressor())
])

In [127]:
# Model fed by features_wp: the feature union followed by a gradient-boosting
# regressor. The commented-out steps are alternatives that are currently disabled.
pipe_wp = Pipeline(steps=[
    ('features_wp', features_wp),
    # ('ss', StandardScaler()),
    # ('lr', LinearRegression(fit_intercept=False, normalize=True))
    # ('rf', RandomForestRegressor())
    ('gb', GradientBoostingRegressor()),
    # ('dt', DecisionTreeRegressor())
])

In [230]:
def get_prediction(discription, wt, wp, max_distance=0.87, dup_max_distance=0.25):
    """Predict a workout result from historical workouts with similar descriptions.

    The query workout is appended to the history from model_df('data.csv'),
    all descriptions are TF-IDF vectorised, and the rows whose cosine distance
    to the query is under a threshold become the training set. The matching
    pipeline is fitted on that set and used to predict for the query row.

    Parameters
    ----------
    discription : str
        Workout description text.
    wt : number
        Work time given for AMRAPs. When truthy, pipe_wt is fitted with
        'Work performed' as the target.
    wp : number
        Work performed for RFT. Used only when wt is falsy; when truthy,
        pipe_wp is fitted with 'Work time' as the target.
    max_distance : float, optional
        Cosine-distance cutoff for neighbours (default 0.87).
    dup_max_distance : float, optional
        Cutoff used instead when the exact description already occurs in the
        history (default 0.25). In that case the distances to all identical
        descriptions are averaged first.

    Returns
    -------
    numpy.ndarray or None
        The pipeline's prediction for the query row, or None when both wt and
        wp are falsy.

    Raises
    ------
    ValueError
        If no historical workout falls within the distance cutoff.

    Side effects: prints the shape of the matched set, which includes the
    query row itself.
    """
    history = model_df('data.csv')
    test = [{'Work time': wt, 'Description': discription, 'Work performed': wp}]
    # DataFrame.append was removed in pandas 2.0; concat works on every version.
    df1 = pd.concat([history, pd.DataFrame(test)], ignore_index=True)
    query_idx = df1.index[-1]
    query_row = df1.iloc[-1:, :]

    tfid_matrix = TfidfVectorizer().fit_transform(df1['Description'])
    des_distance = pairwise_distances(tfid_matrix, metric='cosine')
    new_df = pd.DataFrame(des_distance, index=df1.index.values, columns=df1['Description'].values)

    to_query = new_df[discription]
    if isinstance(to_query, pd.DataFrame):
        # The description label is duplicated (it already exists in the
        # history), so the lookup returns one column per occurrence.
        to_query = to_query.mean(axis=1)
        cos_sim = to_query[to_query < dup_max_distance]
    else:
        cos_sim = to_query[to_query < max_distance]
    # parenthesised form prints the same text on Python 2 and Python 3
    print(cos_sim.shape)

    # Drop the query row by label instead of assuming it is the last match.
    neighbours = [idx for idx in cos_sim.index if idx != query_idx]
    if not neighbours:
        raise ValueError('no historical workouts are similar enough to: {!r}'.format(discription))
    df_test = df1.iloc[neighbours, :].copy()

    if wt:
        pipe_wt.fit(df_test, df_test['Work performed'])
        return pipe_wt.predict(query_row)
    if wp:
        pipe_wp.fit(df_test, df_test['Work time'])
        return pipe_wp.predict(query_row)
    return None


In [231]:
def _format_rounds(total_rounds, round_reps):
    """Render a fractional round count as '<rounds> rounds + <reps> reps'."""
    whole = int(total_rounds)
    reps = round((total_rounds - whole) * round_reps, 0)
    if 0 < round_reps <= reps:
        # the fraction rounded up to a complete round: carry it over
        whole += 1
        reps -= round_reps
    return '{} rounds + {} reps'.format(whole, reps)


def _format_duration(milliseconds):
    """Render a duration in milliseconds as '<minutes> min, <seconds> sec'."""
    minutes, seconds = divmod(milliseconds / 1000.0, 60)
    seconds = round(seconds, 2)
    if seconds >= 60:
        # rounding pushed the seconds up to a full minute: carry it over
        minutes += 1
        seconds -= 60
    return '{} min, {} sec'.format(minutes, seconds)


def get_formatted_pred(discription, wt, wp, round_reps, round_work):
    """Run get_prediction and format its first value as a readable score.

    Parameters
    ----------
    discription, wt, wp
        Passed straight to get_prediction.
    round_reps : number
        Reps in one round; used only when wt is truthy.
    round_work : number
        Amount of work in one round; the prediction is divided by it to get
        the number of rounds. Used only when wt is truthy.

    Returns
    -------
    str or None
        '<rounds> rounds + <reps> reps' when wt is truthy, otherwise
        '<minutes> min, <seconds> sec' when wp is truthy (the prediction is
        divided by 1000 to get seconds). None when both are falsy.

    The split into rounds/reps and minutes/seconds is done numerically.
    Splitting str(value) on '.' failed for values printed in exponent
    notation, and rounding could produce 'N rounds + <round_reps> reps' or
    '60.0 sec'.
    """
    raw_result = get_prediction(discription, wt, wp)
    if wt:
        return _format_rounds(float(raw_result[0]) / round_work, round_reps)
    if wp:
        return _format_duration(float(raw_result[0]))
    return None

In [226]:
# AMRAP example: work time is given (720000, presumably milliseconds for the
# 12:00 cap), so the result is reported as rounds + reps.
get_formatted_pred(
    discription='12:00 AMRAP:10 Back Squats, 95 lbs 10 Russian Kettlebell Swings, 53 lbs 10 Back Squats, 95 lbs 20 Push-up (knees)s',
    wt=720000,
    wp=0,
    round_reps=50,
    round_work=9319
)

'3 rounds + 15.0 reps'

In [236]:
# For-time example: work performed is given, so the result is reported as
# minutes and seconds.
get_formatted_pred(
    discription='30 Clean & Jerks, 65 lbs',
    wt=0,
    wp=19500,
    round_reps=None,
    round_work=None
)

(151L,)


'10.0 min, 3.22 sec'

In [234]:
# For-time example with a multi-line description: work performed is given, so
# the result is reported as minutes and seconds.
get_formatted_pred(
    discription='''50 Double Unders 
50 AbMat Sit-ups
40 Double Unders
40 AbMat Sit-ups
30 Double Unders
30 AbMat Sit-ups
20 Double Unders
20 AbMat Sit-ups
10 Double Unders
10 AbMat Sit-ups''',
    wt=0,
    wp=38027,
    round_reps=None,
    round_work=None
)

(156L,)


'9.0 min, 2.65 sec'

In [228]:
# Scratch arithmetic: the product (38027.6) is approximately the 38027 passed
# as `wp` in the double-under / sit-up example.
# NOTE(review): what 92.3 and 412 stand for is not recorded in this notebook -
# confirm and replace them with named values.
92.3*412

38027.6