In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from __future__ import division
import json
from ast import literal_eval

In [3]:
df = pd.read_json('fit_modified_all.json', orient = 'records', dtype={"A":str, "B":list})

In [4]:
df.columns

Index([u'antagoniststabilizers', u'comments', u'execution', u'exercisename',
       u'force', u'functional_muscle_group', u'gif', u'mechanics',
       u'preparation', u'stabilizers', u'synergists', u'target', u'utility',
       u'video'],
      dtype='object')

In [5]:
data = df[['exercisename','synergists','target','execution']]
group = df['functional_muscle_group'].apply(pd.Series)
label = group.rename(columns = lambda x : 'label' + str(x))
label.columns = ['label']

In [6]:
label['label'].value_counts()

abs          144
leg          128
shoulders     98
back          73
glutes        50
biceps        48
triceps       43
chest         40
legs          13
Name: label, dtype: int64

In [7]:
data.head()

Unnamed: 0,exercisename,synergists,target,execution
0,Safety Barbell Standing Leg Calf Raise,[Soleus],[Gastrocnemius],Raise heels by extending ankles as high as pos...
1,Cable Triceps Extension (with rope),[],[Triceps Brachii],Raise ends of rope overhead by extending forea...
2,Safety Bar Seated Calf Raise,[Gastrocnemius],[Soleus],Lower heels by bending ankles until calves are...
3,Sled 45° Reverse Calf Raise (plate loaded),[],[Tibialis Anterior],Pull forefoot of both feet up and back toward ...
4,Sled 45° Reverse Calf Raise,[],[Tibialis Anterior],Pull forefoot of both feet up and back toward ...


In [8]:
#unlist synergists and target  columns
data['synergists'] = data['synergists'].fillna("").apply(lambda x: " ".join(x))
data['target'] = data['target'].fillna("").apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [9]:
train = data.apply(lambda x: ' '.join(x.values.tolist()), axis=1)
train.head()

0    Safety Barbell Standing Leg Calf Raise Soleus ...
1    Cable Triceps Extension (with rope)  Triceps B...
2    Safety Bar Seated Calf Raise Gastrocnemius Sol...
3    Sled 45° Reverse Calf Raise (plate loaded)  Ti...
4    Sled 45° Reverse Calf Raise  Tibialis Anterior...
dtype: object

In [10]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2),stop_words="english")
train_tfidf = vectorizer.fit_transform(train)

In [11]:
print vectorizer.get_stop_words()

frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'nei

In [12]:
# show top important words
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 15
top_features = [features[i] for i in indices[:top_n]]
print top_features

[u'wrists return', u'motion keepshoulders', u'movement oppositearm', u'movement arms', u'catch body', u'motionuntil dumbbells', u'motionuntil arms', u'motion repeat', u'motion pushing', u'motion lowerweight', u'motion lower', u'motion elbows', u'chest dumbbells', u'motion dumbbells', u'motion bringinghandles']


In [13]:
train_tfidf.shape

(637, 5280)

In [14]:
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import shuffle
# reduce features using SVD
SVD = TruncatedSVD(n_components=200, n_iter=5, random_state=0)
train = SVD.fit_transform(train_tfidf)
# shuffle data
#train, label = shuffle(train, label, random_state=0)

In [15]:
# separate data into training and test
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.2, random_state=0)
print X_train.shape
print X_test.shape

(509, 200)
(128, 200)


In [16]:
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names)
feature_names.shape

(5280,)

In [17]:
#benchmark model
clf = RandomForestClassifier(max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

  app.launch_new_instance()


In [18]:
score = metrics.accuracy_score(y_test, pred)
print score

0.84375


In [23]:
y_label = pd.Series(y_train['label'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7,8]

In [24]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
print X_train.shape
print y_train.shape
print X_valid.shape
print y_valid.shape

(407, 200)
(407,)
(102, 200)
(102,)


In [25]:
y_valid

[0, 5, 8, 6, 4, ..., 2, 2, 1, 0, 2]
Length: 102
Categories (9, int64): [0, 1, 2, 3, ..., 5, 6, 7, 8]

In [30]:
#xgboost

params = {}
params['objective'] = 'multi:softmax'
params['eta'] = 0.01
params['max_depth'] = 4
params['num_class'] = 9

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=10, verbose_eval=10)
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)

[0]	train-merror:0.056511	valid-merror:0.22549
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 10 rounds.
[10]	train-merror:0.041769	valid-merror:0.215686
Stopping. Best iteration:
[8]	train-merror:0.041769	valid-merror:0.205882



In [32]:
y_test = pd.Series(y_test['label'],dtype='category')
y_test.cat.categories = [0,1,2,3,4,5,6,7,8]

accuracy = float(np.sum(p_test == y_test) / y_test.shape[0])
print accuracy

0.8203125
