In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from __future__ import division
import json
from ast import literal_eval

In [2]:
df = pd.read_json('fit_modified.json', orient = 'records', dtype={"A":str, "B":list})

In [3]:
df.columns

Index([u'antagoniststabilizers', u'comments', u'execution', u'exercisename',
       u'force', u'functional_muscle_group', u'gif', u'mechanics',
       u'preparation', u'stabilizers', u'synergists', u'target', u'utility',
       u'video'],
      dtype='object')

In [61]:
data = df[['exercisename','synergists','target','execution']]
group = df['functional_muscle_group'].apply(pd.Series)
label = group.rename(columns = lambda x : 'label' + str(x))
label.columns = ['label']

In [5]:
label['label'].value_counts()

abs          77
back         73
leg          57
shoulders    52
glutes       41
chest         6
triceps       2
Name: label, dtype: int64

In [62]:
data.head()

Unnamed: 0,exercisename,synergists,target,execution
0,Safety Barbell Standing Leg Calf Raise,[Soleus],[Gastrocnemius],Raise heels by extending ankles as high as pos...
1,Cable Triceps Extension (with rope),[],[Triceps Brachii],Raise ends of rope overhead by extending forea...
2,Safety Bar Seated Calf Raise,[Gastrocnemius],[Soleus],Lower heels by bending ankles until calves are...
3,Sled 45° Reverse Calf Raise (plate loaded),[],[Tibialis Anterior],Pull forefoot of both feet up and back toward ...
4,Sled 45° Reverse Calf Raise,[],[Tibialis Anterior],Pull forefoot of both feet up and back toward ...


In [63]:
#unlist synergists and target  columns
data['synergists'] = data['synergists'].fillna("").apply(lambda x: " ".join(x))
data['target'] = data['target'].fillna("").apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [66]:
train = data.apply(lambda x: ' '.join(x.values.tolist()), axis=1)
train.head()

0    Safety Barbell Standing Leg Calf Raise Soleus ...
1    Cable Triceps Extension (with rope)  Triceps B...
2    Safety Bar Seated Calf Raise Gastrocnemius Sol...
3    Sled 45° Reverse Calf Raise (plate loaded)  Ti...
4    Sled 45° Reverse Calf Raise  Tibialis Anterior...
dtype: object

In [7]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,3),stop_words="english")
data = vectorizer.fit_transform(train_data['question'])

In [52]:
print vectorizer.get_stop_words()

frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'nei

In [51]:
# show top important words
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 15
top_features = [features[i] for i in indices[:top_n]]
print top_features

[u'zzzzzzzzzzzzzzzzzz important points', u'energy increases neat', u'energy incredible', u'pro bodybuilding title', u'energy incredible love', u'pro bodybuilding threads', u'pro bodybuilding successful', u'pro bodybuilding status', u'energy inefficient', u'energy inefficient energy', u'pro bodybuilding smaller', u'pro bodybuilding skip', u'energy infectious love', u'energy information', u'energy information helpful']


In [9]:
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import shuffle
# reduce features using SVD
SVD = TruncatedSVD(n_components=100, n_iter=5, random_state=0)
train = SVD.fit_transform(data)
# shuffle data
train, train_label = shuffle(train, train_label, random_state=0)

In [10]:
# separate data into training and test
X_train, X_test, y_train, y_test = train_test_split(train, train_label, test_size=0.2, random_state=0)
print X_train.shape
print X_test.shape

(21506, 100)
(5377, 100)


In [70]:
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names)
feature_names.shape

(89989,)

In [11]:
#benchmark model
clf = RandomForestClassifier(max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

  app.launch_new_instance()


In [12]:
score = metrics.accuracy_score(y_test, pred)
print score

0.251441324158


In [13]:
y_label = pd.Series(y_train['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
print X_train.shape
print y_train.shape
print X_valid.shape
print y_valid.shape

(17204, 100)
(17204,)
(4302, 100)
(4302,)


In [15]:
y_valid

[4, 0, 5, 2, 6, ..., 3, 5, 5, 1, 5]
Length: 4302
Categories (8, int64): [0, 1, 2, 3, 4, 5, 6, 7]

In [18]:
#xgboost

params = {}
params['objective'] = 'multi:softmax'
params['eta'] = 0.01
params['max_depth'] = 5
params['num_class'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=10, verbose_eval=10)
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)

[0]	train-merror:0.726808	valid-merror:0.784054
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 10 rounds.
[10]	train-merror:0.723553	valid-merror:0.7808
Stopping. Best iteration:
[8]	train-merror:0.72396	valid-merror:0.777778



In [22]:
y_test = pd.Series(y_test['group'],dtype='category')
y_test.cat.categories = [0,1,2,3,4,5,6,7]

accuracy = float(np.sum(p_test == y_test) / y_test.shape[0])
print accuracy

0.221312999814
