In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
from gensim.models import Word2Vec, keyedvectors
from sklearn.model_selection import train_test_split
from __future__ import division
from nltk import sent_tokenize, word_tokenize
from wikipedia import page

In [30]:
data = pd.read_csv("user_question.csv", header=None)
data.columns = ['question','group']

In [31]:
train = data['question'].values.tolist()
label = data['group'].values.tolist()

In [32]:
corpus = train + label

In [35]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [36]:
tok_corp = [nltk.word_tokenize(sent.decode('utf-8')) for sent in corpus]

In [37]:
model = Word2Vec(tok_corp, size=50, window=5, min_count=3, workers=2)

In [45]:
model.most_similar('How')

[(u'curl', 0.2829551100730896),
 (u'want', 0.14266327023506165),
 (u'What', 0.13052792847156525),
 (u'to', 0.11642688512802124),
 (u'Shoulders', 0.08410151302814484),
 (u'abs', 0.05228054150938988),
 (u'Triceps', 0.01863226667046547),
 (u'I', 0.006797999143600464),
 (u'do', -0.019749730825424194),
 (u'Leg', -0.023615125566720963)]

In [51]:
wikipage = page("Physical fitness")

In [53]:
sentences = [word_tokenize(sent) for sent in sent_tokenize(wikipage.content)]

In [56]:
len(sentences)

137

In [57]:
wiki_model = Word2Vec(sentences, min_count=2, size=50, window=5)  

In [58]:
list(wiki_model.wv.vocab.keys())[:10]

[u'all',
 u'resistance',
 u'consists',
 u'to',
 u'must',
 u'program',
 u'helps',
 u'include',
 u'activities',
 u'Training']

In [62]:
wiki_model.most_similar('resistance')

[(u'in', 0.6016296148300171),
 (u'.', 0.5888518691062927),
 (u'such', 0.5855332016944885),
 (u'Cancer', 0.5716492533683777),
 (u'physical', 0.5606939792633057),
 (u'body', 0.5583063364028931),
 (u'work', 0.5521804094314575),
 (u'have', 0.5517875552177429),
 (u'risk', 0.5498833656311035),
 (u'recommendations', 0.534516453742981)]

In [2]:
exrx_data = pd.read_json('fit_modified_all.json', orient = 'records', dtype={"A":str, "B":list})
exrx_train = exrx_data[['exercisename','preparation','comments','execution','functional_muscle_group']]

In [175]:
exrx_train.head()

Unnamed: 0,exercisename,preparation,comments,execution,functional_muscle_group
0,Safety Barbell Standing Leg Calf Raise,Stand facing safety barbell on rack upper ches...,Position rack just below lowest range of motio...,Raise heels by extending ankles as high as pos...,leg
1,Cable Triceps Extension (with rope),"From low pulley cable, grasp ends of rope atta...",Let cable attachment pull arm back to maintain...,Raise ends of rope overhead by extending forea...,triceps
2,Safety Bar Seated Calf Raise,Place safety bar on rack approximately lower l...,Slide forward to edge of bench and rack safety...,Lower heels by bending ankles until calves are...,leg
3,Sled 45° Reverse Calf Raise (plate loaded),Sit on machine with lower back against padding...,Keep knees straight throughout exercise. Also ...,Pull forefoot of both feet up and back toward ...,leg
4,Sled 45° Reverse Calf Raise,Sit on machine with lower back against padding...,Keep knees straight throughout exercise. Also ...,Pull forefoot of both feet up and back toward ...,leg


In [3]:
#remove list
exrx_train['functional_muscle_group'] = exrx_train['functional_muscle_group'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [4]:
# Add functional muscle group to exercise name for better word2vec accuracy, otherwise muscle name never exists in corpus
exrx_train['exercise'] = exrx_train[['exercisename', 'functional_muscle_group']].apply(lambda x: ' '.join(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [5]:
corpus = exrx_train['exercise'].values.tolist() + exrx_train['preparation'].values.tolist() + \
exrx_train['comments'].values.tolist()+exrx_train['execution'].values.tolist()  

In [6]:
len(corpus)

2548

In [201]:
corpus[:10]

[u'Safety Barbell Standing Leg Calf Raise leg',
 u'Cable Triceps Extension (with rope) triceps',
 u'Safety Bar Seated Calf Raise leg',
 u'Sled 45\xb0 Reverse Calf Raise (plate loaded) leg',
 u'Sled 45\xb0 Reverse Calf Raise leg',
 u'Sled 45\xb0 Reverse Calf Press leg',
 u'Lever Seated Reverse Calf Press leg',
 u'Lever Reverse Calf Raise leg',
 u'Lever Donkey Reverse Calf Raise leg',
 u'Sled Standing Reverse Calf Raise leg']

In [7]:
sentences = [word_tokenize(sent) for sent in corpus]

In [8]:
exrx_model = Word2Vec(sentences, min_count=1, size=20, window=5)  

In [9]:
def classify_muscle_group(sentence):
    w = filter(lambda x: x in exrx_model.wv, sentence.split())
    if not w:
        return "None"
    muscles = ['shoulders','biceps','back','triceps','leg','glutes','chest','abs']
    candidate = []
    candidate.append(exrx_model.wv.n_similarity(w,['shoulders']))
    candidate.append(exrx_model.wv.n_similarity(w,['biceps']))
    candidate.append(exrx_model.wv.n_similarity(w,['back']))
    candidate.append(exrx_model.wv.n_similarity(w,['triceps']))
    candidate.append(exrx_model.wv.n_similarity(w,['leg']))
    candidate.append(exrx_model.wv.n_similarity(w,['glutes']))
    candidate.append(exrx_model.wv.n_similarity(w,['chest']))
    candidate.append(exrx_model.wv.n_similarity(w,['abs']))
    return muscles[np.argmax(candidate)]

In [10]:
classify_muscle_group("how to do push press")

'back'

In [11]:
classify_muscle_group("how to do barbell bent over row")

'back'

In [11]:
classify_muscle_group("Reverse crunch")

'abs'

In [25]:
test_data = pd.read_csv("test_data_2.csv", header=None)
test_data.columns = ['question','group']

In [26]:
len(test_data)

68

In [27]:
prediction = map(classify_muscle_group, list(test_data["question"]))
pred = pd.Series(prediction)

In [28]:
#accuracy
sum(pred==test_data["group"])/len(pred)

0.16176470588235295

In [30]:
compare = pd.concat([test_data,pred],axis=1)
pd.set_option('display.max_rows', None)
print compare

                                             question      group          0
0                        I want to train my shoulders  shoulders       back
1                                 Get wider shoulders  shoulders  shoulders
2                                    How to do pullup  shoulders       back
3                                How to do push press  shoulders       back
4                              What is shoulder press  shoulders  shoulders
5                              What is rhomboid major  shoulders  shoulders
6                                    Where is deltoid  shoulders      chest
7                         How do I train my trapezius  shoulders      chest
8                     How do  I train my rotator cuff  shoulders      chest
9                    Is rotator cuff a single muscle?  shoulders      chest
10                            How to do russian twist        abs       back
11                  What are different ways of crunch        abs       back
12          