In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from __future__ import division



In [3]:
train = pd.read_table("train_data", header=None)
label = pd.read_table("label", header=None, dtype='category')

In [4]:
train.columns = ['question']
label.columns = ['group']

In [5]:
print train.shape
print label.shape

(67212, 1)
(67212, 1)


In [28]:
# Separate each muscle group data
chest = train[label['group']=='Chest']
chest_label = label[label['group']=='Chest']
chest_len = int(chest.shape[0]*0.6)

shoulder = train[label['group']=='Shoulders']
shoulder_len = int(shoulder.shape[0]*0.6)
shoulder_label = label[label['group']=='Shoulders']

back = train[label['group']=='Back']
back_label = label[label['group']=='Back']
back_len = int(back.shape[0]*0.6)

leg = train[label['group']=='Leg']
leg_label = label[label['group']=='Leg']
leg_len = int(leg.shape[0]*0.6)

tricep = train[label['group']=='Triceps']
tricep_label = label[label['group']=='Triceps']
tricep_len = int(tricep.shape[0]*0.6)

bicep = train[label['group']=='Biceps']
bicep_label = label[label['group']=='Biceps']
bicep_len = int(bicep.shape[0]*0.6)

ab = train[label['group']=='Abs']
ab_label = label[label['group']=='Abs']
ab_len = int(ab.shape[0]*0.6)

glute = train[label['group']=='Glutes']
glute_label = label[label['group']=='Glutes']
glute_len = int(glute.shape[0]*0.6)

In [29]:
# Take only the top 60% of each group of data
train_data = pd.concat([chest[0:chest_len], shoulder[0:shoulder_len], back[0:back_len], 
                        leg[0:leg_len], tricep[0:tricep_len], bicep[0:bicep_len], ab[0:ab_len], glute[0: glute_len]])
print train_data.shape
train_label = pd.concat([chest_label[0:chest_len], shoulder_label[0:shoulder_len], back_label[0:back_len], 
                   leg_label[0:leg_len], tricep_label[0:tricep_len], bicep_label[0:bicep_len], ab_label[0:ab_len],
                         glute_label[0: glute_len]])
print train_label.shape

(40325, 1)
(40325, 1)


In [81]:
label['group'].value_counts()

Back         16995
Leg          10390
Shoulders     8809
Chest         8685
Triceps       6864
Biceps        6798
Abs           6046
Glutes        2625
Name: group, dtype: int64

In [30]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
data = vectorizer.fit_transform(train_data['question'])

In [34]:
feature_names[1000:1050]

array([u'2003', u'2003 arnold', u'2003 arnold classic',
       u'2003 arnold weekend', u'2003 australian',
       u'2003 australian grand', u'2003 body', u'2003 body for',
       u'2003 emerald', u'2003 emerald cup', u'2003 ipa',
       u'2003 ipa worlds', u'2003 ironman', u'2003 ironman pro',
       u'2003 maximus', u'2003 maximus pro', u'2003 mr',
       u'2003 mr olympia', u'2003 night', u'2003 night of', u'2003 noc',
       u'2003 noc preview', u'2003 olympia', u'2003 olympia bikini',
       u'2003 olympia coverage', u'2003 olympia experience',
       u'2003 olympia expo', u'2003 olympia report', u'2003 san',
       u'2003 san francisco', u'2003 show', u'2003 show of', u'2003 usa',
       u'2003 usa naturals', u'2003 video', u'2003 video coverage',
       u'2003 week', u'2003 western', u'2003 western collegiate', u'2004',
       u'2004 arnold', u'2004 arnold classic', u'2004 arnold expo',
       u'2004 arnold weekend', u'2004 boise', u'2004 boise fitness',
       u'2004 fame', u'20

In [40]:
# separate data into training and test
X_train, X_test, y_train, y_test = train_test_split(data, train_label, test_size=0.2, random_state=0)
print X_train.shape
print X_test.shape

(32260, 84357)
(8065, 84357)


In [32]:
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names)
feature_names.shape

(84357,)

In [41]:
#benchmark model
clf = RandomForestClassifier(max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

  app.launch_new_instance()


In [42]:
score = metrics.accuracy_score(y_test, pred)
print score

0.244265344079


In [43]:
y_label = pd.Series(y_train['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [44]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
print X_train.shape
print y_train.shape
print X_valid.shape
print y_valid.shape

(25808, 84357)
(25808,)
(6452, 84357)
(6452,)


In [38]:
y_valid

[7, 1, 2, 1, 7, ..., 1, 6, 3, 5, 1]
Length: 6452
Categories (8, int64): [0, 1, 2, 3, 4, 5, 6, 7]

In [45]:
#xgboost

params = {}
params['objective'] = 'multi:softmax'
params['eta'] = 0.1
params['max_depth'] = 6
params['num_class'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=50, verbose_eval=10)
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)


[0]	train-merror:0.732137	valid-merror:0.748295
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 50 rounds.
[10]	train-merror:0.728185	valid-merror:0.751395
[20]	train-merror:0.731052	valid-merror:0.747985
[30]	train-merror:0.731789	valid-merror:0.74535
[40]	train-merror:0.731207	valid-merror:0.744885
[50]	train-merror:0.729967	valid-merror:0.746125
[60]	train-merror:0.729464	valid-merror:0.747055
[70]	train-merror:0.728534	valid-merror:0.747985
[80]	train-merror:0.727449	valid-merror:0.74876
[90]	train-merror:0.726054	valid-merror:0.750465
Stopping. Best iteration:
[40]	train-merror:0.731207	valid-merror:0.744885



In [46]:
y_test = pd.Series(y_test['group'],dtype='category')
y_test.cat.categories = [0,1,2,3,4,5,6,7]

accuracy = float(np.sum(p_test == y_test) / y_test.shape[0])
print accuracy

0.235089894606
