In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from __future__ import division



In [3]:
train = pd.read_table("train_data", header=None)
label = pd.read_table("label", header=None, dtype='category')

In [4]:
train.columns = ['question']
label.columns = ['group']

In [5]:
print train.shape
print label.shape

(67212, 1)
(67212, 1)


In [15]:
# Separate each muscle group data
chest = train[label['group']=='Chest']
chest_len = int(chest.shape[0]*0.6)
shoulder = train[label['group']=='Shoulders']
shoulder_len = int(shoulder.shape[0]*0.6)
back = train[label['group']=='Back']
back_len = int(back.shape[0]*0.6)
leg = train[label['group']=='Leg']
leg_len = int(leg.shape[0]*0.6)
tricep = train[label['group']=='Triceps']
tricep_len = int(tricep.shape[0]*0.6)
bicep = train[label['group']=='Biceps']
bicep_len = int(bicep.shape[0]*0.6)
ab = train[label['group']=='Abs']
ab_len = int(ab.shape[0]*0.6)
glute = train[label['group']=='Glutes']
glute_len = int(glute.shape[0]*0.6)

In [16]:
# Take only the top 60% of each group of data
train_data = pd.concat([chest[0:chest_len], shoulder[0:shoulder_len], back[0:back_len], leg[0:leg_len], tricep[0:tricep_len], bicep[0:bicep_len], ab[0:ab_len], glute[0: glute_len]])
print train_data.shape

(40325, 1)


In [81]:
label['group'].value_counts()

Back         16995
Leg          10390
Shoulders     8809
Chest         8685
Triceps       6864
Biceps        6798
Abs           6046
Glutes        2625
Name: group, dtype: int64

In [40]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
data = vectorizer.fit_transform(train['question'])


In [159]:
feature_names[1000:1100]

array([u'16 dieting strategy', u'16 get', u'16 get inspired', u'16 heavy',
       u'16 heavy metal', u'16 inspiring', u'16 inspiring packs',
       u'16 laws', u'16 laws of', u'16 lbs', u'16 leg', u'16 leg training',
       u'16 mid', u'16 mid section', u'16 most', u'16 most advanced',
       u'16 of', u'16 of his', u'16 percent', u'16 pounds',
       u'16 pounds of', u'16 rest', u'16 shoulder', u'16 shoulder crunch',
       u'16 shoulders', u'16 shoulders triceps', u'16 shredded',
       u'16 shredded selfies', u'16 tips', u'16 tips to', u'16 training',
       u'16 training the', u'16 transformation',
       u'16 transformation titans', u'16 upper', u'16 upper body',
       u'16 ways', u'16 ways to', u'16 week', u'16 week metamorphosis',
       u'16 week muscle', u'16 week transformation', u'16 weeks',
       u'16 weeks of', u'16 weeks out', u'165', u'165 pounds',
       u'165 pounds through', u'16th', u'16th birthday',
       u'16th birthday gift', u'17', u'17 abdominal',
       u'17

In [150]:
# separate data into training and test
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)
print X_train.shape
print X_test.shape

(53769, 111045)
(13443, 111045)


In [44]:
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names)
feature_names.shape

(111045,)

In [46]:
#benchmark model
clf = RandomForestClassifier(max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

  app.launch_new_instance()


In [50]:
score = metrics.accuracy_score(y_test, pred)
print score

0.248902774678


In [117]:
y_label = pd.Series(y_train['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [118]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
print X_train.shape
print y_train.shape
print X_valid.shape
print y_valid.shape

(43015, 111045)
(43015,)
(10754, 111045)
(10754,)


In [120]:
y_valid

[6, 5, 4, 0, 5, ..., 3, 6, 5, 6, 6]
Length: 10754
Categories (8, int64): [0, 1, 2, 3, 4, 5, 6, 7]

In [148]:
#xgboost

params = {}
params['objective'] = 'multi:softmax'
params['eta'] = 0.1
params['max_depth'] = 6
params['num_class'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=50, verbose_eval=10)
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)


[0]	train-merror:0.73665	valid-merror:0.758137
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 50 rounds.
[10]	train-merror:0.735441	valid-merror:0.758044
[20]	train-merror:0.736022	valid-merror:0.756928
[30]	train-merror:0.736743	valid-merror:0.755812
[40]	train-merror:0.736627	valid-merror:0.755161
[50]	train-merror:0.736046	valid-merror:0.755998
[60]	train-merror:0.735418	valid-merror:0.756928
[70]	train-merror:0.734976	valid-merror:0.757114
[80]	train-merror:0.734488	valid-merror:0.757951
Stopping. Best iteration:
[38]	train-merror:0.736952	valid-merror:0.754975



In [151]:
y_test = pd.Series(y_test['group'],dtype='category')
y_test.cat.categories = [0,1,2,3,4,5,6,7]

accuracy = float(np.sum(p_test == y_test) / y_test.shape[0])
print accuracy

0.242877333928
