In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from __future__ import division



In [3]:
train = pd.read_table("train_data", header=None)
label = pd.read_table("label", header=None, dtype='category')

In [4]:
train.columns = ['question']
label.columns = ['group']

In [5]:
print train.shape
print label.shape

(67212, 1)
(67212, 1)


In [28]:
# Separate each muscle group data
chest = train[label['group']=='Chest']
chest_label = label[label['group']=='Chest']
chest_len = int(chest.shape[0]*0.6)

shoulder = train[label['group']=='Shoulders']
shoulder_len = int(shoulder.shape[0]*0.6)
shoulder_label = label[label['group']=='Shoulders']

back = train[label['group']=='Back']
back_label = label[label['group']=='Back']
back_len = int(back.shape[0]*0.6)

leg = train[label['group']=='Leg']
leg_label = label[label['group']=='Leg']
leg_len = int(leg.shape[0]*0.6)

tricep = train[label['group']=='Triceps']
tricep_label = label[label['group']=='Triceps']
tricep_len = int(tricep.shape[0]*0.6)

bicep = train[label['group']=='Biceps']
bicep_label = label[label['group']=='Biceps']
bicep_len = int(bicep.shape[0]*0.6)

ab = train[label['group']=='Abs']
ab_label = label[label['group']=='Abs']
ab_len = int(ab.shape[0]*0.6)

glute = train[label['group']=='Glutes']
glute_label = label[label['group']=='Glutes']
glute_len = int(glute.shape[0]*0.6)

In [29]:
# Take only the top 60% of each group of data
train_data = pd.concat([chest[0:chest_len], shoulder[0:shoulder_len], back[0:back_len], 
                        leg[0:leg_len], tricep[0:tricep_len], bicep[0:bicep_len], ab[0:ab_len], glute[0: glute_len]])
print train_data.shape
train_label = pd.concat([chest_label[0:chest_len], shoulder_label[0:shoulder_len], back_label[0:back_len], 
                   leg_label[0:leg_len], tricep_label[0:tricep_len], bicep_label[0:bicep_len], ab_label[0:ab_len],
                         glute_label[0: glute_len]])
print train_label.shape

(40325, 1)
(40325, 1)


In [81]:
label['group'].value_counts()

Back         16995
Leg          10390
Shoulders     8809
Chest         8685
Triceps       6864
Biceps        6798
Abs           6046
Glutes        2625
Name: group, dtype: int64

In [64]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 4),stop_words="english")
data = vectorizer.fit_transform(train_data['question'])

In [65]:
print vectorizer.get_stop_words()

frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'nei

In [68]:
# show top important words
indices = np.argsort(vectorizer.idf_)[::-1]
features = vectorizer.get_feature_names()
top_n = 15
top_features = [features[i] for i in indices[:top_n]]
print top_features

[u'killer quad day', u'need lose', u'need know gluten', u'need know glutamine', u'need know begin', u'need know anabolic steroids', u'need know anabolic', u'bodybuilder week augie kryger', u'need gym bag', u'need gym', u'need fix right', u'need fix', u'need evaluation', u'need eat loads food', u'need eat loads']


In [69]:
# separate data into training and test
X_train, X_test, y_train, y_test = train_test_split(data, train_label, test_size=0.2, random_state=0)
print X_train.shape
print X_test.shape

(32260, 89989)
(8065, 89989)


In [70]:
feature_names = vectorizer.get_feature_names()
feature_names = np.asarray(feature_names)
feature_names.shape

(89989,)

In [76]:
#benchmark model
clf = RandomForestClassifier(max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [77]:
score = metrics.accuracy_score(y_test, pred)
print score

0.244265344079


In [71]:
y_label = pd.Series(y_train['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [72]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_label, test_size=0.2, random_state=0)
print X_train.shape
print y_train.shape
print X_valid.shape
print y_valid.shape

(25808, 89989)
(25808,)
(6452, 89989)
(6452,)


In [73]:
y_valid

[7, 1, 2, 1, 7, ..., 1, 6, 3, 5, 1]
Length: 6452
Categories (8, int64): [0, 1, 2, 3, 4, 5, 6, 7]

In [74]:
#xgboost

params = {}
params['objective'] = 'multi:softmax'
params['eta'] = 0.1
params['max_depth'] = 6
params['num_class'] = 8

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
bst = xgb.train(params, d_train, 200, watchlist, early_stopping_rounds=50, verbose_eval=10)
d_test = xgb.DMatrix(X_test)
p_test = bst.predict(d_test)


[0]	train-merror:0.735237	valid-merror:0.753875
Multiple eval metrics have been passed: 'valid-merror' will be used for early stopping.

Will train until valid-merror hasn't improved in 50 rounds.
[10]	train-merror:0.732292	valid-merror:0.751395
[20]	train-merror:0.734772	valid-merror:0.747365
[30]	train-merror:0.736012	valid-merror:0.744885
[40]	train-merror:0.735896	valid-merror:0.744885
[50]	train-merror:0.735315	valid-merror:0.745195
[60]	train-merror:0.734578	valid-merror:0.745815
[70]	train-merror:0.733377	valid-merror:0.747365
[80]	train-merror:0.732215	valid-merror:0.74814
Stopping. Best iteration:
[33]	train-merror:0.735896	valid-merror:0.74473



In [75]:
y_test = pd.Series(y_test['group'],dtype='category')
y_test.cat.categories = [0,1,2,3,4,5,6,7]

accuracy = float(np.sum(p_test == y_test) / y_test.shape[0])
print accuracy

0.2332300062
