In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn import preprocessing 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from __future__ import division
from sklearn.decomposition import TruncatedSVD

In [2]:
train = pd.read_table("data_content", header=None, skip_blank_lines=False)
label = pd.read_table("label", header=None, dtype='category', skip_blank_lines=False)

In [3]:
train.columns = ['question']
label.columns = ['group']

In [4]:
print train.shape
print label.shape

(67213, 1)
(67213, 1)


In [5]:
ratio = 0.2
# Separate each muscle group data
chest = train[label['group']=='Chest']
chest_label = label[label['group']=='Chest']
chest_len = int(chest.shape[0]*ratio)

shoulder = train[label['group']=='Shoulders']
shoulder_len = int(shoulder.shape[0]*ratio)
shoulder_label = label[label['group']=='Shoulders']

back = train[label['group']=='Back']
back_label = label[label['group']=='Back']
back_len = int(back.shape[0]*ratio)

leg = train[label['group']=='Leg']
leg_label = label[label['group']=='Leg']
leg_len = int(leg.shape[0]*ratio)

tricep = train[label['group']=='Triceps']
tricep_label = label[label['group']=='Triceps']
tricep_len = int(tricep.shape[0]*ratio)

bicep = train[label['group']=='Biceps']
bicep_label = label[label['group']=='Biceps']
bicep_len = int(bicep.shape[0]*ratio)

ab = train[label['group']=='Abs']
ab_label = label[label['group']=='Abs']
ab_len = int(ab.shape[0]*ratio)

glute = train[label['group']=='Glutes']
glute_label = label[label['group']=='Glutes']
glute_len = int(glute.shape[0]*ratio)

In [6]:
# Take only the top ratio% of each group of data
train_data = pd.concat([chest[0:chest_len], shoulder[0:shoulder_len], back[0:back_len], 
                        leg[0:leg_len], tricep[0:tricep_len], bicep[0:bicep_len], ab[0:ab_len], glute[0: glute_len]])
print train_data.shape
train_label = pd.concat([chest_label[0:chest_len], shoulder_label[0:shoulder_len], back_label[0:back_len], 
                   leg_label[0:leg_len], tricep_label[0:tricep_len], bicep_label[0:bicep_len], ab_label[0:ab_len],
                         glute_label[0: glute_len]])
print train_label.shape

(13440, 1)
(13440, 1)


In [7]:
y_label = pd.Series(train_label['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [8]:
#label binizer
lb = preprocessing.LabelBinarizer()
lb.fit([0,1,2,3,4,5,6,7])
label = lb.transform(y_label)

In [9]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2),stop_words="english")
data = vectorizer.fit_transform(train_data['question'])

In [15]:
# reduce features using SVD
SVD = TruncatedSVD(n_components=100, n_iter=5, random_state=0)
train = SVD.fit_transform(data)

In [27]:
# train test split
train_data, testX, train_label, testY = train_test_split(train, label, test_size=0.2, random_state=0)
print train_data.shape
print train_label.shape

(10752, 100)
(10752, 8)


In [28]:
# tensorflow setup
numFeatures = train_data.shape[1]
numLabels = 8
numEpochs = 2000
learningRate = tf.train.exponential_decay(learning_rate=0.005,
                                          global_step= 1,
                                          decay_steps=train_data.shape[0],
                                          decay_rate= 0.9,
                                          staircase=True)


X = tf.placeholder(tf.float32, [None, numFeatures])
Y = tf.placeholder(tf.float32, [None, numLabels])

weights = tf.Variable(tf.random_normal([numFeatures,numLabels],
                                       mean=0,
                                       stddev=(np.sqrt(6/numFeatures+
                                                         numLabels+1)),
                                       name="weights"))

bias = tf.Variable(tf.random_normal([1,numLabels],
                                    mean=0,
                                    stddev=(np.sqrt(6/numFeatures+numLabels+1)),
                                    name="bias"))

In [29]:
# tensorflow operation
init = tf.global_variables_initializer()

apply_weights_OP = tf.matmul(X, weights, name="apply_weights")
add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias") 
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")

cost_OP = tf.nn.l2_loss(activation_OP-Y, name="squared_error_cost")
training_OP = tf.train.GradientDescentOptimizer(learningRate).minimize(cost_OP)

In [30]:
# tensorflow session
sess = tf.Session()
sess.run(init)

correct_predictions_OP = tf.equal(tf.argmax(activation_OP,1),tf.argmax(Y,1))
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, "float"))


cost = 0
diff = 1
epoch_values=[]
accuracy_values=[]
cost_values=[]

# Training epochs
for i in range(numEpochs):
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence."%diff)
        break
    else:
        # Run training step
        step = sess.run(training_OP, feed_dict={X: train_data, Y: train_label})
        # Report occasional stats
        if i % 10 == 0:
            # Add epoch to epoch_values
            epoch_values.append(i)
            # Generate accuracy stats on test data
            train_accuracy, newCost = sess.run(
                [accuracy_OP, cost_OP], 
                feed_dict={X: train_data, Y: train_label}
            )
            # Add accuracy to live graphing variable
            accuracy_values.append(train_accuracy)
            # Add cost to live graphing variable
            cost_values.append(newCost)
            
            # Re-assign values for variables
            diff = abs(newCost - cost)
            cost = newCost

            #generate print statements
            print("step %d, training accuracy %g"%(i, train_accuracy))
            print("step %d, cost %g"%(i, newCost))
            print("step %d, change in cost %g"%(i, diff))

            

# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy_OP, 
                                                     feed_dict={X: testX, 
                                                                Y: testY})))

step 0, training accuracy 0.0998884
step 0, cost 10241.2
step 0, change in cost 10241.2
step 10, training accuracy 0.215309
step 10, cost 5036.02
step 10, change in cost 5205.21
step 20, training accuracy 0.218564
step 20, cost 5018.44
step 20, change in cost 17.5718
step 30, training accuracy 0.220796
step 30, cost 5001.96
step 30, change in cost 16.48
step 40, training accuracy 0.22247
step 40, cost 4985.9
step 40, change in cost 16.0693
step 50, training accuracy 0.224609
step 50, cost 4970.14
step 50, change in cost 15.7593
step 60, training accuracy 0.22526
step 60, cost 4954.7
step 60, change in cost 15.4321
step 70, training accuracy 0.227679
step 70, cost 4939.64
step 70, change in cost 15.0684
step 80, training accuracy 0.229725
step 80, cost 4925.02
step 80, change in cost 14.6123
step 90, training accuracy 0.231213
step 90, cost 4910.94
step 90, change in cost 14.083
step 100, training accuracy 0.232608
step 100, cost 4897.43
step 100, change in cost 13.5098
step 110, traini