In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline  
from sklearn import preprocessing 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from __future__ import division
from sklearn.decomposition import TruncatedSVD
from sklearn.utils import shuffle
from tensorflow.contrib import rnn

In [2]:
train = pd.read_table("data_content", header=None, skip_blank_lines=False)
label = pd.read_table("label", header=None, dtype='category', skip_blank_lines=False)

In [3]:
train.columns = ['question']
label.columns = ['group']

In [4]:
print train.shape
print label.shape

(67213, 1)
(67213, 1)


In [4]:
ratio = 0.2
# Separate each muscle group data
chest = train[label['group']=='Chest']
chest_label = label[label['group']=='Chest']
chest_len = int(chest.shape[0]*ratio)

shoulder = train[label['group']=='Shoulders']
shoulder_len = int(shoulder.shape[0]*ratio)
shoulder_label = label[label['group']=='Shoulders']

back = train[label['group']=='Back']
back_label = label[label['group']=='Back']
back_len = int(back.shape[0]*ratio)

leg = train[label['group']=='Leg']
leg_label = label[label['group']=='Leg']
leg_len = int(leg.shape[0]*ratio)

tricep = train[label['group']=='Triceps']
tricep_label = label[label['group']=='Triceps']
tricep_len = int(tricep.shape[0]*ratio)

bicep = train[label['group']=='Biceps']
bicep_label = label[label['group']=='Biceps']
bicep_len = int(bicep.shape[0]*ratio)

ab = train[label['group']=='Abs']
ab_label = label[label['group']=='Abs']
ab_len = int(ab.shape[0]*ratio)

glute = train[label['group']=='Glutes']
glute_label = label[label['group']=='Glutes']
glute_len = int(glute.shape[0]*ratio)

In [5]:
# Take only the top ratio% of each group of data
train_data = pd.concat([chest[0:chest_len], shoulder[0:shoulder_len], back[0:back_len], 
                        leg[0:leg_len], tricep[0:tricep_len], bicep[0:bicep_len], ab[0:ab_len], glute[0: glute_len]])
print train_data.shape
train_label = pd.concat([chest_label[0:chest_len], shoulder_label[0:shoulder_len], back_label[0:back_len], 
                   leg_label[0:leg_len], tricep_label[0:tricep_len], bicep_label[0:bicep_len], ab_label[0:ab_len],
                         glute_label[0: glute_len]])
print train_label.shape

(13440, 1)
(13440, 1)


In [6]:
y_label = pd.Series(train_label['group'],dtype='category')
y_label.cat.categories 
y_label.cat.categories = [0,1,2,3,4,5,6,7]

In [7]:
#label binizer
lb = preprocessing.LabelBinarizer()
lb.fit([0,1,2,3,4,5,6,7])
label = lb.transform(y_label)

In [8]:
#initialize TFIDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2),stop_words="english")
data = vectorizer.fit_transform(train_data['question'])

In [10]:
# reduce features using SVD
SVD = TruncatedSVD(n_components=100, n_iter=5, random_state=0)
train = SVD.fit_transform(data)

In [11]:
# shuffle data
train, label = shuffle(train, label, random_state=0)

In [12]:
# train test split
train_data, testX, train_label, testY = train_test_split(train, label, test_size=0.2, random_state=0)
print train_data.shape
print train_label.shape

(10752, 100)
(10752, 8)


In [13]:
# tensorflow setup

# Parameters
learning_rate = 0.001
training_iters = 10000
batch_size = 128
display_step = 10

# Network Parameters
n_input = train_data.shape[1]
n_hidden = 128 # hidden layer num of features
n_steps = 1
n_classes = 8

# tf Graph input
x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])

# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes]))
}


def RNN(x, weights, biases):

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)

    # Unstack to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, n_steps, 1)

    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)

    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

pred = RNN(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


In [14]:
train_data = train_data.reshape(10752,1,100)

In [15]:
# tensorflow session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)


cost_a = 0
diff = 1
epoch_values=[]
accuracy_values=[]
cost_values=[]

# Training epochs
for i in range(training_iters):
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence."%diff)
        break
    else:
        # Run training step
        step = sess.run(optimizer, feed_dict={x: train_data, y: train_label})
        # Report occasional stats
        if i % 10 == 0:
            # Add epoch to epoch_values
            epoch_values.append(i)
            # Generate accuracy stats on test data
            train_accuracy, newCost = sess.run(
                [accuracy, cost], 
                feed_dict={x: train_data, y: train_label}
            )
            # Add accuracy to live graphing variable
            accuracy_values.append(train_accuracy)
            # Add cost to live graphing variable
            cost_values.append(newCost)
            
            # Re-assign values for variables
            diff = abs(newCost - cost_a)
            cost_a = newCost

            #generate print statements
            print("step %d, training accuracy %g"%(i, train_accuracy))
            print("step %d, cost %g"%(i, newCost))
            print("step %d, change in cost %g"%(i, diff))

            

# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy, 
                                                     feed_dict={x: testX, 
                                                                y: testY})))

step 0, training accuracy 0.0912388
step 0, cost 2.83752
step 0, change in cost 2.83752
step 10, training accuracy 0.0916109
step 10, cost 2.6932
step 10, change in cost 0.144315
step 20, training accuracy 0.0917969
step 20, cost 2.55473
step 20, change in cost 0.138469
step 30, training accuracy 0.0963542
step 30, cost 2.42269
step 30, change in cost 0.132039
step 40, training accuracy 0.118676
step 40, cost 2.30279
step 40, change in cost 0.119906
step 50, training accuracy 0.19708
step 50, cost 2.20185
step 50, change in cost 0.100939
step 60, training accuracy 0.249349
step 60, cost 2.12315
step 60, change in cost 0.078701
step 70, training accuracy 0.257161
step 70, cost 2.06384
step 70, change in cost 0.0593033
step 80, training accuracy 0.257812
step 80, cost 2.01801
step 80, change in cost 0.0458345
step 90, training accuracy 0.257626
step 90, cost 1.9817
step 90, change in cost 0.0363147
step 100, training accuracy 0.25865
step 100, cost 1.95374
step 100, change in cost 0.0279

ValueError: Cannot feed value of shape (2688, 100) for Tensor u'Placeholder:0', which has shape '(?, 1, 100)'

In [16]:
testX = testX.reshape(2688,1,100)
print("final accuracy on test set: %s" %str(sess.run(accuracy, 
                                                     feed_dict={x: testX, 
                                                                y: testY})))

final accuracy on test set: 0.0822173
