In [65]:
import pandas as pd
import numpy as np
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [66]:
data = pd.read_csv("message-data.csv")
data.head(5)

Unnamed: 0,message,category
0,what are the pre and post workout stretches,workout related
1,How was my day today?,generic statement
2,How many glasses of water should I drink,diet related
3,How is my protien intake today?,diet related
4,Is Glass noodles better or wheat noodles?,diet related


In [67]:
stop_words = set(stopwords.words('english'))

def remove_punctuation(cols):
    text = cols[0]
    return text.translate(None, string.punctuation)

def remove_stopword(cols):
    text = cols[0]
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_text)

# sample_text = "This is a sample sentence, showing off the stop words filtration"
# filtered_text = remove_punctuation(sample_text)
# print filtered_text
# filtered_text = remove_stopword(filtered_text)
# print filtered_text

data['message-clean'] = data[['message']].apply(remove_punctuation, axis=1)
data['message-clean'] = data[['message-clean']].apply(remove_stopword, axis=1)
data.head(5)

Unnamed: 0,message,category,message-clean
0,what are the pre and post workout stretches,workout related,pre post workout stretches
1,How was my day today?,generic statement,How day today
2,How many glasses of water should I drink,diet related,How many glasses water I drink
3,How is my protien intake today?,diet related,How protien intake today
4,Is Glass noodles better or wheat noodles?,diet related,Is Glass noodles better wheat noodles


In [60]:
# train, test = train_test_split(data, test_size = 0.2)

# train.to_csv('message-data-train.csv', index=False)
# test.to_csv('message-data-test.csv', index=False)

# train = pd.read_csv('message-data-train.csv')
# test = pd.read_csv('message-data-test.csv')

# print train.head(5)
# print test.head(5)

In [68]:
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
clf= SGDClassifier(loss='hinge',penalty ='l2', alpha = 1e-4, n_iter=5)

def train_message_classifier(data_train):
    print "message classification training started...."
    train_text = [i[0] for i in data_train[['message-clean']].values]
    train_category = [i[0] for i in data_train[['category']].values]
    
 
    train_counts = count_vect.fit_transform(train_text)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    clf.fit(train_tfidf, train_category)
    print "message classification training completed...."

    
def test_message_classifier(data_test):
    print "message classification testing started...."
    test_text = [i[0] for i in data_test[['message-clean']].values]
    test_category = [i[0] for i in data_test[['category']].values]
        
    test_counts = count_vect.transform(test_text)
    test_tfidf = tfidf_transformer.transform(test_counts)
    predicted = clf.predict(test_tfidf)

    print "Mean Accuracy :"
    print np.mean(predicted == test_category)
    print "Accuracy Score : "
    print metrics.accuracy_score(test_category,predicted)

    print "Classification report : "
    print metrics.classification_report(test_category, predicted)
    print "Confusion Matrix"
    print metrics.confusion_matrix(test_category, predicted)

    print "message classification testing completed...."

#     data_test_pred = data_test
#     data_test_pred['predicted-category'] = predicted
#     outputFile = 'message-data-test.csv'
#     data_test_pred.to_csv(outputFile, index=False)

In [70]:
train, test = train_test_split(data, test_size = 0.2)

print "training dataset: " + str(len(train))
print "test dataset: " + str(len(test))

train_message_classifier(train)
test_message_classifier(test)

training dataset: 236
test dataset: 59
message classification training started....
message classification training completed....
message classification testing started....
Mean Accuracy :
0.847457627119
Accuracy Score : 
0.847457627119
Classification report : 
                               precision    recall  f1-score   support

                 diet related       0.88      0.91      0.90        33
  fitness concern and queries       1.00      0.67      0.80         6
       fitness expert related       1.00      0.50      0.67         2
            generic statement       1.00      0.75      0.86         8
plan and subscription related       1.00      1.00      1.00         1
              workout related       0.62      0.89      0.73         9

                  avg / total       0.88      0.85      0.85        59

Confusion Matrix
[[30  0  0  0  0  3]
 [ 1  4  0  0  0  1]
 [ 0  0  1  0  0  1]
 [ 2  0  0  6  0  0]
 [ 0  0  0  0  1  0]
 [ 1  0  0  0  0  8]]
message classification t