In [2]:
import pandas as pd
import numpy as np
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

## Reading the message dataset
The first thing we are going to do is to read in the dataset using the Pandas read_csv() function. We will put this data into a Pandas DataFrame, called "message_data".

In [14]:
message_data = pd.read_csv("message-data.csv")
message_data.head(5)

Unnamed: 0,message,category
0,what are the pre and post workout stretches,workout related
1,How was my day today?,generic statement
2,How many glasses of water should I drink,diet related
3,How is my protien intake today?,diet related
4,Is Glass noodles better or wheat noodles?,diet related


In [15]:
message_data['category'].value_counts()

diet related                     142
workout related                   64
generic statement                 44
fitness concern and queries       30
fitness expert related             8
plan and subscription related      7
Name: category, dtype: int64

## Data Description:
Message - Message text sent by the user

category - Target classification category of the mesaage. I have manually tagged the category for this dataset. The categories are as below
- diet related
- workout related
- fitness expert related
- plan and subscription related
- fitness concern and queries
- generic statement

## Removing punctuations and stopwords from the message text saving the clean text in message-clean column of the dataframe

In [5]:
stop_words = set(stopwords.words('english'))

# removing punctuations 
def remove_punctuation(cols):
    text = cols[0]
    return text.translate(None, string.punctuation)

# removing stopwords 
def remove_stopword(cols):
    text = cols[0]
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_text)

# sample_text = "This is a sample sentence, showing off the stop words filtration"
# filtered_text = remove_punctuation(sample_text)
# print filtered_text
# filtered_text = remove_stopword(filtered_text)
# print filtered_text

# putting the clean text in 'message-clean' column
message_data['message-clean'] = message_data[['message']].apply(remove_punctuation, axis=1)
message_data['message-clean'] = message_data[['message-clean']].apply(remove_stopword, axis=1)
message_data.head(5)

Unnamed: 0,message,category,message-clean
0,what are the pre and post workout stretches,workout related,pre post workout stretches
1,How was my day today?,generic statement,How day today
2,How many glasses of water should I drink,diet related,How many glasses water I drink
3,How is my protien intake today?,diet related,How protien intake today
4,Is Glass noodles better or wheat noodles?,diet related,Is Glass noodles better wheat noodles


## Training the message clasifier model and testing the trained model

In [6]:
# transforning the clean message text to tf-idf matrix
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
# Stochastic Gradient Decent used to build the classifier model
clf= SGDClassifier(loss='hinge',penalty ='l2', alpha = 1e-4, n_iter=5)

# training the model
def train_message_classifier(data_train):
    print "message classification training started...."
    train_text = [i[0] for i in data_train[['message-clean']].values]
    train_category = [i[0] for i in data_train[['category']].values]
    
    train_counts = count_vect.fit_transform(train_text)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    clf.fit(train_tfidf, train_category)
    print "message classification training completed...."

# testing the model   
def test_message_classifier(data_test):
    print "message classification testing started...."
    test_text = [i[0] for i in data_test[['message-clean']].values]
    test_category = [i[0] for i in data_test[['category']].values]
        
    test_counts = count_vect.transform(test_text)
    test_tfidf = tfidf_transformer.transform(test_counts)
    predicted = clf.predict(test_tfidf)

    # printing the performance of the classifier on test data
    print "Mean Accuracy :"
    print np.mean(predicted == test_category)
    print "Accuracy Score : "
    print metrics.accuracy_score(test_category,predicted)

    print "Classification report : "
    print metrics.classification_report(test_category, predicted)
    print "Confusion Matrix"
    print metrics.confusion_matrix(test_category, predicted)

    print "message classification testing completed...."

In [7]:
# Random splitting of the dataset to training and test
train, test = train_test_split(message_data, test_size = 0.2)

print "training dataset: " + str(len(train))
print "test dataset: " + str(len(test))

train_message_classifier(train)
test_message_classifier(test)

training dataset: 236
test dataset: 59
message classification training started....
message classification training completed....
message classification testing started....
Mean Accuracy :
0.796610169492
Accuracy Score : 
0.796610169492
Classification report : 
                               precision    recall  f1-score   support

                 diet related       0.82      0.97      0.89        29
  fitness concern and queries       0.86      0.67      0.75         9
       fitness expert related       0.00      0.00      0.00         1
            generic statement       1.00      0.73      0.84        11
plan and subscription related       0.00      0.00      0.00         1
              workout related       0.56      0.62      0.59         8

                  avg / total       0.80      0.80      0.79        59

Confusion Matrix
[[28  0  0  0  1  0]
 [ 0  6  0  0  0  3]
 [ 0  0  0  0  0  1]
 [ 3  0  0  8  0  0]
 [ 0  1  0  0  0  0]
 [ 3  0  0  0  0  5]]
message classification t

  'precision', 'predicted', average, warn_for)
