In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Subreddits covered by the classifier, listed in class-label order.
subreddit_names = [
    "gadgets", "sports", "gaming", "news", "history",
    "music", "funny", "movies", "food", "books",
]
# Integer class labels 1..10, one per entry of subreddit_names (same order).
labels = list(range(1, len(subreddit_names) + 1))


In [3]:
# Load the toy training split (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../data/toy_train_set.csv')

# Integer class label for each subreddit name. These integers are what the
# notebook passes as the target to the naive Bayes fit calls.
_SUBREDDIT_LABELS = {
    "gadgets": 1,
    "sports": 2,
    "gaming": 3,
    "news": 4,
    "history": 5,
    "music": 6,
    "funny": 7,
    "movies": 8,
    "food": 9,
    "books": 10,
}


def subreddit_index(row):
    """Return the integer class label for a row's subreddit.

    Parameters
    ----------
    row : mapping-like
        Any object indexable with the key "subreddit" (for example a
        DataFrame row handed over by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        The label from ``_SUBREDDIT_LABELS`` for a listed name. Every other
        value falls back to 10, the same label as "books".
    """
    # The default of 10 is the catch-all: unrecognised names are not rejected.
    return _SUBREDDIT_LABELS.get(row["subreddit"], 10)

# Attach the integer class label to every comment (row-wise lookup).
df["subreddit_index"] = df.apply(subreddit_index, axis=1)

# Keep only rows whose comment_under_post flag is not False.
# (This filtering was not put in the Bag_Of_Words function.)
df = df.loc[df["comment_under_post"].ne(False)]

df

Unnamed: 0,body,parent_id,subreddit,comment_under_post,subreddit_index
1,deleted,t1_cqzjtdg,gadgets,True,1
5,There s more going on The radio receives a ti...,t1_cqy5taf,gadgets,True,1
9,deleted,t1_crb5a53,news,True,4
10,deleted,t1_crdptvf,funny,True,7
11,I ma print this quote on laminated cards and h...,t1_crgmjv2,news,True,4
12,Also if you fail and get into an accident you ...,t1_crcd5af,gaming,True,3
13,He was great in Kingsman,t1_cr2vsm1,movies,True,8
15,Horny isn t an accurate comparison since the...,t1_crj5jcu,funny,True,7
17,Optional SBS configuration with a Salvo 12 Th...,t1_cqxnddd,news,True,4
18,You didn t even you didn t even watch th...,t1_cr329yc,news,True,4


In [4]:
# Build the bag-of-words matrix: one row per comment, one column per
# vocabulary term, each cell holding that term's count in the comment.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df["body"])

# (number of comments, vocabulary size)
X_train_counts.shape



(1367, 6798)

In [5]:
# Re-weight the raw counts with tf-idf so that each entry reflects how
# frequent a word is within its comment (accounting for comment length),
# not simply how many times it occurs.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

# The shape is unchanged by the re-weighting.
X_train_tfidf.shape

(1367, 6798)

In [6]:
# Train Multinomial Naive Bayes on the tf-idf features, using the integer
# subreddit index of each comment as the target.
clf = MultinomialNB()
clf.fit(X_train_tfidf, df["subreddit_index"])
# Alternative kept for reference: train on the raw counts instead of tf-idf.
# clf = MultinomialNB().fit(X_train_counts, df.subreddit_index)

In [7]:
# Let's see how accurate our model is.

# First build a Pipeline so the model can be trained with one command:
# the same vectorize -> tf-idf -> naive Bayes steps performed one at a time
# in the cells above, chained into a single estimator.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
# Variant without the tf-idf step, kept for reference:
# text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

# Fit the pipeline on the training comments and their integer labels.
text_clf = text_clf.fit(df.body, df.subreddit_index)

# Read in the test set and add the subreddit_index column, exactly as was
# done for the training frame.
df_test = pd.read_csv('../data/toy_test_set.csv')
df_test["subreddit_index"] = df_test.apply(subreddit_index, axis=1)
# NOTE(review): the disabled line below selects from `df` (the training
# frame), not `df_test`; enabling it as written would score the model on its
# own training data.
# df_test = df[df.body != 'deleted']
df_test = df_test[df_test.comment_under_post != False]

# Predict the test set and report accuracy (fraction of exact label matches).
predicted = text_clf.predict(df_test.body)
# print(...) with a single argument behaves the same on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(np.mean(predicted == df_test.subreddit_index))

0.410938654841


In [26]:
# Per-subreddit precision / recall / F1 for the test-set predictions.
# NOTE(review): the saved output under this cell may be stale. Its total
# support (1367) equals the training row count shown earlier, its average
# recall (0.61) differs from the accuracy printed by the evaluation cell
# (0.41), and the cell's execution count is out of sequence. Re-run after a
# kernel restart to confirm.
print(
    metrics.classification_report(
        df_test["subreddit_index"],
        predicted,
        labels=labels,
        target_names=subreddit_names,
    )
)


             precision    recall  f1-score   support

    gadgets       0.00      0.00      0.00        17
     sports       0.00      0.00      0.00        32
     gaming       1.00      0.07      0.13       203
       news       0.61      0.98      0.75       384
    history       0.00      0.00      0.00        16
      music       0.00      0.00      0.00         0
      funny       0.58      0.92      0.71       445
     movies       1.00      0.13      0.24       216
       food       0.00      0.00      0.00        26
      books       0.00      0.00      0.00        28

avg / total       0.67      0.61      0.50      1367

