In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

subreddit_names = ["gadgets", "sports", "gaming", "news", "history", "music", "funny", "movies", "food", "books"]

In [19]:
df = pd.read_csv('../data/train_set.csv')

# Assign an integer index to differentiate subreddit
# The purpose this is because the bayes method call requires integers
def subreddit_index(row):
    if row["subreddit"] == "gadgets":
        return 1;
    elif row["subreddit"] == "sports":
        return 2;
    elif row["subreddit"] == "gaming":
        return 3;
    elif row["subreddit"] == "news":
        return 4;
    elif row["subreddit"] == "history":
        return 5;
    elif row["subreddit"] == "music":
        return 6;
    elif row["subreddit"] == "funny":
        return 7;
    elif row["subreddit"] == "movies":
        return 8;
    elif row["subreddit"] == "food":
        return 9;
    else:
        return 10;

df["subreddit_index"] = df.apply(lambda row: subreddit_index(row), axis=1)
df = df[df.body != 'deleted']
df = df[df.comment_under_post != False]

df

Unnamed: 0,body,parent_id,subreddit,comment_under_post,subreddit_index
0,I liked the style of the 2004ish models but I...,t1_cqvz0ag,funny,True,7
1,deleted,t1_crg25oz,news,True,4
2,Either or rather I think you made a new word ...,t1_cr9egqg,books,True,10
4,Employers are not your parents If you agree o...,t1_crio5vc,news,True,4
5,I d like to hear more about these race discuss...,t1_crb6kmt,news,True,4
6,For Spanish American War Buffalo Soldiers di...,t1_cr1q2iy,movies,True,8
7,gt So to concern troll a bit How the hell d...,t1_crgq3me,news,True,4
9,Me too The guys that put liar ahead signs ...,t1_crq0vbh,gaming,True,3
10,None of that makes sense but thanks for coming...,t1_crfuy6m,news,True,4
11,She looks like a 20 year old pair of leather b...,t1_crg7kii,movies,True,8


In [20]:
#build a matrix of words and counts of those words 

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df.body)
X_train_counts

<1229236x266184 sparse matrix of type '<type 'numpy.int64'>'
	with 26498068 stored elements in Compressed Sparse Row format>

In [21]:
#Takes into account comment lenghth, transforms matrix into frequency of a particular word, not simply occurence
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1229236, 266184)

In [22]:
#fit the data and integer subreddit index to the Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train_tfidf, df.subreddit_index)
# clf = MultinomialNB().fit(X_train_counts, df.subreddit_index)

In [23]:
#lets see how accurate our model is

#first lets build a pipeline so we can train the model with one command
#basically the same thing we did above just within a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
# text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])

#fit the training subset to our pipeline
text_clf = text_clf.fit(df.body, df.subreddit_index)

#read in test and add in the subreddit_index column
df_test = pd.read_csv('../data/test_set.csv')
df_test["subreddit_index"] = df.apply(lambda row: subreddit_index(row), axis=1)
df_test = df[df.body != 'deleted']
df_test = df[df.comment_under_post != False]




In [24]:
#run our model to predict the test set and compute accuracy
predicted = text_clf.predict(df_test.body)
print np.mean(predicted == df_test.subreddit_index)

0.5850853701


In [25]:
from sklearn import metrics
print(metrics.classification_report(df_test.subreddit_index, predicted, target_names = subreddit_names))


             precision    recall  f1-score   support

    gadgets       0.62      0.00      0.00     13452
     sports       0.71      0.00      0.01     27414
     gaming       0.87      0.34      0.49    180448
       news       0.58      0.81      0.68    352291
    history       0.50      0.00      0.00     13134
      music       0.50      0.70      0.59    380036
      funny       0.76      0.51      0.62    205288
     movies       0.85      0.02      0.04     24198
       food       0.89      0.01      0.03     32975

avg / total       0.65      0.59      0.55   1229236



In [26]:
df_test

Unnamed: 0,body,parent_id,subreddit,comment_under_post,subreddit_index
0,I liked the style of the 2004ish models but I...,t1_cqvz0ag,funny,True,7
1,deleted,t1_crg25oz,news,True,4
2,Either or rather I think you made a new word ...,t1_cr9egqg,books,True,10
4,Employers are not your parents If you agree o...,t1_crio5vc,news,True,4
5,I d like to hear more about these race discuss...,t1_crb6kmt,news,True,4
6,For Spanish American War Buffalo Soldiers di...,t1_cr1q2iy,movies,True,8
7,gt So to concern troll a bit How the hell d...,t1_crgq3me,news,True,4
9,Me too The guys that put liar ahead signs ...,t1_crq0vbh,gaming,True,3
10,None of that makes sense but thanks for coming...,t1_crfuy6m,news,True,4
11,She looks like a 20 year old pair of leather b...,t1_crg7kii,movies,True,8
