In [1]:
#import libs
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,KFold

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB


  from numpy.core.umath_tests import inner1d


In [2]:
#read in cleaned data that we created from 'Project_3_start.ipynb'
data = pd.read_csv('./data/reddit_final.csv')

In [3]:
data.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
#check for null values and remove them
data.isnull().sum()

title        10
subreddit     0
dtype: int64

In [5]:
#drop nulls
data.dropna(axis=0,inplace=True)

In [6]:
data.head()

Unnamed: 0,title,subreddit
0,on soundcloud you have more listens than follo...,Showerthoughts
1,taking mdma is is just borrowing energy from t...,Showerthoughts
2,being lonely is receiving a notification and t...,Showerthoughts
3,why are all heartbreak quotes about women,Showerthoughts
4,life would be very boring if we couldn’t talk ...,Showerthoughts


In [7]:
#check our class distribution
data['subreddit'].value_counts(normalize=True)

Showerthoughts    0.50015
The_Donald        0.49985
Name: subreddit, dtype: float64

In [8]:
#Set shower thoughts as target 1, trump as 0
targets = {'Showerthoughts':1,'The_Donald':0}

In [9]:
data['target'] = data['subreddit'].map(targets)

In [10]:
data.head()

Unnamed: 0,title,subreddit,target
0,on soundcloud you have more listens than follo...,Showerthoughts,1
1,taking mdma is is just borrowing energy from t...,Showerthoughts,1
2,being lonely is receiving a notification and t...,Showerthoughts,1
3,why are all heartbreak quotes about women,Showerthoughts,1
4,life would be very boring if we couldn’t talk ...,Showerthoughts,1


In [11]:
#recheck our value counts after setting our target
data.target.value_counts()

1    9998
0    9992
Name: target, dtype: int64

In [12]:
#train,test,split, and Countvect for train and test

In [13]:
X = data['title']
y = data['target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=37,stratify=y)

In [15]:
cv = CountVectorizer(stop_words='english',max_features=1000)

In [16]:
train_cv = cv.fit_transform(X_train)

In [17]:
test_cv = cv.transform(X_test)

In [18]:
print(train_cv.shape)
print(test_cv.shape)

(14992, 1000)
(4998, 1000)


In [19]:
kf = KFold(n_splits=3,shuffle=True)

In [20]:
#try logistic regression model:
lrg = LogisticRegression(penalty='l1',C=1)

In [21]:
#check our cross val accuracy score; '.toarray' to expand the sparse matrix
cross_val_score(lrg,train_cv.toarray(),y_train,cv=kf).mean()

0.8259731882880562

In [22]:
#fit our model
lrg.fit(train_cv.toarray(),y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
#check accuracy score on train data and test data

In [24]:
lrg.score(train_cv.toarray(),y_train)

0.862259871931697

In [25]:
lrg.score(test_cv.toarray(),y_test)

0.8329331732693077

In [26]:
df = pd.DataFrame(train_cv.toarray(),columns=cv.get_feature_names())

In [27]:
df.head()

Unnamed: 0,able,absolutely,account,act,acting,action,actual,actually,ad,administration,...,written,wrong,wtf,year,years,yellow,yes,york,young,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
train_cv.toarray().sum(axis=0);

In [29]:
#try random forest
rf = RandomForestClassifier()

In [30]:
cross_val_score(rf,train_cv.toarray(),y_train,cv=kf).mean()

0.8010272883751592

In [31]:
rf.fit(train_cv.toarray(),y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [32]:
print(rf.score(train_cv.toarray(),y_train))
print(rf.score(test_cv.toarray(),y_test))

0.9538420490928495
0.8053221288515406


In [33]:
nb = MultinomialNB()

In [34]:
cross_val_score(nb,train_cv.toarray(),y_train,cv=kf).mean()

0.8243731352857333

In [36]:
rf_params = {
    'n_estimators': [250,300,450],
    #'max_depth': [200,400,800],
    

    
    
}

In [37]:
#trying out tuning of random forrest

In [38]:
gs = GridSearchCV(rf,param_grid=rf_params,n_jobs=-2)

gs.fit(train_cv.toarray(), y_train)
print(gs.best_score_)
gs.best_params_

0.8086979722518677


{'n_estimators': 250}