In [1]:
#trying out removing of possible identifying keywords from posts such as "trump' or 'shower'

In [3]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,KFold

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB


In [4]:
data = pd.read_csv('./data/reddit_final.csv')

In [5]:
data.drop('Unnamed: 0',axis=1,inplace=True)

In [6]:
data.isnull().sum()

title        10
subreddit     0
dtype: int64

In [7]:
data.dropna(axis=0,inplace=True)

In [8]:
data.head()

Unnamed: 0,title,subreddit
0,on soundcloud you have more listens than follo...,Showerthoughts
1,taking mdma is is just borrowing energy from t...,Showerthoughts
2,being lonely is receiving a notification and t...,Showerthoughts
3,why are all heartbreak quotes about women,Showerthoughts
4,life would be very boring if we couldn’t talk ...,Showerthoughts


In [9]:
data['subreddit'].value_counts(normalize=True)

Showerthoughts    0.50015
The_Donald        0.49985
Name: subreddit, dtype: float64

In [10]:
#Set shower thoughts as target 1, trump as 0
targets = {'Showerthoughts':1,'The_Donald':0}

In [11]:
data['target'] = data['subreddit'].map(targets)

In [12]:
data['target'].value_counts(normalize=True)

1    0.50015
0    0.49985
Name: target, dtype: float64

In [13]:
data[['target','subreddit']].head()

Unnamed: 0,target,subreddit
0,1,Showerthoughts
1,1,Showerthoughts
2,1,Showerthoughts
3,1,Showerthoughts
4,1,Showerthoughts


In [14]:
#checking on the keywords in each subreddit catagory

In [15]:
data[data['title'].str.contains("shower")].head()

Unnamed: 0,title,subreddit,target
13,even a random sentance can be a showerthought ...,Showerthoughts,1
23,people who are truely devoted to showerthought...,Showerthoughts,1
32,people who are truly devoted to showerthoughts...,Showerthoughts,1
41,showers are domesticated rain,Showerthoughts,1
76,after i shower for the evening and put my paja...,Showerthoughts,1


In [16]:
data[(data['subreddit']=='The_Donald') & (data['title'].str.contains('trump'))].head()

Unnamed: 0,title,subreddit,target
10000,get free trump logos on your golf balls,The_Donald,0
10007,stephen miller on face the nation: trump absol...,The_Donald,0
10011,trump sings hello darling by conway twitty,The_Donald,0
10015,trump should have spaceforce look into this la...,The_Donald,0
10031,fitton:testifies on clinton foundation hillary...,The_Donald,0


In [17]:
#remove Trump,trump, and shower related terms

In [18]:
data['title'] = data['title'].map(
    lambda x: re.sub(r'([Ss]hower|[Ss]howerthoughts|[Tt]rump|[Dd]onald)','',x)
)

In [19]:
data[data['title'].str.contains('trump')]

Unnamed: 0,title,subreddit,target


In [20]:
X = data['title']
y = data['target']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=37,stratify=y)

In [22]:
kf = KFold(n_splits=3,shuffle=True)

In [23]:
cv = CountVectorizer(stop_words='english',max_features=6000,ngram_range=(1,3))

In [24]:
train_cv = cv.fit_transform(X_train)

In [25]:
test_cv = cv.transform(X_test)

In [26]:
print(train_cv.shape)
print(test_cv.shape)

(14992, 6000)
(4998, 6000)


In [27]:
lrg = LogisticRegression(penalty='l2',C=1)

In [28]:
cross_val_score(lrg,train_cv.toarray(),y_train,cv=kf).mean()

0.8589913985739717

In [29]:
lrg.fit(train_cv.toarray(),y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
lrg.score(train_cv.toarray(),y_train)

0.9387673425827108

In [31]:
lrg.score(test_cv.toarray(),y_test)

0.8711484593837535

In [32]:
df = pd.DataFrame(train_cv.toarray(),columns=cv.get_feature_names())

In [34]:
df.head()

Unnamed: 0,abbreviation,abc,ability,able,abortion,abrams,absolute,absolute best,absolutely,absurd,...,yup yup,zero,zinke,zombie,zombie apocalypse,zombies,zone,zones,zoo,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
train_cv.toarray().sum(axis=0);

In [36]:
rf = RandomForestClassifier()

In [37]:
rf.fit(train_cv.toarray(),y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
print(rf.score(train_cv.toarray(),y_train))
print(rf.score(test_cv.toarray(),y_test))

0.9816568836712913
0.8195278111244498


In [39]:
nb = MultinomialNB()

In [40]:
cross_val_score(nb,train_cv.toarray(),y_train,cv=kf).mean()

0.8676620805082756

In [41]:
rf_params = {
    'n_estimators': [250,300,450],
    #'max_depth': [200,400,800],
    

    
    
}

In [181]:
# gs = GridSearchCV(rf,param_grid=rf_params,n_jobs=-2)

# gs.fit(train_cv.toarray(), y_train)
# print(gs.best_score_)
# gs.best_params_