In [40]:
import pandas as pd
import json
import requests
import time
import matplotlib.pyplot as plt

In [3]:
headers={'User-agent':'E bot 1.0'}
after=None
shower_thoughts=[]

url = "https://www.reddit.com/r/Showerthoughts/.json"

for i in range(20):
    if after==None:
        params={}
    else:
        params={'after':after}
    
    res=requests.get(url=url,params=params,headers=headers)
    if res.status_code==200:
        sh_thoughts_json=res.json()
        shower_thoughts.extend(sh_thoughts_json['data']['children'])
        after=sh_thoughts_json['data']['after']
    else:
        print(res.status_code)
        break
    
    time.sleep(2)

In [14]:
# the range of the iteration starts from 2 because the first 2 
# elements are not posts but instructions related to the page
shower_thoughts_list = [shower_thoughts[i]['data']['title'] + ' ' +
                        shower_thoughts[i]['data']['selftext'] 
                        for i in range(2,len(shower_thoughts))]

st_df=pd.DataFrame(shower_thoughts_list,columns=['post'])
st_df['subredit']='shower_thoughts'
st_df.head()


Unnamed: 0,post,subredit
0,Has anyone ever noticed that the Hacker in eve...,shower_thoughts
1,We’re as far from the start of the ‘70s as we ...,shower_thoughts
2,When Baby Yoda learns to speak he probably wil...,shower_thoughts
3,Dogs being clever surprises us because they se...,shower_thoughts
4,"If you expect a surprise, and it doesn't happe...",shower_thoughts


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import stop_words
from nltk.corpus import stopwords

In [8]:
my_stopwords=stopwords.words('english')

In [10]:
my_stopwords.extend(['amp','x200b','\n'])
my_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
st_df['subredit'].replace({'shower_thoughts':0,'deep_philosphy':1},inplace=True)
X=st_df['post']
y=st_df['subredit']
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,test_size=0.2)
tfidvec=TfidfVectorizer(stop_words=my_stopwords)
tfidvec.fit(X_train)
X_train_tfidf = tfidvec.transform(X_train)
X_test_tfidf = tfidvec.transform(X_test)

In [22]:
from sklearn.metrics import confusion_matrix

def conf_matrix(model, X_test):
    y_pred = model.predict(X_test)           
    cm = confusion_matrix(y_test, y_pred)     
    tn, fp, fn, tp = cm.ravel()               
    print(f"True Negatives: {tn}")            
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")            
    return pd.DataFrame(cm, 
                        columns = ['Pred Deep Philosophy','Pred Shower Thoughts'], 
                        index = ['Act Deep Philosphy', 'Act Shower Thoughts'])

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [64]:
steps=[
    ("vectorizer",TfidfVectorizer(stop_words=my_stopwords)),
    ("rf",RandomForestClassifier())
]
pipe=Pipeline(steps)
params={
    "vectorizer__max_features": [2000, 3000, 4000],
    "vectorizer__ngram_range":[(1,1), (1,2)],
    "rf__n_estimators": [2500, 3000, 3500],
    "rf__max_depth": [17, 18, 19, 20],
    "rf__min_samples_leaf": [1, 2, 3]
}

gs=GridSearchCV(pipe,param_grid=params,verbose=1,n_jobs=2)

In [82]:
gs

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'vectorizer__max_features': [2000, 3000, 4000], 'vectorizer__ngram_range': [(1, 1), (1, 2)], 'rf__n_estimators': [2500, 3000, 3500], 'rf__max_depth': [17, 18, 19, 20], 'rf__min_samples_leaf': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [76]:
transformer = TfidfVectorizer(lowercase=True, stop_words=my_stopwords)
X = transformer.fit_transform(st_df.post)
le = LabelEncoder()
y = le.fit_transform(st_df.subredit)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

print(le.inverse_transform(model.predict(X_test)))
#plt.figure()
importances = model.feature_importances_
s=pd.Series(importances)
#s.nlargest(10).plot(kind='barh')

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


