# Final Model using New Data

### Note: This notebook only pulls new data for testing the model if it is run several days after the Cleaning Notebook. Otherwise it pulls the same posts again.

In [1]:
import requests
import pandas as pd

In [2]:
# r/TheOnion
# Request up to 200 submissions (size=200) from r/TheOnion through the
# Pushshift search endpoint and load the 'data' records into a DataFrame.

url_o = 'https://api.pushshift.io/reddit/search/submission/?&subreddit=TheOnion&size=200'
res_o = requests.get(url_o, timeout=30)  # bounded wait instead of hanging forever
res_o.raise_for_status()  # surface HTTP errors here rather than as a confusing JSON/KeyError later
payload_o = res_o.json()  # parse the response body once

# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option in newer pandas and raises an OptionError.
pd.set_option('display.max_columns', 99)
df_o = pd.DataFrame(payload_o['data'])

In [3]:
# r/nottheonion
# Request up to 200 submissions (size=200) from r/nottheonion through the
# Pushshift search endpoint and load the 'data' records into a DataFrame.

url_n = 'https://api.pushshift.io/reddit/search/submission/?&subreddit=nottheonion&size=200'
res_n = requests.get(url_n, timeout=30)  # bounded wait instead of hanging forever
res_n.raise_for_status()  # surface HTTP errors here rather than as a confusing JSON/KeyError later
payload_n = res_n.json()  # parse the response body once

# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option in newer pandas and raises an OptionError.
pd.set_option('display.max_columns', 99)
df_n = pd.DataFrame(payload_n['data'])

In [4]:
# Stack the two subreddit frames (r/nottheonion first, then r/TheOnion)
# into a single frame; column order is left unsorted.
frames = [df_n, df_o]
data_new = pd.concat(frames, sort=False)

In [5]:
# Keep only the post title (model input) and its subreddit (label);
# .copy() detaches the result from the wider concatenated frame.
data_new = data_new.loc[:, ['title', 'subreddit']].copy()

In [6]:
# Sanity check: how many rows came back for each subreddit?
data_new.loc[:, 'subreddit'].value_counts()

nottheonion    200
TheOnion       200
Name: subreddit, dtype: int64

In [7]:
# Encode the label as integers: r/TheOnion -> 1, r/nottheonion -> 0.
# A dict mapping is the documented form for paired replacements, and the
# explicit cast pins an integer dtype instead of relying on implicit
# downcasting of the object column. Re-running the cell is a no-op.
data_new['subreddit'] = (
    data_new['subreddit']
    .replace({'TheOnion': 1, 'nottheonion': 0})
    .astype(int)
)

In [8]:
# Preview the first five rows after label encoding.
data_new.head(5)

Unnamed: 0,title,subreddit
0,Young Dro Reportedly Arrested After Throwing B...,0
1,Tofurky and ACLU cook up suit against Arkansas...,0
2,Trump baits John Bolton in front of officials ...,0
3,Watermelons to Replace Piglets in California F...,0
4,Mysterious ‘untouched’ In-N-Out burger found l...,0


In [9]:
# Persist the title/label frame to disk without writing the row index.
data_new.to_csv(path_or_buf='data_new.csv', index=False)

### Run the data through the Multinomial Naive Bayes model with TF-IDF features for evaluation

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [11]:
# Dimensions of the evaluation frame as (rows, columns).
data_new.shape

(400, 2)

In [12]:
# Features are the raw post titles; the target is the encoded subreddit label.
X, y = data_new['title'], data_new['subreddit']

In [13]:
# Turn the titles into TF-IDF features (at most 4000 terms; terms appearing
# in more than 40% of titles are dropped).
# The fitted vectorizer keeps its own name so it can be inspected or reused,
# and the titles are read straight from data_new so this cell gives the same
# result when re-run (the original rebound X from its own previous value).
tfidf = TfidfVectorizer(max_features=4000, max_df=0.4)
X = tfidf.fit_transform(data_new['title'])

In [14]:
# Disabled alternative: CountVectorizer features in place of the TF-IDF features above.
# X = CountVectorizer(max_features=3000, max_df=0.3, ).fit_transform(X)

In [15]:
# Convert the sparse TF-IDF matrix to a plain ndarray.
# .todense() returns np.matrix, which newer scikit-learn releases refuse as
# estimator input; .toarray() yields a regular ndarray instead.
# The hasattr guard makes the cell safe to re-run once X is already dense.
if hasattr(X, 'toarray'):
    X = X.toarray()

In [16]:
# With the text vectorized, fit a Multinomial Naive Bayes classifier
# (alpha=1, fit_prior=False) and predict labels for the same rows.
# NOTE(review): the model is fitted on X/y and then predicts on that same X,
# so the metrics computed from `predictions` are training-set metrics, not a
# held-out evaluation of a previously trained model.
mNB = MultinomialNB(alpha=1, fit_prior=False).fit(X, y)
predictions = mNB.predict(X)

In [17]:
# Derive specificity, sensitivity, precision and accuracy from the binary
# confusion matrix (class 0 = r/nottheonion, class 1 = r/TheOnion).
# The matrix is computed once; labels=[0, 1] pins it to 2x2 so the four-way
# unpack still works if one class is missing from y or predictions.
tn, fp, fn, tp = confusion_matrix(y, predictions, labels=[0, 1]).ravel()

# Specificity: share of true class-0 rows predicted as class 0.
spec = tn / (tn + fp)
print(f'Specificity: {round(spec,4)}')

# Sensitivity (recall): share of true class-1 rows predicted as class 1.
sens = tp / (tp + fn)
print(f'Sensitivty: {round(sens,4)}')

# Precision: share of class-1 predictions that are truly class 1.
precision = tp/(tp+fp)
print(f'Precision: {round(precision,4)}')

# Accuracy as reported by the fitted estimator on the same X/y.
print(f'Accuracy/Score: {mNB.score(X, y)}')

Specificity: 1.0
Sensitivty: 0.995
Precision: 1.0
Accuracy/Score: 0.9975


In [18]:
# Show the confusion matrix as a labelled table: rows are the true classes,
# columns are the predicted classes.
conmat = confusion_matrix(y, predictions)
predicted_labels = ['Predicted 0', 'Predicted 1']
true_labels = ['True 0', 'True 1']
pd.DataFrame(conmat, index=true_labels, columns=predicted_labels)

Unnamed: 0,Predicted 0,Predicted 1
True 0,200,0
True 1,1,199
