# TF - IDF Multinominal Naive Bayes Model

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix


In [2]:
data = pd.read_csv('data.csv')

In [3]:
X = data['title']
y = data['subreddit']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Using Pipeline and Gridsearch to tune the parameters for both TF IDF Vectorizer and Multinomial Bayes Classifier

In [5]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
    ])

In [6]:
grid = {
    'tfidf__max_features': [ 3000, 4000],
    'tfidf__min_df': [1, 2],
    'tfidf__max_df': [0.3, 0.4, 0.5],  
    'tfidf__ngram_range': [(1,1)],
    'mnb__alpha': [1, 2],
    'mnb__fit_prior': [False]
}

In [7]:
gs = GridSearchCV(pipe, param_grid = grid, verbose=1, cv= 5, n_jobs = 1 )
gs.fit(X_train, y_train)

print(f'Best Parmas: {gs.best_params_}')
print(f'Best Score: {gs.best_score_}')

print(f'Train Score: {gs.score(X_train, y_train)}')
print(f'Test Score: {gs.score(X_test, y_test)}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Parmas: {'mnb__alpha': 1, 'mnb__fit_prior': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 4000, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}
Best Score: 0.838
Train Score: 0.968
Test Score: 0.836


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    5.1s finished


### Use the best parameters from Gridsearch to fit both the Vectorizer and the Classifier

In [8]:
tfid = TfidfVectorizer(gs.best_params_)
X_train = tfid.fit_transform(X_train,y_train)
X_test = tfid.transform(X_test)

In [9]:
mNB = MultinomialNB(alpha = 1, fit_prior = False)

mNB.fit(X_train, y_train)
predictions = mNB.predict(X_test)

In [10]:
mNB.coef_

array([[-7.89363998, -8.04647646, -8.46269378, ..., -8.95826728,
        -8.95826728, -8.95826728]])

### Score the model

In [11]:
confusion_matrix(y_test, predictions).ravel()
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

spec = tn / (tn + fp)
print(f'Specificity: {round(spec,4)}')

sens = tp / (tp + fn)
print(f'Sensitivty: {round(sens,4)}')

precision = tp/(tp+fp)
print(f'Precision: {round(precision,4)}')

print(f'Accuracy/Score: {mNB.score(X_test, y_test)}')

Specificity: 0.8171
Sensitivty: 0.8519
Precision: 0.815
Accuracy/Score: 0.834


In [12]:
conmat = confusion_matrix(y_test, predictions)
confusion_matrix(y_test, predictions)

array([[210,  47],
       [ 36, 207]])

In [13]:
conmat = confusion_matrix(y_test, predictions)
pd.DataFrame(conmat, columns=['Predicted RealNews', 'Predicted Satire'], index=['True News', \
                                                                                'True Satire'])

Unnamed: 0,Predicted RealNews,Predicted Satire
True News,210,47
True Satire,36,207
