# CountVectorized Multinomial Naive Bayes

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix


In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Features come from the 'title' column, labels from the 'subreddit' column.
X, y = data['title'], data['subreddit']

# Hold out a test split using train_test_split's default proportions; the
# fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Using Pipeline and GridSearchCV to tune the parameters for both the CountVectorizer and the Multinomial Naive Bayes classifier

In [4]:
# Two-step text classifier: token counts feed a multinomial Naive Bayes model.
# The step names ('cvec', 'mnb') are the prefixes used by the grid-search
# parameter keys (e.g. 'cvec__max_df', 'mnb__alpha').
pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

In [5]:
# Hyper-parameter search space. Each key is '<pipeline step>__<parameter>' and
# each value is the list of candidates to try for that parameter.
grid = dict(
    cvec__max_features=[3000, 3500],
    cvec__min_df=[1, 2],
    # max_df acts as an automatic, frequency-based stop-word filter.
    cvec__max_df=[0.2, 0.3],
    cvec__ngram_range=[(1, 1)],
    mnb__alpha=[2],
    mnb__fit_prior=[False],
)

In [6]:
# Exhaustive 5-fold cross-validated search over every combination in `grid`,
# run sequentially (n_jobs=1) with progress messages enabled.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=5,
    n_jobs=1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Winning parameter combination and its cross-validated score.
print(f'Best Parmas: {gs.best_params_}')
print(f'Best Score: {gs.best_score_}')

# Score of the fitted search object on the training split and the held-out split.
train_score = gs.score(X_train, y_train)
print(f'Train Score: {train_score}')
test_score = gs.score(X_test, y_test)
print(f'Test Score: {test_score}')

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Parmas: {'cvec__max_df': 0.3, 'cvec__max_features': 3000, 'cvec__min_df': 1, 'cvec__ngram_range': (1, 1), 'mnb__alpha': 2, 'mnb__fit_prior': False}
Best Score: 0.8353333333333334
Train Score: 0.9346666666666666
Test Score: 0.822


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    1.6s finished


### Use the best parameters from GridSearchCV to fit both the vectorizer and the classifier

In [7]:
# Keep only the vectorizer's share of the winning grid-search parameters and
# strip the pipeline step prefix ('cvec__max_df' -> 'max_df') so they can be
# passed as keyword arguments. Passing the whole dict positionally would bind
# it to CountVectorizer's first parameter (`input`), leaving every tuned
# setting at its default (or raising TypeError on versions where the
# constructor arguments are keyword-only).
cvec_params = {
    name.split('__', 1)[1]: value
    for name, value in gs.best_params_.items()
    if name.startswith('cvec__')
}
cv = CountVectorizer(**cvec_params)

# NOTE: X_train / X_test are rebound here from raw text to document-term
# matrices, so re-run the train/test split cell before re-running this one.
X_train = cv.fit_transform(X_train)  # learn the vocabulary on the training text only
X_test = cv.transform(X_test)        # encode the test text with that same vocabulary

In [8]:
# Build the classifier from the grid-search winners instead of hand-typed
# values, so it cannot drift from what the search actually selected (the
# previous hard-coded alpha did not match the best 'mnb__alpha').
mNB = MultinomialNB(
    alpha=gs.best_params_['mnb__alpha'],
    fit_prior=gs.best_params_['mnb__fit_prior'],
)

# Fit on the vectorized training data, then predict labels for the test data.
mNB.fit(X_train, y_train)
predictions = mNB.predict(X_test)

### Score the model

In [9]:
# Compute the confusion matrix once and unpack its four cells. For a binary
# problem ravel() yields tn, fp, fn, tp (rows = true label, columns = predicted
# label). NOTE(review): the "positive" class is whichever label sklearn orders
# last — confirm that this is the intended class.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

# Specificity: share of actual negatives that were predicted negative.
spec = tn / (tn + fp)
print(f'Specificity: {round(spec,4)}')

# Sensitivity (recall): share of actual positives that were predicted positive.
sens = tp / (tp + fn)
print(f'Sensitivity: {round(sens,4)}')

# Precision: share of positive predictions that were actually positive.
precision = tp / (tp + fp)
print(f'Precision: {round(precision,4)}')

# Score reported by the classifier itself on the test split.
print(f'Accuracy/Score: {mNB.score(X_test, y_test)}')

Specificity: 0.7704
Sensitivty: 0.8765
Precision: 0.7831
Accuracy/Score: 0.822


In [10]:
# Show the confusion matrix as a labelled table (rows = true class,
# columns = predicted class).
# NOTE(review): the labels assume the news class is ordered before the satire
# class in the matrix — confirm against the actual 'subreddit' values.
conmat = confusion_matrix(y_test, predictions)
pd.DataFrame(
    data=conmat,
    index=['True News', 'True Satire'],
    columns=['Predicted RealNews', 'Predicted Satire'],
)

Unnamed: 0,Predicted RealNews,Predicted Satire
True News,198,59
True Satire,30,213
