## Import libraries

In [1]:
import pandas as pd, numpy as np

import time

from tqdm import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import datetime

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Load datasets

In [2]:
# Load the train/test feature and target tables; the first CSV column becomes the index.
# NOTE(review): presumably these files are written by the 02_Preprocessing notebook — confirm.
X_train = pd.read_csv('../tokenized_data/X_train.csv',index_col = 0)
X_test = pd.read_csv('../tokenized_data/X_test.csv',index_col = 0)
y_train = pd.read_csv('../tokenized_data/y_train.csv',index_col = 0)
y_test = pd.read_csv('../tokenized_data/y_test.csv',index_col = 0)

## GridSearchCV

1. Make use of `GridSearchCV` to tune vectorizer parameters for each of the following 6 classifiers firstly using `CountVectorizer`:
 - Multinomial Naive Bayes
 - K-Nearest Neighbors
 - Logistic Regression
 - Random Forest
 - AdaBoost (adaptive boost)
 - Gradient Boost
 
 
2. Repeat the GridSearches with varying tokenized input from notebook **02_Preprocessing**, namely, original tokens, stemmed tokens and lemmatized tokens

3. Repeat the GridSearches on best-performing version of tokens for `TfidfVectorizer`. 

4. Fine tune the best classifier coupled with one of the better feature extraction techniques (Vectorizers) using `GridSearchCV`, again

In [3]:
def gridsearch_results(X_train, X_test, y_train, y_test, steps_list, steps_titles, pipe_params):
    """Grid-search each pipeline in ``steps_list`` and tabulate its scores.

    Parameters
    ----------
    X_train, X_test : inputs handed unchanged to every pipeline.
    y_train, y_test : binary targets; a Series, 1-D array or single-column
        DataFrame (flattened to 1-D before use).
    steps_list : list of pipeline step lists, one per model.
    steps_titles : labels for the models, index-aligned with ``steps_list``.
    pipe_params : parameter grid shared by every search.

    Returns
    -------
    pandas.DataFrame with one row per pipeline: label, best parameters,
    train/test score of the refitted best estimator, the four confusion-matrix
    cells on the test set, and recall, specificity and precision.

    Also prints the label and best parameters of each search as it finishes.
    """
    result_columns = ['model','best_params','train_accuracy','test_accuracy',
                      'tn','fp','fn','tp',
                      'sensitivity/recall','specificity','precision']

    # Flatten the targets once so estimators receive a 1-D y instead of a column vector.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []
    for i, steps in enumerate(tqdm(steps_list)):    # progress bar, one tick per model
        # 5-fold search over the shared grid; the classifier itself keeps its defaults
        grid = GridSearchCV(Pipeline(steps=steps), pipe_params, cv=5, n_jobs=-1)
        grid.fit(X_train, y_train)

        # Pin the label order so the matrix is always 2x2 for a binary target,
        # even if one class is missing from the test labels and predictions.
        tn, fp, fn, tp = confusion_matrix(
            y_test, grid.predict(X_test), labels=grid.classes_
        ).ravel()

        records.append({
            'model': steps_titles[i],
            'best_params': grid.best_params_,
            'train_accuracy': grid.score(X_train, y_train),
            'test_accuracy': grid.score(X_test, y_test),
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'sensitivity/recall': tp / (tp + fn),
            'specificity': tn / (tn + fp),
            'precision': tp / (tp + fp),
        })

        print('Model: ',steps_titles[i])
        print('Best Params: ', grid.best_params_)

    return pd.DataFrame(records, columns=result_columns)

## CountVectorizer

In [7]:
# One [vectorizer, classifier] recipe per model; every recipe gets its own CountVectorizer.
# The randomized ensembles are seeded so repeated runs produce the same scores.
steps_list_gr_cv = [
    [('cv',CountVectorizer()),('multi_nb',MultinomialNB())],
    [('cv',CountVectorizer()),('knn',KNeighborsClassifier())], 
    [('cv',CountVectorizer()),('logreg',LogisticRegression())],
    [('cv',CountVectorizer()),('rf',RandomForestClassifier(random_state=42))],
    [('cv',CountVectorizer()),('ada',AdaBoostClassifier(random_state=42))],
    [('cv',CountVectorizer()),('gb',GradientBoostingClassifier(random_state=42))]
]

In [8]:
# Display labels for the six pipelines, index-aligned with the step lists.
steps_titles_list = ['multi_nb','knn','logreg','rf','ada','gb']

In [38]:
# Vectorizer grid shared by all CountVectorizer pipelines:
# English stop words, unigrams vs. unigrams+bigrams, two max_df cut-offs.
pipe_params_cv = dict(
    cv__stop_words=['english'],
    cv__ngram_range=[(1, 1), (1, 2)],
    cv__max_df=[1.0, 0.90],
)

### Use original tokens (without stemming or lemmatizing)

In [10]:
# 'post' is the token column without stemming or lemmatizing
X_train_raw, X_test_raw = X_train['post'], X_test['post']

In [39]:
# CountVectorizer grid search on the original tokens
grid_results_cv = gridsearch_results(
    X_train=X_train_raw,
    X_test=X_test_raw,
    y_train=y_train,
    y_test=y_test,
    steps_list=steps_list_gr_cv,
    steps_titles=steps_titles_list,
    pipe_params=pipe_params_cv,
)

 17%|█▋        | 1/6 [00:09<00:49,  9.83s/it]

Model:  multi_nb
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 33%|███▎      | 2/6 [00:21<00:41, 10.30s/it]

Model:  knn
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 50%|█████     | 3/6 [00:30<00:30, 10.12s/it]

Model:  logreg
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


 67%|██████▋   | 4/6 [00:41<00:20, 10.17s/it]

Model:  rf
Best Params:  {'cv__max_df': 0.9, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 83%|████████▎ | 5/6 [01:03<00:13, 13.67s/it]

Model:  ada
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


100%|██████████| 6/6 [02:03<00:00, 27.75s/it]

Model:  gb
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}





In [40]:
# Rank the classifiers by test accuracy, best first.
grid_results_cv.sort_values('test_accuracy',ascending=False)

Unnamed: 0,model,best_params,train_accuracy,test_accuracy,tn,fp,fn,tp,sensitivity/recall,specificity,precision
0,multi_nb,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.969,0.929326,284,54,33,860,0.963046,0.840237,0.940919
2,logreg,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.999652,0.911454,243,95,14,879,0.984323,0.718935,0.902464
3,rf,"{'cv__max_df': 0.9, 'cv__ngram_range': (1, 1),...",0.996865,0.904955,260,78,39,854,0.956327,0.769231,0.916309
5,gb,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.931034,0.904955,242,96,21,872,0.976484,0.715976,0.900826
4,ada,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.92581,0.896832,239,99,28,865,0.968645,0.707101,0.897303
1,knn,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.89899,0.848091,179,159,28,865,0.968645,0.529586,0.844727


Multinomial Naive Bayes looks quite good already with original tokens. Let's re-run the classifiers on the other two versions of tokens and see if stemming/lemmatizing is going to help at all. 

### Use stemming tokens

In [41]:
# 'post_st' is the stemmed token column
X_train_st, X_test_st = X_train['post_st'], X_test['post_st']

## `Please skip this chunk of gridsearch to save time if you are testing the code`

In [42]:
# CountVectorizer grid search on the stemmed tokens.
# NOTE: this rebinds grid_results_cv, replacing the table from the original-token run.
grid_results_cv = gridsearch_results(
    X_train=X_train_st,
    X_test=X_test_st,
    y_train=y_train,
    y_test=y_test,
    steps_list=steps_list_gr_cv,
    steps_titles=steps_titles_list,
    pipe_params=pipe_params_cv,
)

 17%|█▋        | 1/6 [00:07<00:39,  7.95s/it]

Model:  multi_nb
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 33%|███▎      | 2/6 [00:22<00:39,  9.80s/it]

Model:  knn
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 50%|█████     | 3/6 [00:31<00:29,  9.80s/it]

Model:  logreg
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


 67%|██████▋   | 4/6 [00:41<00:19,  9.73s/it]

Model:  rf
Best Params:  {'cv__max_df': 0.9, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 83%|████████▎ | 5/6 [01:00<00:12, 12.53s/it]

Model:  ada
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


100%|██████████| 6/6 [02:07<00:00, 28.81s/it]

Model:  gb
Best Params:  {'cv__max_df': 0.9, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}





In [43]:
# Rank the stemmed-token results by test accuracy, best first.
grid_results_cv.sort_values('test_accuracy',ascending=False)

Unnamed: 0,model,best_params,train_accuracy,test_accuracy,tn,fp,fn,tp,sensitivity/recall,specificity,precision
0,multi_nb,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.963079,0.929326,288,50,37,856,0.958567,0.852071,0.944812
3,rf,"{'cv__max_df': 0.9, 'cv__ngram_range': (1, 1),...",0.995472,0.913891,271,67,39,854,0.956327,0.801775,0.927253
2,logreg,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.999303,0.909017,245,93,19,874,0.978723,0.724852,0.903826
4,ada,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.930338,0.907392,247,91,23,870,0.974244,0.730769,0.905307
5,gb,"{'cv__max_df': 0.9, 'cv__ngram_range': (1, 1),...",0.935911,0.905768,244,94,22,871,0.975364,0.721893,0.902591
1,knn,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.895507,0.839968,187,151,46,847,0.948488,0.553254,0.848697


Looks like stemming is not really helping! This matches our initial speculation that for a novel series like Harry Potter / Fantastic Beasts, which involves a lot of words coined by the author, stemming/lemmatizing may not be useful.

### Use lemmatizing tokens

In [44]:
# 'post_lm' is the lemmatized token column
X_train_lm, X_test_lm = X_train['post_lm'], X_test['post_lm']

## `Please skip this chunk of gridsearch to save time if you are testing the code`

In [45]:
# CountVectorizer grid search on the lemmatized tokens.
# NOTE: this rebinds grid_results_cv, replacing the table from the previous run.
grid_results_cv = gridsearch_results(
    X_train=X_train_lm,
    X_test=X_test_lm,
    y_train=y_train,
    y_test=y_test,
    steps_list=steps_list_gr_cv,
    steps_titles=steps_titles_list,
    pipe_params=pipe_params_cv,
)

 17%|█▋        | 1/6 [00:07<00:37,  7.56s/it]

Model:  multi_nb
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 33%|███▎      | 2/6 [00:20<00:36,  9.08s/it]

Model:  knn
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 50%|█████     | 3/6 [00:30<00:28,  9.38s/it]

Model:  logreg
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


 67%|██████▋   | 4/6 [00:39<00:18,  9.43s/it]

Model:  rf
Best Params:  {'cv__max_df': 0.9, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}


 83%|████████▎ | 5/6 [00:58<00:12, 12.14s/it]

Model:  ada
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 2), 'cv__stop_words': 'english'}


100%|██████████| 6/6 [01:53<00:00, 25.19s/it]

Model:  gb
Best Params:  {'cv__max_df': 1.0, 'cv__ngram_range': (1, 1), 'cv__stop_words': 'english'}





In [46]:
# Rank the lemmatized-token results by test accuracy, best first.
grid_results_cv.sort_values('test_accuracy',ascending=False)

Unnamed: 0,model,best_params,train_accuracy,test_accuracy,tn,fp,fn,tp,sensitivity/recall,specificity,precision
0,multi_nb,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.963079,0.929326,288,50,37,856,0.958567,0.852071,0.944812
3,rf,"{'cv__max_df': 0.9, 'cv__ngram_range': (1, 1),...",0.997214,0.910642,269,69,41,852,0.954087,0.795858,0.925081
2,logreg,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.999303,0.909017,245,93,19,874,0.978723,0.724852,0.903826
4,ada,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 2),...",0.930338,0.907392,247,91,23,870,0.974244,0.730769,0.905307
5,gb,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.937304,0.901706,242,96,25,868,0.972004,0.715976,0.900415
1,knn,"{'cv__max_df': 1.0, 'cv__ngram_range': (1, 1),...",0.895507,0.839968,187,151,46,847,0.948488,0.553254,0.848697


The results are not very different from the results from stemming tokens. Therefore I will just stick to the original tokens.

## TF-IDF Vectorizer

In [47]:
# List of pipeline steps for each model combo; every recipe gets its own TfidfVectorizer.
# The randomized ensembles are seeded so repeated runs produce the same scores.
steps_list_gr_tf = [
    [('tf',TfidfVectorizer()),('multi_nb',MultinomialNB())],
    [('tf',TfidfVectorizer()),('knn',KNeighborsClassifier())], 
    [('tf',TfidfVectorizer()),('logreg',LogisticRegression())],
    [('tf',TfidfVectorizer()),('rf',RandomForestClassifier(random_state=42))],
    [('tf',TfidfVectorizer()),('ada',AdaBoostClassifier(random_state=42))],
    [('tf',TfidfVectorizer()),('gb',GradientBoostingClassifier(random_state=42))]
]

In [48]:
# Same six labels as used for the CountVectorizer runs, index-aligned with steps_list_gr_tf.
steps_titles_list = ['multi_nb','knn','logreg','rf','ada','gb']

In [49]:
# Vectorizer grid shared by all TfidfVectorizer pipelines:
# English stop words, unigrams vs. unigrams+bigrams, two max_df cut-offs.
pipe_params_tf = dict(
    tf__stop_words=['english'],
    tf__ngram_range=[(1, 1), (1, 2)],
    tf__max_df=[1.0, 0.90],
)

## `Please skip this chunk of gridsearch to save time if you are testing the code`

In [52]:
# TF-IDF grid search on the original tokens
grid_results_tf = gridsearch_results(
    X_train=X_train_raw,
    X_test=X_test_raw,
    y_train=y_train,
    y_test=y_test,
    steps_list=steps_list_gr_tf,
    steps_titles=steps_titles_list,
    pipe_params=pipe_params_tf,
)

 17%|█▋        | 1/6 [00:08<00:40,  8.08s/it]

Model:  multi_nb
Best Params:  {'tf__max_df': 1.0, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}


 33%|███▎      | 2/6 [00:20<00:37,  9.32s/it]

Model:  knn
Best Params:  {'tf__max_df': 1.0, 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}


 50%|█████     | 3/6 [00:28<00:26,  8.94s/it]

Model:  logreg
Best Params:  {'tf__max_df': 1.0, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}


 67%|██████▋   | 4/6 [00:37<00:18,  9.15s/it]

Model:  rf
Best Params:  {'tf__max_df': 0.9, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}


 83%|████████▎ | 5/6 [00:57<00:12, 12.21s/it]

Model:  ada
Best Params:  {'tf__max_df': 1.0, 'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}


100%|██████████| 6/6 [02:19<00:00, 33.17s/it]

Model:  gb
Best Params:  {'tf__max_df': 0.9, 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}





In [53]:
# Rank the TF-IDF results by test accuracy, best first.
grid_results_tf.sort_values('test_accuracy',ascending=False)

Unnamed: 0,model,best_params,train_accuracy,test_accuracy,tn,fp,fn,tp,sensitivity/recall,specificity,precision
3,rf,"{'tf__max_df': 0.9, 'tf__ngram_range': (1, 1),...",0.99791,0.913891,258,80,26,867,0.970885,0.763314,0.915523
4,ada,"{'tf__max_df': 1.0, 'tf__ngram_range': (1, 1),...",0.939394,0.913891,256,82,24,869,0.973124,0.757396,0.913775
5,gb,"{'tf__max_df': 0.9, 'tf__ngram_range': (1, 2),...",0.945664,0.913079,254,84,23,870,0.974244,0.751479,0.91195
1,knn,"{'tf__max_df': 1.0, 'tf__ngram_range': (1, 2),...",0.919192,0.887084,241,97,42,851,0.952968,0.713018,0.897679
2,logreg,"{'tf__max_df': 1.0, 'tf__ngram_range': (1, 1),...",0.9279,0.885459,211,127,14,879,0.984323,0.62426,0.873757
0,multi_nb,"{'tf__max_df': 1.0, 'tf__ngram_range': (1, 1),...",0.869732,0.827782,128,210,2,891,0.99776,0.378698,0.809264


The ranking of the classifiers differs from the CountVectorizer results. 

However, it doesn't really improve further on the top-performing MultinomialNB from CountVectorizer.

Let's stick to CountVectorizer + MultinomialNB using original tokens and fine-tune this model.

## Optimize the model

1. Check where the predictions go wrong
2. Check if imbalance needs to be addressed
3. Tune classifier parameters and see if it can be further improved

In [54]:
# Rebuild the winning CountVectorizer + MultinomialNB combination by hand
# so the gap between its predictions and the actual classes can be inspected.
cvt = CountVectorizer(ngram_range=(1, 1), lowercase=True, stop_words='english')
pipeline = Pipeline(steps=[('vect', cvt), ('cls', MultinomialNB())])
pipeline.fit(X_train_raw, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [55]:
# Reproducibility check: these scores should match the GridSearchCV results
model_results = {
    'train_accuracy': pipeline.score(X_train_raw, y_train),
    'test_accuracy': pipeline.score(X_test_raw, y_test),
}
model_results

{'train_accuracy': 0.9690003483106931, 'test_accuracy': 0.9293257514216084}

In [56]:
# Show the fitted steps by name ('vect' and 'cls').
pipeline.named_steps

{'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'cls': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)}

In [57]:
# Pull the fitted classifier and vectorizer out of the pipeline
mnb, cv = pipeline.named_steps['cls'], pipeline.named_steps['vect']

In [58]:
# Function from https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers

def important_features(vectorizer,classifier,n=30):
    """Print the ``n`` most frequent vocabulary terms for each of the two classes.

    Terms are ranked by the classifier's per-class ``feature_count_`` (raw
    token counts seen during fitting), highest first; ties fall back to
    reverse alphabetical order of the term.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
        (or the legacy ``get_feature_names()``).
    classifier : fitted classifier exposing ``classes_`` and ``feature_count_``.
    n : number of terms to print per class.

    Returns
    -------
    None; the ranking is printed.
    """
    class_labels = classifier.classes_
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and only fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.feature_count_[0], feature_names),reverse=True)[:n]
    topn_class2 = sorted(zip(classifier.feature_count_[1], feature_names),reverse=True)[:n]
    print("Important words for Fantastic Beasts subreddit\n")
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print("-----------------------------------------\n")
    print("Important words for Harry Potter subreddit\n")
    for coef, feat in topn_class2:
        print(class_labels[1], coef, feat)

In [59]:
# Print the 30 highest-count terms per class for the fitted baseline model.
important_features(cv, mnb, 30)

Important words for Fantastic Beasts subreddit

0 1037.0 grindelwald
0 716.0 credence
0 698.0 dumbledore
0 579.0 movie
0 567.0 newt
0 554.0 just
0 521.0 think
0 502.0 know
0 490.0 like
0 315.0 leta
0 297.0 beasts
0 286.0 fantastic
0 282.0 time
0 276.0 maybe
0 272.0 did
0 266.0 albus
0 238.0 don
0 231.0 theory
0 228.0 really
0 223.0 love
0 214.0 way
0 212.0 people
0 211.0 crimes
0 209.0 queenie
0 205.0 family
0 198.0 harry
0 198.0 does
0 194.0 story
0 189.0 tina
0 187.0 didn
-----------------------------------------

Important words for Harry Potter subreddit

1 2216.0 harry
1 1095.0 just
1 1027.0 like
1 915.0 potter
1 660.0 know
1 654.0 think
1 543.0 time
1 524.0 voldemort
1 513.0 books
1 479.0 ve
1 474.0 https
1 468.0 hogwarts
1 468.0 dumbledore
1 459.0 book
1 445.0 did
1 441.0 don
1 434.0 people
1 415.0 com
1 397.0 really
1 390.0 house
1 362.0 magic
1 343.0 hufflepuff
1 336.0 read
1 332.0 world
1 328.0 ravenclaw
1 322.0 does
1 311.0 slytherin
1 310.0 hermione
1 309.0 didn
1 305.0 lov

In [60]:
# Stack train rows followed by test rows; ignore_index=True renumbers them 0..n-1,
# so index labels and row positions coincide in both frames.
X = pd.concat([X_train, X_test],ignore_index=True)
y = pd.concat([y_train, y_test],ignore_index=True)

In [133]:
# Preview the combined target frame.
y.head()

Unnamed: 0,is_hp
0,1
1,0
2,0
3,1
4,1


In [138]:
# Score every post with the fitted baseline `pipeline`. The tuned `model` is only
# created in the later fine-tuning section, so it does not exist yet on a fresh
# top-to-bottom run.
predicted = pipeline.predict(X['post'])

In [139]:
# Rows whose predicted class disagrees with the actual label
incorrect_preds = X.loc[y['is_hp'] != predicted]

In [140]:
# Number of misclassified posts across train and test combined.
incorrect_preds.shape[0]

176

In [141]:
# Side-by-side view of actual label, predicted label and the original post text
incorrect_df = pd.DataFrame({
    'actual': y.loc[incorrect_preds.index, 'is_hp'],
    'predicted': predicted[incorrect_preds.index],
    'text': incorrect_preds['all_text'],
})

In [142]:
# First few misclassified posts.
incorrect_df.head()

Unnamed: 0,actual,predicted,text
18,1,0,Opinions On Fantastic Beasts And Where To Find...
30,0,1,Looking for quotes for a birthday card A frie...
82,0,1,"Muggles, no maj, and “can’t spells” What did e..."
122,1,0,Hogwarts Mystery: Jacob's bedroom (might conta...
186,0,1,[NO SPOILERS] Snow storm ruined my night and w...


In [143]:
# Last few misclassified posts.
incorrect_df.tail()

Unnamed: 0,actual,predicted,text
4038,1,0,Is jk rowling Alive? She has completely disapp...
4056,0,1,Why In The World Are Wizards Afraid On Muggles...
4067,1,0,Maledictus Question Is there a set age for eve...
4096,0,1,Please share your theories about Grindelwald's...
4100,0,1,My cinema is having a screening tonight! Alrea...


In [144]:
# Save the misclassified posts (index included) for manual review.
incorrect_df.to_csv('../data/incorrect_preds.csv')

After checking a few of the posts, I think the discrepancies between the predictions and the actual subreddits make sense.
There is inherently some overlap between the Harry Potter series and the Fantastic Beasts series. To me, some of the 'incorrect predictions' actually did a better job than a human would in classifying the posts.

### So, does the imbalance of classes in the dataset matter?

I am just going to use simple oversampling on FantasticBeasts dataset to make it match up with HarryPotter sample size.

In [13]:
from imblearn.over_sampling import RandomOverSampler

# Seeded random oversampling of the training split only; the test split is left untouched.
# NOTE(review): the container type returned by fit_resample (ndarray vs. pandas) looks
# version-dependent — the cells below re-wrap the results as DataFrames; confirm.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [14]:
from collections import Counter
# Flatten before counting: if fit_resample hands back a pandas DataFrame, iterating it
# yields column names, so Counter would count 'is_hp' once instead of the class labels.
print(sorted(Counter(np.ravel(y_train_resampled)).items()))

[(0, 2082), (1, 2082)]


In [16]:
# Re-wrap the resampled features as a DataFrame with the four named text columns.
# NOTE(review): presumably needed because fit_resample returned a plain array here — confirm.
X_train_resampled = pd.DataFrame(X_train_resampled, columns=['post','post_st','post_lm','all_text']) 

In [17]:
# Preview the resampled training features.
X_train_resampled.head()

Unnamed: 0,post,post_st,post_lm,all_text
0,they should make a movie about voldemort that ...,they should make a movi about voldemort that e...,they should make a movi about voldemort that e...,They should make a movie about Voldemort that ...
1,beauty and the beast get paid homage by j k ro...,beauti and the beast get paid homag by j k row...,beauti and the beast get paid homag by j k row...,Beauty and the Beast get paid Homage by J K Ro...
2,spoilers now that we know that nagini was in f...,spoiler now that we know that nagini wa in fac...,spoiler now that we know that nagini wa in fac...,[spoilers] Now that we know that nagini was in...
3,merchandise monday welcome to merchandise mond...,merchandis monday welcom to merchandis monday ...,merchandis monday welcom to merchandis monday ...,Merchandise Monday! Welcome to Merchandise Mon...
4,how did witches save themselves when burning a...,how did witch save themselv when burn at the s...,how did witch save themselv when burn at the s...,How did witches save themselves when burning a...


In [18]:
# Re-wrap the resampled target as a single-column DataFrame named 'is_hp'.
y_train_resampled = pd.DataFrame(y_train_resampled, columns=['is_hp']) 

In [19]:
# Original-token column of the oversampled training set; the test column is unchanged
X_train_resampled_raw, X_test_raw = X_train_resampled['post'], X_test['post']

In [20]:
from sklearn.base import clone

# Pass the resampled training data to the model.
# clone(cvt) yields an unfitted vectorizer with the same settings; reusing the `cvt`
# object itself would refit, in place, the vectorizer that `pipeline` still holds.
pipeline_resampled = Pipeline([
    ('vect', clone(cvt)),
    ('cls', MultinomialNB())
]) 
pipeline_resampled.fit(X_train_resampled_raw, y_train_resampled)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
# Accuracy of the oversampled model on its own training data and on the untouched test set
model_results = {
    'train_accuracy': pipeline_resampled.score(X_train_resampled_raw, y_train_resampled),
    'test_accuracy': pipeline_resampled.score(X_test_raw, y_test),
}

In [22]:
# Show the train/test accuracy of the oversampled model.
model_results

{'train_accuracy': 0.9731027857829011, 'test_accuracy': 0.9268887083671812}

Oversampling helped a little bit on the accuracy of the training dataset (from 0.969 to 0.973), but it doesn't improve the accuracy on the test set (from 0.929 to 0.927)... the original model is probably already doing well enough, since the impact of the imbalance is minimal.

### GridSearchCV for fine tuning MultinomialNB base model

In [61]:
# Smoothing values to try for MultinomialNB: 0.05 upward in steps of 0.05, stopping short of 2
mnb_params = dict(mnb__alpha=np.arange(.05, 2, .05))

# Same vectorizer settings as the baseline, followed by the classifier being tuned
mnb_steps = [('cv', cvt), ('mnb', MultinomialNB())]

pipe = Pipeline(steps=mnb_steps)
pipe.get_params()

{'memory': None,
 'steps': [('cv',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words='english',
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'cv': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'mnb': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'cv__analyzer': 'word',
 'cv__b

In [62]:
# 5-fold grid search over the MultinomialNB smoothing parameter
model = GridSearchCV(estimator=pipe, param_grid=mnb_params, cv=5)
model.fit(X_train_raw, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=N...nizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'mnb__alpha': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  , 1.05, 1.1 ,
       1.15, 1.2 , 1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 , 1.55, 1.6 , 1.65,
       1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [63]:
# Confusion-matrix cells of the tuned model on the test set
tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test_raw)).ravel()

# Summary of the tuned model: chosen parameters, accuracies and derived rates
mnb_results = {
    'best_params': model.best_params_,
    'train_accuracy': model.score(X_train_raw, y_train),
    'test_accuracy': model.score(X_test_raw, y_test),
    'tn': tn,
    'fp': fp,
    'fn': fn,
    'tp': tp,
    'sensitivity/recall': tp / (tp + fn),
    'specificity': tn / (tn + fp),
    'precision': tp / (tp + fp),
}

mnb_results

{'best_params': {'mnb__alpha': 0.9500000000000001},
 'train_accuracy': 0.9693486590038314,
 'test_accuracy': 0.9301380991064175,
 'tn': 285,
 'fp': 53,
 'fn': 33,
 'tp': 860,
 'sensitivity/recall': 0.9630459126539753,
 'specificity': 0.8431952662721893,
 'precision': 0.9419496166484118}

Accuracy improved a little bit from 0.929 to 0.930 with alpha changed from 1 to 0.95

## Get a sense of feature importance - what are the keywords in each subreddit?

In [64]:
# Refit with the smoothing value chosen by the grid search (`model`) rather than a
# hand-copied constant, so this cell stays in sync if the search result changes.
pipeline = Pipeline([
    ('vect', cvt),
    ('cls', MultinomialNB(alpha=model.best_params_['mnb__alpha']))
]) 
pipeline.fit(X_train_raw, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=0.95, class_prior=None, fit_prior=True))])

In [65]:
# Show the refitted steps by name ('vect' and 'cls').
pipeline.named_steps

{'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'cls': MultinomialNB(alpha=0.95, class_prior=None, fit_prior=True)}

In [66]:
# Pull the refitted classifier and vectorizer out of the pipeline
mnb, cv = pipeline.named_steps['cls'], pipeline.named_steps['vect']

In [67]:
# Function from https://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers

def important_features(vectorizer,classifier,n=30):
    """Print the ``n`` most frequent vocabulary terms for each of the two classes.

    Terms are ranked by the classifier's per-class ``feature_count_`` (raw
    token counts seen during fitting), highest first; ties fall back to
    reverse alphabetical order of the term.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()``
        (or the legacy ``get_feature_names()``).
    classifier : fitted classifier exposing ``classes_`` and ``feature_count_``.
    n : number of terms to print per class.

    Returns
    -------
    None; the ranking is printed.
    """
    class_labels = classifier.classes_
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and only fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.feature_count_[0], feature_names),reverse=True)[:n]
    topn_class2 = sorted(zip(classifier.feature_count_[1], feature_names),reverse=True)[:n]
    print("Important words for Fantastic Beasts subreddit\n")
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print("-----------------------------------------\n")
    print("Important words for Harry Potter subreddit\n")
    for coef, feat in topn_class2:
        print(class_labels[1], coef, feat)

In [68]:
# Print the 30 highest-count terms per class for the refitted model.
important_features(cv, mnb, 30)

Important words for Fantastic Beasts subreddit

0 1037.0 grindelwald
0 716.0 credence
0 698.0 dumbledore
0 579.0 movie
0 567.0 newt
0 554.0 just
0 521.0 think
0 502.0 know
0 490.0 like
0 315.0 leta
0 297.0 beasts
0 286.0 fantastic
0 282.0 time
0 276.0 maybe
0 272.0 did
0 266.0 albus
0 238.0 don
0 231.0 theory
0 228.0 really
0 223.0 love
0 214.0 way
0 212.0 people
0 211.0 crimes
0 209.0 queenie
0 205.0 family
0 198.0 harry
0 198.0 does
0 194.0 story
0 189.0 tina
0 187.0 didn
-----------------------------------------

Important words for Harry Potter subreddit

1 2216.0 harry
1 1095.0 just
1 1027.0 like
1 915.0 potter
1 660.0 know
1 654.0 think
1 543.0 time
1 524.0 voldemort
1 513.0 books
1 479.0 ve
1 474.0 https
1 468.0 hogwarts
1 468.0 dumbledore
1 459.0 book
1 445.0 did
1 441.0 don
1 434.0 people
1 415.0 com
1 397.0 really
1 390.0 house
1 362.0 magic
1 343.0 hufflepuff
1 336.0 read
1 332.0 world
1 328.0 ravenclaw
1 322.0 does
1 311.0 slytherin
1 310.0 hermione
1 309.0 didn
1 305.0 lov

It makes sense that the main characters / schools are the keywords that differentiate the subreddits.

## Conclusion

With a few iterations, the model that we found most accurate is the MultinomialNB on CountVectorizer with basic tokenizing (no stemming or lemmatizing).

Steps are summarized in the README.md

## Next steps

Use VotingClassifier in Ensemble to further optimize the model.

The models could be further evaluated / optimized with new posts coming to the subreddits.