#### Import modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# NOTE(review): these two instances are not referenced in the visible cells;
# the word-normalisation helpers below create and use their own
# `lemma_token` / `stem_token` objects — confirm nothing else needs these
# before removing them.
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

#### Load data

In [4]:
# read the combined title + body posts scraped from both subreddits
DATA_PATH = '../data/subreddit_combine_title_body.csv'
combine_df = pd.read_csv(DATA_PATH)

In [5]:
# preview the first rows to check which columns were loaded
combine_df.head()

Unnamed: 0,id,url,comms_num,created,subreddit,title_body
0,17j7oej,https://www.reddit.com/r/wine/comments/17j7oej...,743,2023-10-30 00:18:37,wine,[Megathread] How much is my wine worth? Is it ...
1,1gmbv5t,https://www.reddit.com/r/wine/comments/1gmbv5t...,16,2024-11-08 13:00:27,wine,"Free Talk Friday Bottle porn without notes, ra..."
2,1gogepp,https://i.redd.it/6gdvjahxb60e1.jpeg,21,2024-11-11 08:19:05,wine,Started Journey to Master I have great study m...
3,1goebub,https://www.reddit.com/gallery/1goebub,7,2024-11-11 06:36:49,wine,NV Pierre Peters Howdy Winos! Anyone have any ...
4,1goj0bf,https://www.reddit.com/r/wine/comments/1goj0bf...,13,2024-11-11 10:37:00,wine,"Vouvray Chenin Blanc I mostly drink reds, but ..."


#### Vectorization

In [7]:
# word-normalisation helpers: each one takes a document string and returns it
# with every whitespace-separated word reduced to a base form
lemma_token = WordNetLemmatizer()
stem_token = PorterStemmer()


def lemma_tokenizer(words):
    """Lemmatize each whitespace-separated word of `words`.

    Returns a single string with the lemmatized words joined by one space.
    """
    lemmas = map(lemma_token.lemmatize, words.split())
    return ' '.join(lemmas)


def stem_tokenizer(words):
    """Stem each whitespace-separated word of `words` with the Porter stemmer.

    Returns a single string with the stemmed words joined by one space.
    """
    stems = map(stem_token.stem, words.split())
    return ' '.join(stems)

In [8]:
# features are the combined title + body text; the target is the subreddit label
X, y = combine_df['title_body'], combine_df['subreddit']

In [9]:
# baseline: share of each class in percent (the majority share is the score to beat)
(y.value_counts(normalize=True) * 100).round(2)

subreddit
wine    53.16
beer    46.84
Name: proportion, dtype: float64

In [10]:
# lemmatize every document, then make a stratified 80 / 20 train / test split
X_lemma = X.apply(lemma_tokenizer)
(X_lemma_train, X_lemma_test,
 y_lemma_train, y_lemma_test) = train_test_split(
    X_lemma, y,
    train_size=0.8, test_size=0.2,
    stratify=y, random_state=42,
)

In [11]:
# Porter-stem every document, then make a stratified 80 / 20 train / test split
X_stem = X.apply(stem_tokenizer)
(X_stem_train, X_stem_test,
 y_stem_train, y_stem_test) = train_test_split(
    X_stem, y,
    train_size=0.8, test_size=0.2,
    stratify=y, random_state=42,
)

#### Training models

In [13]:
# defined models and parameters

# Seed for the estimators that use randomness, so that re-running the grid
# search reproduces the same scores. 42 matches the random_state already used
# for the train / test split.
RANDOM_STATE = 42

# Vectorizers: each entry holds one (step_name, estimator) pipeline step and
# the grid of `vec__*` parameters to search for it.
# min_df mixes two meanings: floats are a minimum document proportion, while
# the integer 1 is an absolute document count (i.e. keep every term).
vectors = {
    'CountVectorizer': {
        'pipeline': ('vec', CountVectorizer()),
        'grid_params': {
            'vec__stop_words': [None, 'english'],
            'vec__min_df': [0.01, 0.05, 0.1, 1],
        },
    },
    'TfidfVectorizer': {
        'pipeline': ('vec', TfidfVectorizer()),
        'grid_params': {
            'vec__stop_words': [None, 'english'],
            'vec__min_df': [0.01, 0.05, 0.1, 1],
        },
    },
}

# Classifiers: same layout as `vectors`, with the step name used as the
# prefix of the grid parameter names.
models = {
    'Naive Bayes': {
        'pipeline': ('nb', MultinomialNB()),
        'grid_params': {
            'nb__alpha': [0.001, 0.05, 0.1, 1],
        },
    },
    'Logistic Regression': {
        # liblinear shuffles the data, so it is seeded as well
        'pipeline': ('lr', LogisticRegression(random_state=RANDOM_STATE)),
        'grid_params': {
            'lr__C': [0.001, 0.05, 0.1, 1],
            'lr__solver': ['liblinear'],
        },
    },
    'RandomForest Classifier': {
        # without a seed the bootstrap samples / feature subsets differ per run
        'pipeline': ('rfc', RandomForestClassifier(random_state=RANDOM_STATE)),
        'grid_params': {
            'rfc__min_samples_split': [2, 4, 8, 10],
            'rfc__min_samples_leaf': [2, 4, 8, 10],
            'rfc__max_depth': [None, 1, 5, 10, 15],
        },
    },
}

# Train / test splits keyed by the word normalizer that produced them.
datas = {
    'WordNetLemmatizer': {'X_train': X_lemma_train, 'X_test': X_lemma_test,
                          'y_train': y_lemma_train, 'y_test': y_lemma_test},
    'PorterStemmer':     {'X_train': X_stem_train, 'X_test': X_stem_test,
                          'y_train': y_stem_train, 'y_test': y_stem_test},
}

In [14]:
# pair every vectorizer with every classifier: one two-step pipeline and one
# merged parameter grid per (vectorizer name, classifier name) key
vec_models = {
    (vec_name, cls_name): {
        'pipeline': Pipeline([vector['pipeline'], model['pipeline']]),
        'grid_params': {**vector['grid_params'], **model['grid_params']},
    }
    for vec_name, vector in vectors.items()
    for cls_name, model in models.items()
}

In [15]:
# run gridsearch
# For every (word normalizer, vectorizer, classifier) combination: fit a grid
# search on the training split, keep the fitted search in `out_models`, and
# record its CV / training / testing accuracy for the summary table.
out_models = {}
score_rows = []  # one row per fitted search; turned into `model_df` after the loop
i = 1            # 1-based run counter, also used as the row label in `model_df`
for group_name, data in datas.items():
    X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']
    for name, model in vec_models.items():
        vec_name, cls_name = name
        key = (vec_name, cls_name, group_name)
        print(f"{i}. Running gridsearch for ({vec_name}, {cls_name}, {group_name})")
        search = GridSearchCV(estimator=model['pipeline'], param_grid=model['grid_params'],
                              scoring='accuracy', n_jobs=10, cv=2)
        out_models[key] = search
        search.fit(X_train, y_train)

        # score each split once and reuse the value for both the table and the log
        train_score = search.score(X_train, y_train)
        test_score = search.score(X_test, y_test)

        # add store for model
        score_rows.append([group_name, vec_name, cls_name, search.best_score_,
                           train_score, test_score])

        # display model result
        print(f"Best parameters is : {search.best_params_}")
        print(f"Best cross validation score is : {search.best_score_:.8f}")
        print(f"Best training score is : {train_score:.8f}")
        print(f"Best testing score is : {test_score:.8f}")
        print('-' * 80)
        i += 1

# Build the score table in one go instead of enlarging an empty frame row by
# row; the index still starts at 1 so row labels match the printed run numbers.
model_df = pd.DataFrame(
    score_rows,
    columns=['Word_Normalizer', 'Vertorizer_name', 'Model_Name', 'CV_Score', 'Training_Score', 'Testing_Score'],
    index=range(1, len(score_rows) + 1),
)

1. Running gridsearch for (CountVectorizer, Naive Bayes, WordNetLemmatizer)
Best parameters is : {'nb__alpha': 1, 'vec__min_df': 1, 'vec__stop_words': 'english'}
Best cross validation score is : 0.97137212
Best training score is : 0.99439950
Best testing score is : 0.95522388
--------------------------------------------------------------------------------
2. Running gridsearch for (CountVectorizer, Logistic Regression, WordNetLemmatizer)
Best parameters is : {'lr__C': 1, 'lr__solver': 'liblinear', 'vec__min_df': 1, 'vec__stop_words': 'english'}
Best cross validation score is : 0.96328522
Best training score is : 1.00000000
Best testing score is : 0.95771144
--------------------------------------------------------------------------------
3. Running gridsearch for (CountVectorizer, RandomForest Classifier, WordNetLemmatizer)
Best parameters is : {'rfc__max_depth': None, 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 8, 'vec__min_df': 1, 'vec__stop_words': 'english'}
Best cross val

In [16]:
# training minus testing accuracy: a larger gap hints at overfitting
model_df['Score_Difference'] = model_df['Training_Score'].sub(model_df['Testing_Score'])
model_df.sort_values('CV_Score', ascending=False)

Unnamed: 0,Word_Normalizer,Vertorizer_name,Model_Name,CV_Score,Training_Score,Testing_Score,Score_Difference
7,PorterStemmer,CountVectorizer,Naive Bayes,0.973241,0.9944,0.967662,0.026738
1,WordNetLemmatizer,CountVectorizer,Naive Bayes,0.971372,0.9944,0.955224,0.039176
5,WordNetLemmatizer,TfidfVectorizer,Logistic Regression,0.966399,0.999378,0.967662,0.031716
11,PorterStemmer,TfidfVectorizer,Logistic Regression,0.965153,1.0,0.967662,0.032338
2,WordNetLemmatizer,CountVectorizer,Logistic Regression,0.963285,1.0,0.957711,0.042289
4,WordNetLemmatizer,TfidfVectorizer,Naive Bayes,0.960178,0.996889,0.950249,0.04664
10,PorterStemmer,TfidfVectorizer,Naive Bayes,0.959558,0.996889,0.950249,0.04664
6,WordNetLemmatizer,TfidfVectorizer,RandomForest Classifier,0.95955,0.995022,0.952736,0.042285
3,WordNetLemmatizer,CountVectorizer,RandomForest Classifier,0.958931,0.989421,0.947761,0.04166
8,PorterStemmer,CountVectorizer,Logistic Regression,0.957685,1.0,0.955224,0.044776


#### Analyze models performance

We use `PorterStemmer` for word normalization and `CountVectorizer` for feature extraction with `Naive Bayes` (model #7).
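- Best parameters are: {'nb__alpha': 1, 'vec__max_df': 1000, 'vec__min_df': 5, 'vec__stop_words': None} (note: `vec__max_df` and `vec__min_df = 5` are not part of the grid searched above, so these values look like they come from an earlier run; confirm against `best_params_` of the selected search)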

Model #7 has the highest CV score (0.9732), only marginally ahead of model #1 (0.9714), but its gap between training and testing scores is smaller (0.027 vs. 0.039), which suggests it overfits less than model #1.

In [19]:
# pick the best estimator found by the search for the chosen combination
selected_key = ('CountVectorizer', 'Naive Bayes', 'PorterStemmer')
model = out_models[selected_key].best_estimator_

In [20]:
# get data
data = datas['PorterStemmer']
X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']
# `model` is the search's best_estimator_: GridSearchCV (refit=True by default)
# has already refit it on this same PorterStemmer training split, so fitting
# it a second time here would only repeat that work.
y_pred = model.predict(X_test)

In [21]:
# per-class precision / recall / f1 as a dict, then as a table with one row
# per class or average
score = classification_report(y_test, y_pred, output_dict=True)
score_df = pd.DataFrame(score).transpose()

In [22]:
# display baseline: class proportions of the full target column
y.value_counts(normalize=True).round(2)
# classes are well-balanced.

subreddit
wine    0.53
beer    0.47
Name: proportion, dtype: float64

In [23]:
# display classification report (rows: classes and averages; columns: metrics)
score_df

Unnamed: 0,precision,recall,f1-score,support
beer,0.953368,0.978723,0.965879,188.0
wine,0.980861,0.957944,0.969267,214.0
accuracy,0.967662,0.967662,0.967662,0.967662
macro avg,0.967115,0.968334,0.967573,402.0
weighted avg,0.968004,0.967662,0.967683,402.0
