#### Import modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Module-level word normalizers (WordNet lemmatizer and Porter stemmer).
# NOTE(review): the helper cell below creates its own `lemma_token` / `stem_token`
# instances and uses those instead — confirm whether these two are still needed.
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

#### Load data

In [4]:
# load the combined subreddit posts (title and body merged into `title_body`)
combine_df = pd.read_csv('../data/subreddit_combine_title_body.csv')

In [5]:
# preview the first rows to check the columns that were loaded
combine_df.head()

Unnamed: 0,id,url,comms_num,created,subreddit,title_body
0,17j7oej,https://www.reddit.com/r/wine/comments/17j7oej...,743,2023-10-30 00:18:37,wine,[Megathread] How much is my wine worth? Is it ...
1,1gmbv5t,https://www.reddit.com/r/wine/comments/1gmbv5t...,16,2024-11-08 13:00:27,wine,"Free Talk Friday Bottle porn without notes, ra..."
2,1gogepp,https://i.redd.it/6gdvjahxb60e1.jpeg,21,2024-11-11 08:19:05,wine,Started Journey to Master I have great study m...
3,1goebub,https://www.reddit.com/gallery/1goebub,7,2024-11-11 06:36:49,wine,NV Pierre Peters Howdy Winos! Anyone have any ...
4,1goj0bf,https://www.reddit.com/r/wine/comments/1goj0bf...,13,2024-11-11 10:37:00,wine,"Vouvray Chenin Blanc I mostly drink reds, but ..."


In [6]:
# Strip the class names themselves from the text so the label does not leak
# into the features. The pattern is case-insensitive and matches anywhere in
# a token, so it also removes the substring inside longer words.
combine_df['title_body'] = combine_df['title_body'].str.replace(
    pat=r'beer|wine',
    repl='',
    case=False,
    regex=True,
)

#### Vectorization

In [8]:
# word-normalization helpers: each maps a raw string to a string of normalized tokens
lemma_token = WordNetLemmatizer()
stem_token = PorterStemmer()

def lemma_tokenizer(words):
    """Lemmatize each whitespace-separated token of `words`.

    Returns the lemmatized tokens joined back together with single spaces.
    """
    lemmas = map(lemma_token.lemmatize, words.split())
    return ' '.join(lemmas)

def stem_tokenizer(words):
    """Stem each whitespace-separated token of `words` with the Porter stemmer.

    Returns the stemmed tokens joined back together with single spaces.
    """
    stems = map(stem_token.stem, words.split())
    return ' '.join(stems)

In [9]:
# features: the merged post text; target: the subreddit each post came from
X, y = combine_df['title_body'], combine_df['subreddit']

In [10]:
# find baseline: share of each class in percent; the larger share is the
# accuracy of always predicting the majority class
y.value_counts(normalize=True).mul(100).round(2)

subreddit
wine    53.16
beer    46.84
Name: proportion, dtype: float64

In [11]:
# lemmatize every post, then make a stratified 80/20 train / test split
X_lemma = X.apply(lemma_tokenizer)
X_lemma_train, X_lemma_test, y_lemma_train, y_lemma_test = train_test_split(
    X_lemma,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [12]:
# Porter-stem every post, then make a stratified 80/20 train / test split
# (same seed as the lemmatized split)
X_stem = X.apply(stem_tokenizer)
X_stem_train, X_stem_test, y_stem_train, y_stem_test = train_test_split(
    X_stem,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

#### Training models

In [14]:
# defined models and parameters

# Both vectorizers are searched over the same grid, so build one entry per
# class: the key is the class name, 'pipeline' is the ('vec', instance) step
# and 'grid_params' is the search space for that step.
vectors = {
    vec_cls.__name__: {
        'pipeline': ('vec', vec_cls()),
        'grid_params': {
            'vec__stop_words': [None, 'english'],
            'vec__min_df': [2, 3, 5],
            'vec__ngram_range': [(1,1), (1,2)],
        },
    }
    for vec_cls in (CountVectorizer, TfidfVectorizer)
}

# Classifier steps and their search spaces. Each entry holds the
# (step name, estimator) tuple for the pipeline and the grid for that step;
# parameter names are prefixed with the step name.
models = {
    'Naive Bayes': {
        'pipeline': ('nb', MultinomialNB()),
        'grid_params': {
            'nb__alpha': [0.001, 0.05, 0.1, 1],
        },
    },
    'Logistic Regression': {
        'pipeline': ('lr', LogisticRegression()),
        'grid_params': {
            'lr__C': [0.001, 0.05, 0.1, 1],
            'lr__solver': ['liblinear'],
        },
    },
    'RandomForest Classifier': {
        # fixed seed so the forest (and therefore the reported scores) is
        # reproducible across runs; same seed as the train / test split
        'pipeline': ('rfc', RandomForestClassifier(random_state=42)),
        'grid_params': {
            'rfc__min_samples_split': [2, 4, 8, 10],
            'rfc__min_samples_leaf': [2, 4, 8, 10],
            'rfc__max_depth': [None, 1, 5, 10, 15],
        },
    },
}

# train / test splits keyed by the word normalizer that produced them
datas = {
    'WordNetLemmatizer': dict(
        X_train=X_lemma_train,
        X_test=X_lemma_test,
        y_train=y_lemma_train,
        y_test=y_lemma_test,
    ),
    'PorterStemmer': dict(
        X_train=X_stem_train,
        X_test=X_stem_test,
        y_train=y_stem_train,
        y_test=y_stem_test,
    ),
}

In [15]:
# Pair every vectorizer with every classifier: each (vectorizer name,
# classifier name) key maps to a two-step Pipeline plus the merged grid
# of both steps.
vec_models = {
    (vec_name, cls_name): {
        'pipeline': Pipeline([vector['pipeline'], model['pipeline']]),
        'grid_params': {**vector['grid_params'], **model['grid_params']},
    }
    for vec_name, vector in vectors.items()
    for cls_name, model in models.items()
}

In [16]:
# run gridsearch
# For every (word normalizer, vectorizer, classifier) combination, fit a grid
# search on the training split and record CV / training / testing accuracy.
score_columns = ['Word_Normalizer', 'Vertorizer_name', 'Model_Name', 'CV_Score', 'Training_Score', 'Testing_Score']
out_models = {}
score_rows = []  # one row per fitted search; turned into model_df after the loop
i = 1
for group_name, data in datas.items():
    X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']
    for (vec_name, cls_name), model in vec_models.items():
        key = (vec_name, cls_name, group_name)
        print(f"{i}. Running gridsearch for ({vec_name}, {cls_name}, {group_name})")
        search = GridSearchCV(estimator=model['pipeline'], param_grid=model['grid_params'],
                              scoring='accuracy', n_jobs=10, cv=2)
        search.fit(X_train, y_train)
        out_models[key] = search

        # score each split once and reuse the values for both the table and
        # the printout (every .score call re-vectorizes and re-predicts)
        train_score = search.score(X_train, y_train)
        test_score = search.score(X_test, y_test)
        score_rows.append([group_name, vec_name, cls_name, search.best_score_, train_score, test_score])

        # display model result
        print(f"Best parameters is : {search.best_params_}")
        print(f"Best cross validation score is : {search.best_score_:.8f}")
        print(f"Best training score is : {train_score:.8f}")
        print(f"Best testing score is : {test_score:.8f}")
        print('-' * 80)
        i += 1

# build the score table in one go; the index starts at 1 so that row labels
# match the run numbers printed above
model_df = pd.DataFrame(score_rows, columns=score_columns, index=range(1, len(score_rows) + 1))

1. Running gridsearch for (CountVectorizer, Naive Bayes, WordNetLemmatizer)
Best parameters is : {'nb__alpha': 0.1, 'vec__min_df': 2, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}
Best cross validation score is : 0.90354733
Best training score is : 0.97137523
Best testing score is : 0.90298507
--------------------------------------------------------------------------------
2. Running gridsearch for (CountVectorizer, Logistic Regression, WordNetLemmatizer)
Best parameters is : {'lr__C': 0.1, 'lr__solver': 'liblinear', 'vec__min_df': 2, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}
Best cross validation score is : 0.87617021
Best training score is : 0.98133167
Best testing score is : 0.89054726
--------------------------------------------------------------------------------
3. Running gridsearch for (CountVectorizer, RandomForest Classifier, WordNetLemmatizer)
Best parameters is : {'rfc__max_depth': 15, 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 4, 'vec__min

In [17]:
# add the train-minus-test gap as a rough overfitting indicator, then rank
# the runs from best to worst cross-validation score
model_df['Score_Difference'] = model_df['Training_Score'].sub(model_df['Testing_Score'])
model_df.sort_values('CV_Score', ascending=False)

Unnamed: 0,Word_Normalizer,Vertorizer_name,Model_Name,CV_Score,Training_Score,Testing_Score,Score_Difference
1,WordNetLemmatizer,CountVectorizer,Naive Bayes,0.903547,0.971375,0.902985,0.06839
4,WordNetLemmatizer,TfidfVectorizer,Naive Bayes,0.899815,0.987554,0.910448,0.077107
7,PorterStemmer,CountVectorizer,Naive Bayes,0.896078,0.981954,0.905473,0.076481
10,PorterStemmer,TfidfVectorizer,Naive Bayes,0.89297,0.989421,0.91791,0.071511
5,WordNetLemmatizer,TfidfVectorizer,Logistic Regression,0.890481,0.98631,0.910448,0.075862
11,PorterStemmer,TfidfVectorizer,Logistic Regression,0.889856,0.974487,0.89801,0.076477
3,WordNetLemmatizer,CountVectorizer,RandomForest Classifier,0.876784,0.922215,0.868159,0.054056
2,WordNetLemmatizer,CountVectorizer,Logistic Regression,0.87617,0.981332,0.890547,0.090784
9,PorterStemmer,CountVectorizer,RandomForest Classifier,0.875538,0.937772,0.858209,0.079563
6,WordNetLemmatizer,TfidfVectorizer,RandomForest Classifier,0.872437,0.910392,0.870647,0.039745


#### Analyze models performance

We use `PorterStemmer` for word normalization and `CountVectorizer` for feature extraction with `Naive Bayes` (model #7).
- Best parameters are: {'nb__alpha': 0.1, 'vec__min_df': 2, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}

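Its CV score (0.896) is nearly identical to that of the top-ranked model #1 (0.904), and its testing score is slightly higher (0.905 vs. 0.903). The gap between training and testing scores is similar for both models (0.076 for model #7 vs. 0.068 for model #1), so neither overfits noticeably more than the other.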

In [45]:
# select model #7: CountVectorizer + Naive Bayes tuned on the Porter-stemmed text.
# out_models is keyed by (vectorizer, classifier, word normalizer); the
# normalizer must be the one whose data the model is evaluated on.
model = out_models[('CountVectorizer', 'Naive Bayes', 'PorterStemmer')].best_estimator_

In [47]:
# get the Porter-stemmed split, fit the selected pipeline on its training
# part and predict the held-out posts
data = datas['PorterStemmer']
X_train, X_test, y_train, y_test = (
    data[split] for split in ('X_train', 'X_test', 'y_train', 'y_test')
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [49]:
# build the classification report as a dict and turn it into a table with
# one row per class / average
score = classification_report(y_test, y_pred, output_dict=True)
score_df = pd.DataFrame(score).transpose()

In [51]:
# display baseline (class proportions)
y.value_counts(normalize=True).round(2)
# classes are well-balanced.

subreddit
wine    0.53
beer    0.47
Name: proportion, dtype: float64

In [53]:
# display classification report (per-class precision / recall / F1 plus averages)
score_df

Unnamed: 0,precision,recall,f1-score,support
beer,0.868293,0.946809,0.905852,188.0
wine,0.949239,0.873832,0.909976,214.0
accuracy,0.90796,0.90796,0.90796,0.90796
macro avg,0.908766,0.91032,0.907914,402.0
weighted avg,0.911383,0.90796,0.908047,402.0


This is a good classification model: it has high precision, recall, and F1-scores for each class.