### Comparing Models and Vectorization Strategies for Text Classification

This notebook focuses on weighing the positives and negatives of different estimators and vectorization strategies for a text classification problem.

### The Data

The data below comes from Kaggle: it is the "ColBERT Dataset" created for this [paper](https://arxiv.org/pdf/2004.12765.pdf). Use the `text` column to classify whether or not each text is humorous (the `humor` column). The dataset is loaded and displayed below.

**Note:** The original dataset contains 200K rows of data. It is best to use the full dataset. If the original dataset is too large for your computer, please use 'dataset-minimal.csv', which has been reduced to 100K rows.

In [None]:
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ClassificationReport


warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources used further down: tokenizer data for word_tokenize,
# WordNet for WordNetLemmatizer, and the stop-word lists.
# 'punkt_tab' is the tokenizer resource newer NLTK releases load instead of
# 'punkt' (NOTE(review): confirm against the installed NLTK version).
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Read the dataset and lower-case every entry of the text column.
df = pd.read_csv('data/dataset.csv')
df['text'] = df['text'].map(str.lower)

In [None]:
# Encode the boolean humor label as 0 (False) / 1 (True).
df = df.assign(humor=df['humor'].map({False: 0, True: 1}))

In [None]:
# Independent copy of the frame; it is lemmatized later while `df` is stemmed.
df2 = df.copy(deep=True)

In [None]:
# Preview the first five rows of the lower-cased, label-encoded frame.
df.head(n=5)

In [None]:
# Preview the first five rows of the copy.
df2.head(n=5)

#### Task


**Text preprocessing:** As a pre-processing step, perform both `stemming` and `lemmatizing` to normalize your text before classifying. For each technique, use both the `CountVectorizer` and the `TfidfVectorizer`, and use their options for stop words and max features to prepare the text data for your estimator.

**Classification:** Once you have prepared the text data with the stemming and lemmatizing techniques, consider `LogisticRegression`, `DecisionTreeClassifier`, and `MultinomialNB` as classification algorithms for the data. Compare their performance in terms of accuracy and speed.

Share the results of your best classifier in the form of a table with the best version of each estimator, a dictionary of the best parameters and the best score.

### Text Preprocessing

In [None]:
# stemming function & removing stop words & punctuation
# Built once instead of once per row: the stop-word list no longer has to be
# rebuilt for every document, and set membership replaces a list scan per token.
_STEM_STOP_WORDS = frozenset(stopwords.words('english'))
_STEM_PUNCTUATION = frozenset(string.punctuation)
_PORTER_STEMMER = PorterStemmer()


def stemmer(text):
    """Tokenize `text`, drop stop words and punctuation, and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Document to normalize. Stop-word matching is case-sensitive against
        NLTK's English list, so pass lower-cased text.

    Returns
    -------
    str
        The stemmed tokens that were kept, joined by single spaces.
    """
    kept_stems = [
        _PORTER_STEMMER.stem(token)
        for token in word_tokenize(text)
        if token not in _STEM_STOP_WORDS
        # A token counts as punctuation when every character is a punctuation
        # mark. The previous substring test against string.punctuation let
        # multi-character tokens such as '...' through.
        and not all(char in _STEM_PUNCTUATION for char in token)
    ]
    return ' '.join(kept_stems)

In [None]:
# Replace the text column with its stemmed version and pull out the target.
# NOTE(review): re-running this cell stems already-stemmed text again.
df = df.assign(text=df['text'].map(stemmer))
y = df.loc[:, 'humor']

In [None]:
# Seeded train/test split of the stemmed text and its labels.
stemmed_split = train_test_split(df['text'], y, random_state=42)
X_train, X_test, y_train, y_test = stemmed_split

In [None]:
# Peek at the first five stemmed training documents.
X_train.head(n=5)

In [None]:
# Token counts for the stemmed text: the vocabulary is fitted on the training
# split and then reused to encode the test split.
cv_s = CountVectorizer(stop_words='english')
cv_s_train, cv_s_test = cv_s.fit_transform(X_train), cv_s.transform(X_test)

In [None]:
# TF-IDF weights for the stemmed text: fitted on the training split, applied to both.
tfidf_s = TfidfVectorizer(stop_words='english')
tfidf_s_train, tfidf_s_test = tfidf_s.fit_transform(X_train), tfidf_s.transform(X_test)

# Show the first ten vocabulary terms.
tfidf_s_first_terms = tfidf_s.get_feature_names_out()[:10]
print(tfidf_s_first_terms)

In [None]:
# Wrap the sparse document-term matrices in sparse-backed DataFrames.
# Calling .toarray() here would allocate a dense (documents x vocabulary)
# array per matrix, which can exhaust memory on the full dataset.
cv_s_df = pd.DataFrame.sparse.from_spmatrix(cv_s_train, columns=cv_s.get_feature_names_out())
tfidf_s_df = pd.DataFrame.sparse.from_spmatrix(tfidf_s_train, columns=tfidf_s.get_feature_names_out())

print(cv_s_df.shape)
print(tfidf_s_df.shape)

# Check whether the DataFrames are equal
# Done on the underlying sparse matrices with the same criteria as
# DataFrame.equals: identical column labels, shape, dtype and values.
stemmed_frames_equal = (
    np.array_equal(cv_s.get_feature_names_out(), tfidf_s.get_feature_names_out())
    and cv_s_train.shape == tfidf_s_train.shape
    and cv_s_train.dtype == tfidf_s_train.dtype
    and (cv_s_train != tfidf_s_train).nnz == 0
)
print(bool(stemmed_frames_equal))

In [None]:
# lemmatizing function & removing stop words & punctuation
# Built once instead of once per row: the stop-word list no longer has to be
# rebuilt for every document, and set membership replaces a list scan per token.
_LEMMA_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMA_PUNCTUATION = frozenset(string.punctuation)
_WORDNET_LEMMATIZER = WordNetLemmatizer()


def lemmatizer(text):
    """Tokenize `text`, drop stop words and punctuation, and lemmatize the rest.

    Parameters
    ----------
    text : str
        Document to normalize. Stop-word matching is case-sensitive against
        NLTK's English list, so pass lower-cased text.

    Returns
    -------
    str
        The lemmatized tokens that were kept, joined by single spaces.
    """
    kept_lemmas = [
        _WORDNET_LEMMATIZER.lemmatize(token)
        for token in word_tokenize(text)
        if token not in _LEMMA_STOP_WORDS
        # A token counts as punctuation when every character is a punctuation
        # mark. The previous substring test against string.punctuation let
        # multi-character tokens such as '...' through.
        and not all(char in _LEMMA_PUNCTUATION for char in token)
    ]
    return ' '.join(kept_lemmas)

In [None]:
# Lemmatize the copy's text column, then split it with the same seed as before.
# NOTE(review): this rebinds X_train/X_test/y_train/y_test from the stemming
# split; the stemmed matrices are later fitted against these labels, which
# presumably line up because both splits use the same rows and seed - confirm.
df2 = df2.assign(text=df2['text'].map(lemmatizer))
y = df2.loc[:, 'humor']

lemmatized_split = train_test_split(df2['text'], y, random_state=42)
X_train, X_test, y_train, y_test = lemmatized_split

In [None]:
# Token counts for the lemmatized text, capped at 100 features; fitted on the
# training split and reused for the test split.
cv_l = CountVectorizer(stop_words='english', max_features=100)
cv_l_train, cv_l_test = cv_l.fit_transform(X_train), cv_l.transform(X_test)

In [None]:
# TF-IDF weights for the lemmatized text with max_df=0.7; fitted on the
# training split and reused for the test split.
tfidf_l = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_l_train, tfidf_l_test = tfidf_l.fit_transform(X_train), tfidf_l.transform(X_test)

# Show the first ten vocabulary terms.
tfidf_l_first_terms = tfidf_l.get_feature_names_out()[:10]
print(tfidf_l_first_terms)

In [None]:
# Wrap the sparse document-term matrices in sparse-backed DataFrames.
# Calling .toarray() here would allocate a dense (documents x vocabulary)
# array per matrix, which can exhaust memory on the full dataset.
cv_l_df = pd.DataFrame.sparse.from_spmatrix(cv_l_train, columns=cv_l.get_feature_names_out())
tfidf_l_df = pd.DataFrame.sparse.from_spmatrix(tfidf_l_train, columns=tfidf_l.get_feature_names_out())

print(cv_l_df.shape)
print(tfidf_l_df.shape)

# Check whether the DataFrames are equal
# Done on the underlying sparse matrices with the same criteria as
# DataFrame.equals: identical column labels, shape, dtype and values.
# The shape test runs first so differently sized matrices are never compared
# element-wise.
lemmatized_frames_equal = (
    np.array_equal(cv_l.get_feature_names_out(), tfidf_l.get_feature_names_out())
    and cv_l_train.shape == tfidf_l_train.shape
    and cv_l_train.dtype == tfidf_l_train.dtype
    and (cv_l_train != tfidf_l_train).nnz == 0
)
print(bool(lemmatized_frames_equal))

# Classification Models

In [None]:
# Single-step pipelines so grid parameters can be addressed as '<step>__<param>'.
logreg_pipe = Pipeline([('lr', LogisticRegression(random_state=42, max_iter = 1000))])
dtc_pipe = Pipeline([('dtc', DecisionTreeClassifier(random_state=42))])

# 'newton-cg' and 'lbfgs' only support the l2 penalty, so l1 is paired with
# 'liblinear' alone. Crossing every penalty with every solver in one dict made
# the search attempt unsupported combinations whose fits fail on every fold.
logreg_C_values = np.logspace(-3, 3, 7)
params_logreg = [
    {
        'lr__penalty': ['l2'],
        'lr__C': logreg_C_values,
        'lr__solver': ['newton-cg', 'lbfgs', 'liblinear'],
    },
    {
        'lr__penalty': ['l1'],
        'lr__C': logreg_C_values,
        'lr__solver': ['liblinear'],
    },
]

params_dtc = {
    'dtc__min_impurity_decrease': [0.01, 0.02, 0.03, 0.05],
    'dtc__max_depth': [2, 5, 10],
    'dtc__min_samples_split': [0.1, 0.2, 0.05]
}

In [None]:
def calculate_performance(pipe, params, X_train, y_train, X_test, y_test):
    """Grid-search `pipe` over `params` and score the winner on the test data.

    Parameters
    ----------
    pipe : estimator
        Estimator (here a Pipeline) handed to GridSearchCV.
    params : dict or list of dict
        Parameter grid for the search.
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Data the fitted search is scored on.

    Returns
    -------
    dict
        'best_params' holds the search's best parameter combination and
        'best_score' the accuracy of the fitted search on (X_test, y_test).
    """
    search = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy')
    search.fit(X_train, y_train)

    return {
        'best_params': search.best_params_,
        'best_score': search.score(X_test, y_test),
    }

# LogisticRegression

In [None]:
# Logistic regression on CountVectorizer features (stemmed text)
lr_cv_s_dict = calculate_performance(
    pipe=logreg_pipe,
    params=params_logreg,
    X_train=cv_s_train,
    y_train=y_train,
    X_test=cv_s_test,
    y_test=y_test,
)
lr_cv_s_best_params, lr_cv_s_best_score = (
    lr_cv_s_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Logistic regression on CountVectorizer features (lemmatized text)
lr_cv_l_dict = calculate_performance(
    pipe=logreg_pipe,
    params=params_logreg,
    X_train=cv_l_train,
    y_train=y_train,
    X_test=cv_l_test,
    y_test=y_test,
)
lr_cv_l_best_params, lr_cv_l_best_score = (
    lr_cv_l_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Logistic regression on TF-IDF features (stemmed text)
lr_tfidf_s_dict = calculate_performance(
    pipe=logreg_pipe,
    params=params_logreg,
    X_train=tfidf_s_train,
    y_train=y_train,
    X_test=tfidf_s_test,
    y_test=y_test,
)
lr_tfidf_s_best_params, lr_tfidf_s_best_score = (
    lr_tfidf_s_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Logistic regression on TF-IDF features (lemmatized text)
lr_tfidf_l_dict = calculate_performance(
    pipe=logreg_pipe,
    params=params_logreg,
    X_train=tfidf_l_train,
    y_train=y_train,
    X_test=tfidf_l_test,
    y_test=y_test,
)
lr_tfidf_l_best_params, lr_tfidf_l_best_score = (
    lr_tfidf_l_dict[field] for field in ('best_params', 'best_score')
)

# DecisionTreeClassifier

In [None]:
# Decision tree on CountVectorizer features (stemmed text)
dtc_cv_s_dict = calculate_performance(
    pipe=dtc_pipe,
    params=params_dtc,
    X_train=cv_s_train,
    y_train=y_train,
    X_test=cv_s_test,
    y_test=y_test,
)
dtc_cv_s_best_params, dtc_cv_s_best_score = (
    dtc_cv_s_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Decision tree on CountVectorizer features (lemmatized text)
dtc_cv_l_dict = calculate_performance(
    pipe=dtc_pipe,
    params=params_dtc,
    X_train=cv_l_train,
    y_train=y_train,
    X_test=cv_l_test,
    y_test=y_test,
)
dtc_cv_l_best_params, dtc_cv_l_best_score = (
    dtc_cv_l_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Decision tree on TF-IDF features (stemmed text)
dtc_tfidf_s_dict = calculate_performance(
    pipe=dtc_pipe,
    params=params_dtc,
    X_train=tfidf_s_train,
    y_train=y_train,
    X_test=tfidf_s_test,
    y_test=y_test,
)
dtc_tfidf_s_best_params, dtc_tfidf_s_best_score = (
    dtc_tfidf_s_dict[field] for field in ('best_params', 'best_score')
)

In [None]:
# Decision tree on TF-IDF features (lemmatized text)
dtc_tfidf_l_dict = calculate_performance(
    pipe=dtc_pipe,
    params=params_dtc,
    X_train=tfidf_l_train,
    y_train=y_train,
    X_test=tfidf_l_test,
    y_test=y_test,
)
dtc_tfidf_l_best_params, dtc_tfidf_l_best_score = (
    dtc_tfidf_l_dict[field] for field in ('best_params', 'best_score')
)

# MultinomialNB

In [None]:
# Smoothing values searched for MultinomialNB.
# NOTE(review): the grid starts at alpha=0 (no smoothing) - confirm that is intended.
params_nb = {
    'nb__alpha' : np.arange(0, 1, .1)
}
nb_pipe = Pipeline([('nb', MultinomialNB())])

def train_and_predict(X, y, X_test=None, y_test=None):
    """Grid-search MultinomialNB over `params_nb` and report its accuracy.

    Parameters
    ----------
    X, y
        Training features and labels the grid search is fitted on.
    X_test, y_test : optional
        Held-out features and labels to score on. When either is omitted the
        score is computed on (X, y) itself, i.e. it is a training accuracy and
        not comparable with scores measured on a test split.

    Returns
    -------
    dict
        'best_params' holds the search's best parameter combination and
        'best_score' the accuracy on the evaluation data described above.
    """
    grid = GridSearchCV(nb_pipe,
                        param_grid=params_nb,
                        scoring='accuracy').fit(X, y)
    if X_test is None or y_test is None:
        eval_X, eval_y = X, y
    else:
        eval_X, eval_y = X_test, y_test
    best_params = grid.best_params_
    best_score = grid.score(eval_X, eval_y)
    return {'best_params':best_params, 'best_score':best_score}

In [None]:
# MultinomialNB on CountVectorizer features (stemmed text).
# Scored on the held-out test split via calculate_performance, like the other
# models; train_and_predict(X, y) reported accuracy on the training data.
nb_cv_s_dict = calculate_performance(nb_pipe, params_nb, cv_s_train, y_train, cv_s_test, y_test)
nb_cv_s_best_params = nb_cv_s_dict['best_params']
nb_cv_s_best_score = nb_cv_s_dict['best_score']

In [None]:
# MultinomialNB on CountVectorizer features (lemmatized text).
# Scored on the held-out test split via calculate_performance, like the other
# models; train_and_predict(X, y) reported accuracy on the training data.
nb_cv_l_dict = calculate_performance(nb_pipe, params_nb, cv_l_train, y_train, cv_l_test, y_test)
nb_cv_l_best_params = nb_cv_l_dict['best_params']
nb_cv_l_best_score = nb_cv_l_dict['best_score']

In [None]:
# MultinomialNB on TF-IDF features (stemmed text).
# Scored on the held-out test split via calculate_performance, like the other
# models; train_and_predict(X, y) reported accuracy on the training data.
nb_tfidf_s_dict = calculate_performance(nb_pipe, params_nb, tfidf_s_train, y_train, tfidf_s_test, y_test)
nb_tfidf_s_best_params = nb_tfidf_s_dict['best_params']
nb_tfidf_s_best_score = nb_tfidf_s_dict['best_score']

In [None]:
# MultinomialNB on TF-IDF features (lemmatized text).
# Scored on the held-out test split via calculate_performance, like the other
# models; train_and_predict(X, y) reported accuracy on the training data.
nb_tfidf_l_dict = calculate_performance(nb_pipe, params_nb, tfidf_l_train, y_train, tfidf_l_test, y_test)
nb_tfidf_l_best_params = nb_tfidf_l_dict['best_params']
nb_tfidf_l_best_score = nb_tfidf_l_dict['best_score']

### Findings

In [None]:
# Summary table: one header row per model (blank apart from the model name)
# followed by one row per vectorizer / normalization combination.
# Rows are generated from the structure below so labels, parameters and scores
# cannot drift out of alignment; the 'MultinomialNB' label typo is also fixed.
prep_labels = [
    'CountVectorizer & Stemming',
    'CountVectorizer & Lemmatization',
    'TFIDF & Stemming',
    'TFIDF & Lemmatization',
]
# (best params, best score) pairs, listed in the same order as prep_labels.
model_outcomes = {
    'LogisticRegression': [
        (lr_cv_s_best_params, lr_cv_s_best_score),
        (lr_cv_l_best_params, lr_cv_l_best_score),
        (lr_tfidf_s_best_params, lr_tfidf_s_best_score),
        (lr_tfidf_l_best_params, lr_tfidf_l_best_score),
    ],
    'DecisionTreeClassifier': [
        (dtc_cv_s_best_params, dtc_cv_s_best_score),
        (dtc_cv_l_best_params, dtc_cv_l_best_score),
        (dtc_tfidf_s_best_params, dtc_tfidf_s_best_score),
        (dtc_tfidf_l_best_params, dtc_tfidf_l_best_score),
    ],
    'MultinomialNB': [
        (nb_cv_s_best_params, nb_cv_s_best_score),
        (nb_cv_l_best_params, nb_cv_l_best_score),
        (nb_tfidf_s_best_params, nb_tfidf_s_best_score),
        (nb_tfidf_l_best_params, nb_tfidf_l_best_score),
    ],
}

result_rows = [
    row
    for model_name, outcomes in model_outcomes.items()
    for row in [(model_name, '', '', '')] + [
        (' ', prep_label, params, score)
        for prep_label, (params, score) in zip(prep_labels, outcomes)
    ]
]

results_df = pd.DataFrame(
    result_rows,
    columns=['Model', 'Prep & Classification', 'Best Params', 'Best Score'],
).set_index('Model')

In [None]:
# Widen the column display so the parameter dictionaries are not truncated.
pd.set_option("display.max_colwidth", 10000)
# Blank out missing values for display. fillna is the direct API for this;
# replace(..., regex=True) expects a string pattern, which NaN is not.
results_df.fillna('')

### Scoring the top 2 classifiers

In [None]:
# predicting with best LogisticRegression model - TF-IDF & lemmatization
# NOTE(review): these hyper-parameters are hard-coded - confirm they match the
# best parameters shown in the results table for the current run.
lr_settings = dict(C=10, penalty='l2', solver='lbfgs', random_state=42, max_iter=1000)
best_lr_model = LogisticRegression(**lr_settings)
best_lr_model.fit(X=tfidf_l_train, y=y_train)

In [None]:
# Predicted labels and score of the fitted logistic regression on the
# lemmatized TF-IDF test matrix.
lr_predictions = best_lr_model.predict(X=tfidf_l_test)
lr_score = best_lr_model.score(X=tfidf_l_test, y=y_test)

In [None]:
# Print the labels the logistic regression predicted for the test matrix.
print(lr_predictions)

In [None]:
# Confusion matrix of the logistic regression on the test split.
# ConfusionMatrixDisplay is imported with the other dependencies at the top of
# the notebook rather than inside this cell.
ConfusionMatrixDisplay.from_estimator(best_lr_model, tfidf_l_test, y_test)

In [None]:
# predicting with best MultinomialNB model - TF-IDF & lemmatization
# NOTE(review): alpha is hard-coded - confirm it matches the best parameters
# shown in the results table for the current run.
nb_settings = {'alpha': 0.7}
nb_model = MultinomialNB(**nb_settings)
nb_model.fit(X=tfidf_l_train, y=y_train)

In [None]:
# Predict labels for the lemmatized TF-IDF test matrix and print them.
nb_predictions = nb_model.predict(X=tfidf_l_test)
print(nb_predictions)

In [None]:
# Confusion matrix of the MultinomialNB model on the test split.
# ConfusionMatrixDisplay is imported with the other dependencies at the top of
# the notebook; the repeated in-cell import was redundant.
ConfusionMatrixDisplay.from_estimator(nb_model, tfidf_l_test, y_test)