## In this notebook, we will tune the various ML algorithms for better performance

# Conversion of text data into representations readable by ML algorithms

Before we utilize machine learning models for prediction, we need to convert the text data into a form that ML models can read.
Here are some forms we will explore:

1. Bag of Words representation
2. TF-IDF representation
3. Word2Vec representation of text
4. GloVe representation of text

In [None]:
# Install the third-party packages that the cells below import.
# NOTE(review): gensim and xgboost are installed unpinned; pin versions for reproducible runs.
%pip install gensim
# NOTE(review): scipy is pinned to 1.12.0 — presumably for compatibility with gensim; confirm before changing.
%pip install scipy==1.12.0
%pip install xgboost

# Importing libraries

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.data import find
import gensim
import nltk
from gensim.models import KeyedVectors, Word2Vec
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

# Reading and preprocessing of data

In [None]:
# Load the train and test splits from CSV; the paths are relative to the working directory.
train_df = pd.read_csv('data/ML/ML_train.csv')
test_df = pd.read_csv('data/ML/ML_test.csv')

In [None]:
# Features are the 'text' column and labels the 'humor' column of each split.
X_train, y_train = train_df['text'], train_df['humor']
X_test, y_test = test_df['text'], test_df['humor']

Bag of Words

In [None]:
# Bag of Words: fit the vectorizer on the training texts only, then encode both
# splits with that same fitted vectorizer.
bow_vectorizer = CountVectorizer().fit(X_train)
bow_X_train, bow_X_test = (
    bow_vectorizer.transform(texts) for texts in (X_train, X_test)
)

TF-IDF

In [None]:
# TF-IDF: fit the vectorizer on the training texts only, then encode each split
# with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer().fit(X_train)
tfidf_X_train = tfidf_vectorizer.transform(X_train)
tfidf_X_test = tfidf_vectorizer.transform(X_test)

Word2Vec: Using a pre-trained model

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
import gensim.downloader as api

# return_path=True asks the downloader for the location of the pretrained file instead of a
# loaded model; `path` is passed to KeyedVectors.load_word2vec_format in the next cell.
path = api.load('word2vec-google-news-300', return_path=True)

In [None]:
# Load the pretrained word2vec vectors from the binary-format file at `path`.
# NOTE: the name `model` is rebound to the GloVe vectors further down, so the word2vec
# cells below must run before that reassignment (i.e. run the notebook top to bottom).
model = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
# As each piece of text has a different length, we average the vector representations
# of the words in the text, so that every text is represented by a vector of the same length.

def sent_vec(sent, model):
    """Average the embedding vectors of the in-vocabulary tokens of one text.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single text.
    model : embedding lookup
        Object exposing ``vector_size``, ``token in model`` and ``model[token]``
        (e.g. a gensim ``KeyedVectors`` instance).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size`` holding the mean of the vectors
        of the tokens found in ``model``. If no token is found (including an
        empty ``sent``), the zero vector is returned.
    """
    vector_size = model.vector_size
    wv_res = np.zeros(vector_size)
    # Count only tokens that actually contribute a vector. Starting from 0
    # (rather than 1) makes the result a true mean instead of sum / (n + 1).
    ctr = 0
    for w in sent:
        if w in model:
            ctr += 1
            wv_res += model[w]
    # Guard the division so texts with no known token stay as the zero vector.
    if ctr:
        wv_res = wv_res / ctr
    return wv_res

In [None]:
# since we cleaned our text data previously, we only need to convert the text in each row into lists, where each element is a token

def split_text(text):
    """Return the whitespace-separated tokens of ``text`` as a list.

    Runs of whitespace count as a single separator, so an empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

In [None]:
# Add a 'tokens' column holding the whitespace-split tokens of each training text, then preview.
train_df['tokens'] = train_df['text'].apply(split_text)
train_df.head()

In [None]:
# Add a 'tokens' column holding the whitespace-split tokens of each test text, then preview.
test_df['tokens'] = test_df['text'].apply(split_text)
test_df.head()

In [None]:
# Add a 'w2v' column: one averaged embedding per training row, computed by sent_vec.
# `model` must still be bound to the word2vec vectors here (it is rebound to GloVe later).
train_df['w2v'] = train_df['tokens'].apply(lambda x: sent_vec(x, model))
train_df.head()

In [None]:
# Add a 'w2v' column: one averaged embedding per test row, computed by sent_vec.
# `model` must still be bound to the word2vec vectors here (it is rebound to GloVe later).
test_df['w2v'] = test_df['tokens'].apply(lambda x: sent_vec(x, model))
test_df.head()

In [None]:
# Collect the per-row word2vec vectors into plain Python lists
# (presumably the feature inputs for the classifiers — confirm against later cells).
w2v_X_train = train_df['w2v'].to_list()
w2v_X_test = test_df['w2v'].to_list()

GloVe: We will use a pretrained model as well

In [None]:
# Load the pretrained 'glove-twitter-50' vectors through gensim's downloader.
# NOTE: this rebinds `model`, which previously held the word2vec vectors; every
# sent_vec call after this cell uses the GloVe vectors.
model = api.load('glove-twitter-50')

In [None]:
# Add a 'glove' column: one averaged embedding per training row, computed by sent_vec
# with `model` now bound to the GloVe vectors.
train_df['glove'] = train_df['tokens'].apply(lambda x: sent_vec(x, model))
train_df.head()

In [None]:
# Add a 'glove' column: one averaged embedding per test row, computed by sent_vec
# with `model` now bound to the GloVe vectors.
test_df['glove'] = test_df['tokens'].apply(lambda x: sent_vec(x, model))
test_df.head()

In [None]:
# Collect the per-row GloVe vectors into plain Python lists
# (presumably the feature inputs for the classifiers — confirm against later cells).
glove_X_train = train_df['glove'].to_list()
glove_X_test = test_df['glove'].to_list()

In [None]:
# Sanity check: display the averaged GloVe vector of the first training row.
glove_X_train[0]

# Function to test the models

In [None]:
def train_and_eval(model, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and print its train/test metrics.

    Parameters
    ----------
    model : estimator exposing ``fit`` (whose return value exposes ``predict``)
    trainX, trainY : training features and labels
    testX, testY : test features and labels

    Prints the estimator, the accuracy on both splits, and the classification
    report for the test split. Nothing is returned.
    """
    # Fit once and reuse the object returned by fit for both sets of predictions.
    estimator = model.fit(trainX, trainY)
    y_preds_train, y_preds_test = estimator.predict(trainX), estimator.predict(testX)

    # Report: blank line, estimator repr, accuracies, per-class report, separator.
    print()
    print(model)
    print(f"Train accuracy score : {accuracy_score(trainY, y_preds_train)}")
    print(f"Test accuracy score : {accuracy_score(testY, y_preds_test)}")
    test_report = classification_report(testY, y_preds_test)
    print(test_report)
    separator = '-' * 40
    print('\n', separator)