# Initialisation

In [31]:
# @title Install Packages

!pip install -qq ordered_set

In [32]:
# @title Mount Google Drive for Credentials

from google.colab import drive
drive.mount("/content/drive")
!rm -r -f /content/sample_data
!cp -r /content/drive/MyDrive/.kaggle ~

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# @title Downloads

# nltk
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# imdb sentiment dataset
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!mkdir /content/data
!mv ./imdb-dataset-of-50k-movie-reviews.zip /content/data/imdb-dataset-of-50k-movie-reviews.zip
!unzip -qq /content/data/imdb-dataset-of-50k-movie-reviews.zip -d /content/data/imdb-dataset-of-50k-movie-reviews

# model weights
!kaggle datasets download -d tharushalekamge/models
!mkdir /content/models-weights
!mv ./models.zip /content/models-weights/models.zip
!unzip -qq /content/models-weights/models.zip -d /content/models-weights

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s] 51% 13.0M/25.7M [00:00<00:00, 134MB/s]
100% 25.7M/25.7M [00:00<00:00, 176MB/s]
mkdir: cannot create directory ‘/content/data’: File exists
replace /content/data/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Downloading models.zip to /content
 99% 16.0M/16.2M [00:00<00:00, 167MB/s]
100% 16.2M/16.2M [00:00<00:00, 168MB/s]
mkdir: cannot create directory ‘/content/models-weights’: File exists
replace /content/models-weights/grid_imdb_knn.pickle? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [34]:
# @title Static paths

# Directory the pre-trained model archive is unzipped into by the downloads cell.
model_weights_dir = "/content/models-weights"

# CSV extracted from the IMDB 50k reviews archive by the downloads cell.
dataset_csv_path = "/content/data/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Create dataset

In [35]:
# @title Module Imports

# Standard library
import pickle
import re
import time
from itertools import compress

# Third-party: numerics / data handling / plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy.sparse import lil_matrix

# Third-party: scikit-learn
import sklearn.feature_extraction
import sklearn.preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Third-party: text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Third-party: misc
from ordered_set import OrderedSet

In [36]:
# @title Dataset definition

class IMDBDataset:
  """Cleaned, vectorised train/test/validation splits of an IMDB review CSV.

  Construction reads the CSV, optionally sub-samples it, cleans the ``review``
  column (lower-case -> strip HTML -> drop special characters -> drop stop
  words -> lemmatise), splits 80/10/10 into train/test/validation, vectorises
  the text and binarises the ``sentiment`` labels.

  Attributes set by ``__init__``:
    x_train_imdb, x_test_imdb, x_val_imdb: vectorised review text per split.
    y_train_imdb, y_test_imdb, y_val_imdb: binarised labels per split.
  """

  def _strip_html(self, text):
    """Return ``text`` with HTML markup removed (BeautifulSoup ``html.parser``)."""
    return BeautifulSoup(text, "html.parser").get_text()

  def _remove_special_characters(self, text, remove_digits=True):
    """Return ``text`` keeping only ASCII letters, digits and whitespace.

    ``remove_digits`` is accepted for interface compatibility but has no
    effect: digits are always kept, exactly as before.
    """
    # The range must be `A-Z`. The previous `A-z` also spans the ASCII
    # characters between 'Z' and 'a' ( [ \ ] ^ _ ` ), so those were never
    # stripped.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

  def _remove_stopwords(self, text, is_lower_case=False):
    """Return ``text`` with tokens found in ``self.stop_words`` removed.

    Tokens come from ``self.tokenizer.tokenize``. When ``is_lower_case`` is
    False each token is lower-cased before the stop-word lookup; the token
    itself is emitted unchanged. Kept tokens are joined by single spaces.
    """
    tokens = [token.strip() for token in self.tokenizer.tokenize(text)]
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in self.stop_words]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in self.stop_words]
    return ' '.join(kept_tokens)

  def _lemmatize_text(self, text):
    """Return ``text`` with every ``word_tokenize`` token lemmatised.

    Each lemma is prefixed with one space, so a non-empty result starts with
    a space (unchanged from the original behaviour).
    """
    return ''.join(
        " " + str(self.lemmatizer.lemmatize(word)) for word in word_tokenize(text)
    )

  def __init__(self, stop_words, tokenizer, lemmatizer, loaded_vectorizer, label_binarizer, dataset_csv_path,
               sample_size=10000, random_state=None):
    """Load, clean, split and vectorise the dataset.

    Args:
      stop_words: container supporting ``in``; tokens found in it are dropped.
      tokenizer: object exposing ``tokenize(text)``, used for stop-word removal.
      lemmatizer: object exposing ``lemmatize(word)``.
      loaded_vectorizer: text vectoriser; ``fit_transform`` is called on the
        training reviews and ``transform`` on the test/validation reviews.
      label_binarizer: label encoder; fitted on the training labels only.
      dataset_csv_path: path of a CSV with ``review`` and ``sentiment`` columns.
      sample_size: number of rows to draw at random before preprocessing
        (capped at the number of rows available); ``None`` keeps every row.
        The default matches the previously hard-coded 10000.
      random_state: seed forwarded to the row sampling and to both
        ``train_test_split`` calls. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible splits.
    """
    self.stop_words = stop_words
    self.tokenizer = tokenizer
    self.lemmatizer = lemmatizer

    ## Import
    data = pd.read_csv(dataset_csv_path)
    if sample_size is not None:
        # Cap at len(data) so a CSV smaller than `sample_size` does not raise.
        data = data.sample(n=min(sample_size, len(data)), random_state=random_state)

    ## Preprocess
    reviews = (
        data['review']
        .str.lower()
        .apply(self._strip_html)
        .apply(self._remove_special_characters)
        .apply(self._remove_stopwords)
        .apply(self._lemmatize_text)
    )
    sentiments = data['sentiment']

    ## Split Data: 80% train, then the remaining 20% halved into test / validation
    x_train_text, x_holdout_text, y_train_raw, y_holdout_raw = train_test_split(
        reviews, sentiments, test_size=0.2, random_state=random_state)
    x_test_text, x_val_text, y_test_raw, y_val_raw = train_test_split(
        x_holdout_text, y_holdout_raw, test_size=0.5, random_state=random_state)

    ## X data - the vectoriser is fitted on the training split only
    self.x_train_imdb = loaded_vectorizer.fit_transform(x_train_text)
    self.x_test_imdb = loaded_vectorizer.transform(x_test_text)
    self.x_val_imdb = loaded_vectorizer.transform(x_val_text)

    # Y data - the original comment states "Positive is 1" (presumably the
    # labels are 'negative'/'positive' - TODO confirm against the CSV).
    # The binariser is fitted once on the training labels and only *applied*
    # to the other splits, so all three share one label -> integer mapping.
    # Re-fitting per split could change the mapping if a split lacked a class.
    self.y_train_imdb = label_binarizer.fit_transform(y_train_raw)
    self.y_test_imdb = label_binarizer.transform(y_test_raw)
    self.y_val_imdb = label_binarizer.transform(y_val_raw)



In [37]:
# @title Dataset instantiation

# NOTE(security): pickle.load can execute arbitrary code. Only load this file
# because it comes from the model archive downloaded earlier in this notebook.
# A `with` block is used so the file handle is closed deterministically.
with open(f'{model_weights_dir}/vectorizer_imdb.pkl', 'rb') as vocab_file:
    loaded_vocab = pickle.load(vocab_file)

stop_words = set(stopwords.words('english'))
tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
lemmatizer = WordNetLemmatizer()

# The vocabulary is fixed to the pickled one (presumably the vocabulary the
# downloaded models were trained with - TODO confirm).
# NOTE(review): scikit-learn documents `min_df` as ignored when `vocabulary`
# is supplied, so `min_df=2` has no effect here.
loaded_vectorizer = TfidfVectorizer(min_df=2, vocabulary=loaded_vocab)
label_binarizer = sklearn.preprocessing.LabelBinarizer()
feature_names = loaded_vectorizer.get_feature_names_out()

ds = IMDBDataset(stop_words, tokenizer, lemmatizer, loaded_vectorizer, label_binarizer, dataset_csv_path)

  soup = BeautifulSoup(text, "html.parser")


# Training

## Train RF model

In [None]:
# Hyper-parameter search space for the random forest.

# Tree counts: ten evenly spaced values from 10 to 100.
n_estimators = np.linspace(start=10, stop=100, num=10).astype(int).tolist()

# Depth limits: five evenly spaced values from 10 to 100 (truncated to int),
# plus None as a sixth option.
max_depth = np.linspace(10, 100, num=5).astype(int).tolist() + [None]

# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the grid.
grid_rf = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(grid_rf)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Randomised search: 200 parameter settings sampled from `grid_rf`, each scored
# with 3-fold cross-validation, using all available cores.
grid_imdb_rf = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=grid_rf,
    n_iter=200,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model (labels flattened to 1-D).
grid_imdb_rf.fit(ds.x_train_imdb, ds.y_train_imdb.ravel())

# Persist the fitted search. The `with` block guarantees the file is flushed
# and closed; the previous bare open() left that to garbage collection.
with open('grid_imdb_rf.pickle', "wb") as model_file:
    pickle.dump(grid_imdb_rf, model_file)

## Train SVC Model

In [131]:
# Param Optimisation
# Exhaustive grid over C and gamma for an RBF-kernel SVC: 4 x 4 x 1 = 16 candidates.
param_grid_imdb = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}

# probability=True so the fitted SVC exposes probability estimates;
# refit=True retrains the best candidate once the search has finished.
grid_imdb_svc = GridSearchCV(
    SVC(probability=True),
    param_grid_imdb,
    refit=True,
    verbose=2,
)

In [None]:
# Run the SVC grid search on the training split (labels flattened to 1-D).
grid_imdb_svc.fit(ds.x_train_imdb, ds.y_train_imdb.ravel())

# Persist the fitted search. The `with` block guarantees the file is flushed
# and closed; the previous bare open() left that to garbage collection.
with open('grid_imdb_svc.pickle', "wb") as model_file:
    pickle.dump(grid_imdb_svc, model_file)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


## Train KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Search space for k-nearest-neighbours: 7 neighbour counts x 2 metrics x
# 2 weighting schemes = 28 candidates.
grid_params_imdb_knn = {
    'n_neighbors': [30, 40, 50, 60, 70, 80, 90],
    'metric': ['manhattan', 'minkowski'],
    'weights': ['uniform', 'distance'],
}

# Exhaustive search over the grid, using all available cores.
grid_imdb_knn = GridSearchCV(
    KNeighborsClassifier(),
    grid_params_imdb_knn,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the KNN grid search on the training split (labels flattened to 1-D).
grid_imdb_knn.fit(ds.x_train_imdb, np.ravel(ds.y_train_imdb, order='C'))

# Persist the fitted search. The `with` block guarantees the file is flushed
# and closed; the previous bare open() left that to garbage collection.
with open('grid_imdb_knn.pickle', "wb") as model_file:
    pickle.dump(grid_imdb_knn, model_file)

## Train LR Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Values shared by every sub-grid below.
_lr_C_values = np.logspace(-4, 4, 20)
_lr_max_iters = [100, 1000, 5000]

# Each penalty is paired only with solvers that support it. The previous single
# grid crossed 'l1' and 'elasticnet' with lbfgs / newton-cg / sag, which only
# handle 'l2'. Those candidates (two thirds of the grid) could never fit, so
# only the 'l2' rows were ever really evaluated.
param_grid_imdb_lr = [
    # l2: same solvers as before.
    {'penalty': ['l2'],
     'C': _lr_C_values,
     'solver': ['lbfgs', 'newton-cg', 'sag'],
     'max_iter': _lr_max_iters,
    },
    # l1: supported by liblinear and saga.
    {'penalty': ['l1'],
     'C': _lr_C_values,
     'solver': ['liblinear', 'saga'],
     'max_iter': _lr_max_iters,
    },
    # elasticnet: saga only, and it requires an explicit l1_ratio.
    {'penalty': ['elasticnet'],
     'C': _lr_C_values,
     'solver': ['saga'],
     'l1_ratio': [0.25, 0.5, 0.75],
     'max_iter': _lr_max_iters,
    },
]

# Exhaustive search with 3-fold cross-validation, using all available cores.
grid_imdb_lr = GridSearchCV(LogisticRegression(), param_grid = param_grid_imdb_lr, cv = 3, verbose=2, n_jobs=-1)

In [None]:
# Run the logistic-regression grid search on the training split (labels flattened to 1-D).
grid_imdb_lr.fit(ds.x_train_imdb, np.ravel(ds.y_train_imdb, order='C'))

# Persist the fitted search. The `with` block guarantees the file is flushed
# and closed; the previous bare open() left that to garbage collection.
with open('grid_imdb_lr.pickle', "wb") as model_file:
    pickle.dump(grid_imdb_lr, model_file)

# Load Models

In [38]:
# Load
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE(security): pickle.load can execute arbitrary code; only use this on
    files from a trusted source (here: the model archive downloaded earlier).
    """
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


# Fitted hyper-parameter searches shipped in the model-weights archive.
loaded_svc_imdb = _load_pickle(f'{model_weights_dir}/grid_imdb_svc.pickle')
loaded_lr_imdb = _load_pickle(f'{model_weights_dir}/grid_imdb_lr.pickle')
loaded_rf_imdb = _load_pickle(f'{model_weights_dir}/grid_imdb_rf.pickle')
loaded_knn_imdb = _load_pickle(f'{model_weights_dir}/grid_imdb_knn.pickle')

  loaded_svc_imdb = pickle.load(open(f'{model_weights_dir}/grid_imdb_svc.pickle', "rb"))
  loaded_knn_imdb = pickle.load(open(f'{model_weights_dir}/grid_imdb_knn.pickle', "rb"))


In [39]:
# Report the winning hyper-parameters of each loaded search,
# in the order SVC, LR, RF, KNN.
for loaded_search in (loaded_svc_imdb, loaded_lr_imdb, loaded_rf_imdb, loaded_knn_imdb):
    print(loaded_search.best_params_)

{'C': 10, 'gamma': 1, 'kernel': 'rbf'}
{'C': 4.281332398719396, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': None, 'bootstrap': False}
{'metric': 'minkowski', 'n_neighbors': 90, 'weights': 'distance'}
