## 0. Importing necessary libraries

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfTransformer
from keras.preprocessing.text import text_to_word_sequence
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

from xgboost import XGBClassifier

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Prepare data

In [4]:
import sys
sys.path.insert(0, '/content/drive/MyDrive/ML/preprocessing')
import Preprocessor

### 1.1. Load data

In [5]:
import tensorflow_datasets as tfds
dataset = tfds.load('imdb_reviews', as_supervised=False)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteH5D1WI/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteH5D1WI/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteH5D1WI/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
X_train, y_train, X_test, y_test = Preprocessor.load_prep(dataset)  

### 1.2. TF-IDF Vectorisation

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

def func(doc):  # n_jobs in GridSearch does not support lambda function
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=func,
    preprocessor=func) 

tfidf.fit(X_train)

  "The parameter 'token_pattern' will not be used"


TfidfVectorizer(preprocessor=<function func at 0x7f72f9e39050>,
                tokenizer=<function func at 0x7f72f9e39050>)

In [8]:
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

## 2. Model training

### 2.1. Default hyperparameter values

In [9]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
rf = RandomForestClassifier(random_state=0)
lr = LogisticRegression(random_state=0)
svm = LinearSVC(random_state=0)

for clf in(rf, lr, svm):
  clf.fit(X_train_tf, y_train)
  y_pred_train = clf.predict(X_train_tf)
  y_pred_test = clf.predict(X_test_tf)

  train_acc = accuracy_score(y_train, y_pred_train)
  test_acc = accuracy_score(y_test, y_pred_test)
  print("classifier: ", clf.__class__.__name__)
  print("train: ", train_acc)
  print("test: ", test_acc)

classifier:  RandomForestClassifier
train:  1.0
test:  0.84704
classifier:  LogisticRegression
train:  0.93748
test:  0.88176
classifier:  LinearSVC
train:  0.99408
test:  0.87224


In [None]:
clf = VotingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('svm', svm)],
    voting='soft'
)
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print("classifier: ", clf.__class__.__name__)
print("train: ", train_acc)
print("test: ", test_acc)

classifier:  VotingClassifier
train:  0.99408
test:  0.87224


## 2.2. Hyperparameter tuning

### 2.2.1. Random Forest

We tried tuning `n_estimators`, `max_depth` to increase the accuracy of the model. However, the tuning does not seem to help much, thus we kept default values for these. The `criterion` hypyerparameter is tuned to yield *entropy* as the best.

In [15]:
rf = RandomForestClassifier(random_state=0, criterion='entropy')
rf.fit(X_train_tf, y_train)

y_pred_train = lr.predict(X_train_tf)
y_pred_test = lr.predict(X_test_tf)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print("classifier: Logistic Regression")
print("train: ", train_acc)
print("test: ", test_acc)

classifier: Logistic Regression
train:  0.9574
test:  0.88332


### 2.2.2. Logistic Regression

After tuning using GridSearchCV, we yield that `C` makes the most significant impact on the accuracy, with the optimal value of 2.

In [13]:
lr = LogisticRegression(C=2)
lr.fit(X_train_tf, y_train)

y_pred_train = lr.predict(X_train_tf)
y_pred_test = lr.predict(X_test_tf)

train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print("classifier: Logistic Regression")
print("train: ", train_acc)
print("test: ", test_acc)

classifier: Logistic Regression
train:  0.9574
test:  0.88332


### 2.2.3. Voting Classifier

After tuning the base classifiers, Voting Classifier is called again to see the difference.

In [16]:
rf = RandomForestClassifier(random_state=0, criterion='entropy')
lr = LogisticRegression(C=2, random_state=0)
svm = LinearSVC(C=.5, random_state=0)

clf = VotingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('svm', svm)],
    voting='soft'
)
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)
print("classifier: ", clf.__class__.__name__)
print("train: ", train_acc)
print("test: ", test_acc)

classifier:  VotingClassifier
train:  0.9574
test:  0.88332
