In [36]:
# Import libs
import warnings
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

warnings.filterwarnings(action="ignore")
import os
import re
import nltk
import sys

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import en_core_web_sm

en_core_web_sm.load()
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset

In [6]:
# Data file path
PATH = "E:\\NLP\\Final\\aclImdb"
sys.getdefaultencoding()

'utf-8'

In [7]:
# Separating train files to positive and negative
posFiles = [x for x in os.listdir(PATH + "/train/pos/") if x.endswith(".txt")]
negFiles = [x for x in os.listdir(PATH + "/train/neg/") if x.endswith(".txt")]

In [10]:
# Separating test files to positive and negative
test_pos_Files = [x for x in os.listdir(PATH + "/test/pos/") if x.endswith(".txt")]
test_neg_Files = [x for x in os.listdir(PATH + "/test/neg/") if x.endswith(".txt")]

In [8]:
P_train = []
N_train = []

for nfile in negFiles:
    with open(PATH + "/train/neg/" + nfile, encoding="utf-8") as f:
        N_train.append(f.read())
        
for pfile in posFiles:
    with open(PATH + "/train/pos/" + pfile, encoding="utf-8") as f:
        P_train.append(f.read())

In [11]:
P_test = []
N_test = []

for ptestfile in test_pos_Files:
    with open(PATH + "/test/pos/" + ptestfile, encoding="utf-8") as f:
        P_test.append(f.read())
        
for ntestfile in test_neg_Files:
    with open(PATH + "/test/neg/" + ntestfile, encoding="utf-8") as f:
        N_test.append(f.read())

In [12]:
reviews_train = pd.concat([
    pd.DataFrame({"review": P_train, "Label": 1, "file": posFiles}),
    pd.DataFrame({"review": N_train, "Label": -1, "file": negFiles})
], ignore_index=True).sample(frac=1, random_state=1)

reviews_test = pd.concat([
    pd.DataFrame({"review": P_test, "Label": 1, "file": test_pos_Files}),
    pd.DataFrame({"review": N_test, "Label": -1, "file": test_neg_Files})
], ignore_index=True).sample(frac=1, random_state=1)

In [13]:
reviews_train

Unnamed: 0,review,Label,file
21492,"I have copy of this on VHS, I think they (The ...",-1,6844_1.txt
9488,After several extremely well ratings to the po...,1,7290_10.txt
16933,I still don't know why I forced myself to sit ...,-1,2740_1.txt
12604,Mt little sister and I are self-proclaimed hor...,-1,10094_1.txt
8222,I have personally seen many Disney movies in m...,1,6150_7.txt
...,...,...,...
10955,Diane Keaton gave an outstanding performance i...,1,8610_10.txt
17289,"This has to be creepiest, most twisted holiday...",-1,3060_1.txt
5192,"Do not expect a depiction of the ""truth"". Howe...",1,3423_7.txt
12172,The League of Gentlemen is one of the funniest...,1,9706_10.txt


In [16]:
reviews_test

Unnamed: 0,review,Label,file
21492,A movie theater with a bad history of past gru...,-1,6844_2.txt
9488,"""Here On Earth"" is a surprising beautiful roma...",1,7290_10.txt
16933,I just watched Descent. Gawds what an awful mo...,-1,2740_3.txt
12604,In a nutshell the movie is about a gang war in...,-1,10094_4.txt
8222,"Instead of watching the recycled history of ""P...",1,6150_7.txt
...,...,...,...
10955,This movie is a fascinating drama about the Ma...,1,8610_8.txt
17289,"It's too kind to call this a ""fictionalized"" a...",-1,3060_3.txt
5192,I was unsure of this movie before renting and ...,1,3423_9.txt
12172,"Just got out of an advance screening, and wow ...",1,9706_7.txt


# Data Preprocessing

Here we remove html tags, urls, special characters,Lemmanatize-
which is better than stemming as it gives a proper word after cutting

In [20]:
stopWords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [21]:
def rmvhtmltags(text):
    remreg = re.compile('<.*?>')
    cleartext = re.sub(remreg, '', text)
    return text

def remove_urls(vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return (vTEXT)

def rmvspclcharacter(text):
    clearspcl = re.sub(r'[^A-Za-z0-9\s.]', r'', str(text).lower())
    clearspcl = re.sub(r'\n', r' ', text)
    clearspcl = " ".join([word for word in text.split() if word not in stopWords])
    return text


def lemmatize_words(text):
    lemmatized_words = [lemmatizer.lemmatize(word, 'v') for word in text.split()]
    return ('  '.join(lemmatized_words))


# A function dataprocessing is defined where all other functions are included
def dataprocessing(x):
    x = rmvhtmltags(x)
    x = remove_urls(x)
    x = x.lower()
    x = rmvspclcharacter(x)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    x = lemmatize_words(x)

    x = ' '.join([re.sub(r'\d+', '', i) for i in word_tokenize(x)])
    return x

In [22]:
reviews_train['review'] = reviews_train['review'].map(lambda x: dataprocessing(x))
reviews_test['review'] = reviews_test['review'].map(lambda x: dataprocessing(x))

In [23]:
reviews_train[:5]

Unnamed: 0,review,Label,file
21492,copy vhs think the television network play yea...,-1,6844_1.txt
9488,extremely rat point superb extremely please fi...,1,7290_10.txt
16933,don t know force sit thing film wasn t worth m...,-1,2740_1.txt
12604,mt little sister self proclaim horror movie bu...,-1,10094_1.txt
8222,personally see disney movies lifetime absolute...,1,6150_7.txt


# Model Training

In [24]:
# separating them into lists
y_train_label = reviews_train['Label'].tolist()
x_train_review = reviews_train['review'].tolist()

y_test_label = reviews_test['Label'].tolist()
x_test_review = reviews_test['review'].tolist()

In [25]:
# Split to train and test data
X_train, X_test, y_train, y_test = train_test_split(x_train_review, y_train_label, test_size=0.3, random_state=42)

In [27]:
# Logistic Regression 
model_lr = Pipeline([('vect', CountVectorizer()),
                  ('clf', LogisticRegression()), ])
model_lr = model_lr.fit(X_train, y_train)

print("Cross Validation for Logistic regression on Count Vectorizer")
cross_val_score(model_lr, X_train, y_train, cv=3)

Cross Validation for Logistic regression on Count Vectorizer


array([0.86630099, 0.86696383, 0.87004972])

In [28]:
# Support Vector Machine 
model_svm = Pipeline([('vect', CountVectorizer()),
                     ('clf', LinearSVC()), ])
model_svm = model_svm.fit(X_train, y_train)

print("Cross Validation for SVM on Count Vectorizer")
cross_val_score(model_svm, X_train, y_train, cv=3)

Cross Validation for SVM on Count Vectorizer


array([0.8453891 , 0.84587691, 0.85067718])

In [29]:
# Naive Bayes
model_nb = Pipeline([('vect', CountVectorizer()),
                    ('clf', MultinomialNB()), ])
model_nb = model_nb.fit(X_train, y_train)

print("Cross Validation for Naive Bayes on Count Vectorizer and TFID Transformer")
cross_val_score(model_nb, X_train, y_train, cv=3)

Cross Validation for Naive Bayes on Count Vectorizer and TFID Transformer


array([0.83956119, 0.85393451, 0.84759129])

In [33]:
def model_training(training_model):
    if training_model == 'LR':
        print("Training Logistic regression model using bag of words")
        lrbow = Pipeline([('vect', CountVectorizer()), ('clf', LogisticRegression()), ])
        return lrbow

    elif training_model == 'SVM':
        print("Training SVM model using bag of words")
        svmbow = Pipeline([('vect', CountVectorizer()), ('clf', LinearSVC()), ])
        return svmbow

    elif training_model == 'NB':
        print("Training NB model using bag of words")
        nbbow = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB()), ])
        return nbbow
    
def model_fitting_mix():
    training_model = ['LR', 'SVM', 'NB']
    for i in training_model:
        model_mix = model_training(i).fit(X_train, y_train)
        predicted_mix = model_mix.predict(X_test)
        accuracy_mix = np.mean(predicted_mix == y_test)
        scores_mix = cross_val_score(model_mix, X_train, y_train, cv=3)
        print("Accuracy on testing dataset is: {0:0.3f}".format(accuracy_mix))
        print("Accuracy on training dataset is: {0:0.3f}".format(scores_mix.mean()))

In [34]:
model_fitting_mix()

Training Logistic regression model using bag of words
Accuracy on testing dataset is: 0.869
Accuracy on training dataset is: 0.868
Training SVM model using bag of words
Accuracy on testing dataset is: 0.853
Accuracy on training dataset is: 0.847
Training NB model using bag of words
Accuracy on testing dataset is: 0.855
Accuracy on training dataset is: 0.847


# Model Opitimization

In [37]:
text_pipe_logit = make_pipeline(
    CountVectorizer(),
    # for some reason n_jobs > 1 won't work
    # with GridSearchCV's n_jobs > 1
    LogisticRegression(solver="lbfgs", n_jobs=1, random_state=7),
)

text_pipe_logit.fit(X_train, y_train)
print(text_pipe_logit.score(X_test, y_test))

0.8689333333333333


In [38]:
param_grid_logit = {"logisticregression__C": np.logspace(-5, 0, 6)}
grid_logit = GridSearchCV(
    text_pipe_logit, param_grid_logit, return_train_score=True, cv=3, n_jobs=-1
)

grid_logit.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('countvectorizer', CountVectorizer()),
                                       ('logisticregression',
                                        LogisticRegression(n_jobs=1,
                                                           random_state=7))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00])},
             return_train_score=True)

In [39]:
# Best C and cv-score using this hyperparameter
grid_logit.best_params_, grid_logit.best_score_

({'logisticregression__C': 0.1}, 0.8738285216278484)

In [40]:
grid_logit.score(X_test, y_test)

0.8782666666666666