In [33]:
# Import libs
import warnings
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

warnings.filterwarnings(action="ignore")
import os
import re
import nltk
import sys

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import en_core_web_sm

en_core_web_sm.load()
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset

In [2]:
# Data file path
PATH = "E:\\NLP\\Final\\aclImdb"
sys.getdefaultencoding()

'utf-8'

In [3]:
# Separating train files to positive and negative
posFiles = [x for x in os.listdir(PATH + "/train/pos/") if x.endswith(".txt")]
negFiles = [x for x in os.listdir(PATH + "/train/neg/") if x.endswith(".txt")]

In [4]:
# Separating test files to positive and negative
test_pos_Files = [x for x in os.listdir(PATH + "/test/pos/") if x.endswith(".txt")]
test_neg_Files = [x for x in os.listdir(PATH + "/test/neg/") if x.endswith(".txt")]

In [5]:
P_train = []
N_train = []

for nfile in negFiles:
    with open(PATH + "/train/neg/" + nfile, encoding="utf-8") as f:
        N_train.append(f.read())
        
for pfile in posFiles:
    with open(PATH + "/train/pos/" + pfile, encoding="utf-8") as f:
        P_train.append(f.read())

In [6]:
P_test = []
N_test = []

for ptestfile in test_pos_Files:
    with open(PATH + "/test/pos/" + ptestfile, encoding="utf-8") as f:
        P_test.append(f.read())
        
for ntestfile in test_neg_Files:
    with open(PATH + "/test/neg/" + ntestfile, encoding="utf-8") as f:
        N_test.append(f.read())

In [7]:
reviews_train = pd.concat([
    pd.DataFrame({"review": P_train, "Label": 1, "file": posFiles}),
    pd.DataFrame({"review": N_train, "Label": -1, "file": negFiles})
], ignore_index=True).sample(frac=1, random_state=1)

reviews_test = pd.concat([
    pd.DataFrame({"review": P_test, "Label": 1, "file": test_pos_Files}),
    pd.DataFrame({"review": N_test, "Label": -1, "file": test_neg_Files})
], ignore_index=True).sample(frac=1, random_state=1)

In [8]:
reviews_train

Unnamed: 0,review,Label,file
21492,"I have copy of this on VHS, I think they (The ...",-1,6844_1.txt
9488,After several extremely well ratings to the po...,1,7290_10.txt
16933,I still don't know why I forced myself to sit ...,-1,2740_1.txt
12604,Mt little sister and I are self-proclaimed hor...,-1,10094_1.txt
8222,I have personally seen many Disney movies in m...,1,6150_7.txt
...,...,...,...
10955,Diane Keaton gave an outstanding performance i...,1,8610_10.txt
17289,"This has to be creepiest, most twisted holiday...",-1,3060_1.txt
5192,"Do not expect a depiction of the ""truth"". Howe...",1,3423_7.txt
12172,The League of Gentlemen is one of the funniest...,1,9706_10.txt


In [9]:
reviews_test

Unnamed: 0,review,Label,file
21492,A movie theater with a bad history of past gru...,-1,6844_2.txt
9488,"""Here On Earth"" is a surprising beautiful roma...",1,7290_10.txt
16933,I just watched Descent. Gawds what an awful mo...,-1,2740_3.txt
12604,In a nutshell the movie is about a gang war in...,-1,10094_4.txt
8222,"Instead of watching the recycled history of ""P...",1,6150_7.txt
...,...,...,...
10955,This movie is a fascinating drama about the Ma...,1,8610_8.txt
17289,"It's too kind to call this a ""fictionalized"" a...",-1,3060_3.txt
5192,I was unsure of this movie before renting and ...,1,3423_9.txt
12172,"Just got out of an advance screening, and wow ...",1,9706_7.txt


# Data Preprocessing

Here we remove html tags, urls, special characters,Lemmanatize-
which is better than stemming as it gives a proper word after cutting

In [10]:
stopWords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [11]:
def rmvhtmltags(text):
    remreg = re.compile('<.*?>')
    cleartext = re.sub(remreg, '', text)
    return cleartext

def remove_urls(vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return (vTEXT)

def rmvspclcharacter(text):
    clearspcl = re.sub(r'[^A-Za-z0-9\s.]', r'', str(text).lower())
    clearspcl = re.sub(r'\n', r' ', text)
    clearspcl = " ".join([word for word in text.split() if word not in stopWords])
    return clearspcl


def lemmatize_words(text):
    lemmatized_words = [lemmatizer.lemmatize(word, 'v') for word in text.split()]
    return ('  '.join(lemmatized_words))


# A function dataprocessing is defined where all other functions are included
def dataprocessing(x):
    x = rmvhtmltags(x)
    x = remove_urls(x)
    x = x.lower()
    x = rmvspclcharacter(x)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    x = lemmatize_words(x)

    x = ' '.join([re.sub(r'\d+', '', i) for i in word_tokenize(x)])
    return x

In [12]:
reviews_train['review'] = reviews_train['review'].map(lambda x: dataprocessing(x))
reviews_test['review'] = reviews_test['review'].map(lambda x: dataprocessing(x))

In [13]:
reviews_train[:5]

Unnamed: 0,review,Label,file
21492,copy vhs think the television network play yea...,-1,6844_1.txt
9488,extremely rat point superb extremely please fi...,1,7290_10.txt
16933,know force sit thing film worth memorex dvd r ...,-1,2740_1.txt
12604,mt little sister self proclaim horror movie bu...,-1,10094_1.txt
8222,personally see disney movies lifetime absolute...,1,6150_7.txt


# Model Training

In [14]:
# separating them into lists
y_train_label = reviews_train['Label'].tolist()
x_train_review = reviews_train['review'].tolist()

y_test_label = reviews_test['Label'].tolist()
x_test_review = reviews_test['review'].tolist()

In [15]:
# Split to train and test data
X_train, X_test, y_train, y_test = train_test_split(x_train_review, y_train_label, test_size=0.3, random_state=42)

In [24]:
# Logistic Regression 
model_lr = Pipeline([('tfidf', TfidfVectorizer()),
                  ('clf', LogisticRegression()), ])
model_lr = model_lr.fit(X_train, y_train)

Cross Validation for Logistic regression on Count Vectorizer


array([0.87418581, 0.88033602, 0.87793588])

In [25]:
# Support Vector Machine 
model_svm = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()), ])
model_svm = model_svm.fit(X_train, y_train)

Cross Validation for SVM on Count Vectorizer


array([0.87829962, 0.87827876, 0.88033602])

In [23]:
# Naive Bayes with TfidfVectorizer
model_nb_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit the model
model_nb_tfidf = model_nb_tfidf.fit(X_train, y_train)

Cross Validation for Naive Bayes on TfidfVectorizer


array([0.85344532, 0.85993485, 0.85839191])

In [36]:
def model_training(training_model):
    if training_model == 'LR':
        print("Training Logistic regression model using bag of words")
        lrbow = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression()), ])
        return lrbow

    elif training_model == 'SVM':
        print("Training SVM model using bag of words")
        svmbow = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC()), ])
        return svmbow

    elif training_model == 'NB':
        print("Training NB model using bag of words")
        nbbow = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB()), ])
        return nbbow
    
def model_fitting_mix():
    training_model = ['LR', 'SVM', 'NB']
    for i in training_model:
        model_mix = model_training(i).fit(X_train, y_train)
        predicted_mix = model_mix.predict(X_test)

        # Accuracy
        accuracy_mix = accuracy_score(y_test, predicted_mix)

        # Precision, Recall, and F1 Score
        precision_mix = precision_score(y_test, predicted_mix, average='weighted')
        recall_mix = recall_score(y_test, predicted_mix, average='weighted')
        f1_mix = f1_score(y_test, predicted_mix, average='weighted')

        # Cross-validation metrics
        scores_mix = cross_val_score(model_mix, X_train, y_train, cv=3)
        print(f"Metrics for model '{i}':")
        print("Accuracy on testing dataset:", accuracy_mix)
        print("Precision on testing dataset:", precision_mix)
        print("Recall on testing dataset:", recall_mix)
        print("F1 Score on testing dataset:", f1_mix)
        print("Mean Accuracy on training dataset (cross-validation):", scores_mix.mean())
        print("\n")

In [37]:
model_fitting_mix()

Training Logistic regression model using bag of words
Metrics for model 'LR':
Accuracy on testing dataset: 0.8854666666666666
Precision on testing dataset: 0.8855887389603948
Recall on testing dataset: 0.8854666666666666
F1 Score on testing dataset: 0.8854483014528143
Mean Accuracy on training dataset (cross-validation): 0.8774859028626015


Training SVM model using bag of words
Metrics for model 'SVM':
Accuracy on testing dataset: 0.8790666666666667
Precision on testing dataset: 0.8790691201803766
Recall on testing dataset: 0.8790666666666667
F1 Score on testing dataset: 0.8790643767903418
Mean Accuracy on training dataset (cross-validation): 0.8789714669625178


Training NB model using bag of words
Metrics for model 'NB':
Accuracy on testing dataset: 0.8598666666666667
Precision on testing dataset: 0.8606280031798119
Recall on testing dataset: 0.8598666666666667
F1 Score on testing dataset: 0.8598204760761907
Mean Accuracy on training dataset (cross-validation): 0.8572573606880088




# Model Opitimization

In [None]:
# Define the parameter grid to search
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],  # Adjust the number of features
    'clf__C': [0.1, 1, 10],  # Regularization parameter for model
}

In [46]:
# Logistic Regression with TfidfVectorizer
model_lr_tuned = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Create GridSearchCV object
grid_search_lr = GridSearchCV(model_lr_tuned, param_grid, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_lr.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for LR model")
print("Best Parameters: ", grid_search_lr.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_lr.best_score_)
print("Accuracy on testing dataset:", grid_search_lr.score(X_test, y_test))

Hyperparameter tuning using grid search for LR model
Best Parameters:  {'clf__C': 1, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8770858897995112
Accuracy on testing dataset: 0.8838666666666667


In [47]:
# SVM with TfidfVectorizer
model_svm_tuned = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Create GridSearchCV object
grid_search_svm = GridSearchCV(model_svm_tuned, param_grid, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_svm.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for SVM model")
print("Best Parameters (SVM): ", grid_search_svm.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_svm.best_score_)
print("Accuracy on testing dataset: ", grid_search_svm.score(X_test, y_test))

Hyperparameter tuning using grid search for SVM model
Best Parameters (SVM):  {'clf__C': 0.1, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8779430620483275
Accuracy on testing dataset:  0.8841333333333333


In [48]:
param_grid_nb = {
    'tfidf__max_features': [1000, 5000, 10000],
    'clf__alpha': [0.1, 0.5, 1.0],
}

# Naive Bayes with TfidfVectorizer
model_nb_tuned = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Create GridSearchCV object
grid_search_nb = GridSearchCV(model_nb_tuned, param_grid_nb, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_nb.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for NB  model")
print("Best Parameters (Naive Bayes): ", grid_search_nb.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_nb.best_score_)
print("Accuracy on testing dataset: ", grid_search_nb.score(X_test, y_test))

Hyperparameter tuning using grid search for NB  model
Best Parameters (Naive Bayes):  {'clf__alpha': 1.0, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8527430618837655
Accuracy on testing dataset:  0.8538666666666667


# Prediction (Deploy)

In [65]:
def predict_sentiment(model, review, threshold_low=0.4, threshold_high=0.7):
    # Preprocess user input
    preprocessed_review = dataprocessing(user_review)
    
    # Use the trained model to predict sentiment
    confidence_scores = model.predict_proba([preprocessed_review])[0]
    
    # Extract probabilities for each class
    probability_negative = confidence_scores[0]
    probability_positive = confidence_scores[1]

    # Determine sentiment based on confidence scores and thresholds
    if probability_positive >= threshold_high:
        sentiment = "Positive"
    elif probability_positive <= threshold_low:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return sentiment, confidence_scores

# Example usage
user_review = "This is just a normal movie!"
predicted_sentiment, confidence = predict_sentiment(model_lr, user_review)

print("Predicted Sentiment:", predicted_sentiment)
print("Confidence Scores:", confidence)

Predicted Sentiment: Neutral
Confidence Scores: [0.38121501 0.61878499]
