In [1]:
# Import libs
import warnings
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

warnings.filterwarnings(action="ignore")
import os
import re
import nltk
import sys

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import en_core_web_sm

en_core_web_sm.load()
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset

In [2]:
# Data file path
PATH = "E:\\NLP\\Final\\aclImdb"
sys.getdefaultencoding()

'utf-8'

In [3]:
# Separating train files to positive and negative
posFiles = [x for x in os.listdir(PATH + "/train/pos/") if x.endswith(".txt")]
negFiles = [x for x in os.listdir(PATH + "/train/neg/") if x.endswith(".txt")]

In [4]:
# Separating test files to positive and negative
test_pos_Files = [x for x in os.listdir(PATH + "/test/pos/") if x.endswith(".txt")]
test_neg_Files = [x for x in os.listdir(PATH + "/test/neg/") if x.endswith(".txt")]

In [5]:
P_train = []
N_train = []

for nfile in negFiles:
    with open(PATH + "/train/neg/" + nfile, encoding="utf-8") as f:
        N_train.append(f.read())
        
for pfile in posFiles:
    with open(PATH + "/train/pos/" + pfile, encoding="utf-8") as f:
        P_train.append(f.read())

In [6]:
P_test = []
N_test = []

for ptestfile in test_pos_Files:
    with open(PATH + "/test/pos/" + ptestfile, encoding="utf-8") as f:
        P_test.append(f.read())
        
for ntestfile in test_neg_Files:
    with open(PATH + "/test/neg/" + ntestfile, encoding="utf-8") as f:
        N_test.append(f.read())

In [7]:
reviews_train = pd.concat([
    pd.DataFrame({"review": P_train, "Label": 1, "file": posFiles}),
    pd.DataFrame({"review": N_train, "Label": -1, "file": negFiles})
], ignore_index=True).sample(frac=1, random_state=1)

reviews_test = pd.concat([
    pd.DataFrame({"review": P_test, "Label": 1, "file": test_pos_Files}),
    pd.DataFrame({"review": N_test, "Label": -1, "file": test_neg_Files})
], ignore_index=True).sample(frac=1, random_state=1)

In [8]:
reviews_train

Unnamed: 0,review,Label,file
21492,"I have copy of this on VHS, I think they (The ...",-1,6844_1.txt
9488,After several extremely well ratings to the po...,1,7290_10.txt
16933,I still don't know why I forced myself to sit ...,-1,2740_1.txt
12604,Mt little sister and I are self-proclaimed hor...,-1,10094_1.txt
8222,I have personally seen many Disney movies in m...,1,6150_7.txt
...,...,...,...
10955,Diane Keaton gave an outstanding performance i...,1,8610_10.txt
17289,"This has to be creepiest, most twisted holiday...",-1,3060_1.txt
5192,"Do not expect a depiction of the ""truth"". Howe...",1,3423_7.txt
12172,The League of Gentlemen is one of the funniest...,1,9706_10.txt


In [9]:
reviews_test

Unnamed: 0,review,Label,file
21492,A movie theater with a bad history of past gru...,-1,6844_2.txt
9488,"""Here On Earth"" is a surprising beautiful roma...",1,7290_10.txt
16933,I just watched Descent. Gawds what an awful mo...,-1,2740_3.txt
12604,In a nutshell the movie is about a gang war in...,-1,10094_4.txt
8222,"Instead of watching the recycled history of ""P...",1,6150_7.txt
...,...,...,...
10955,This movie is a fascinating drama about the Ma...,1,8610_8.txt
17289,"It's too kind to call this a ""fictionalized"" a...",-1,3060_3.txt
5192,I was unsure of this movie before renting and ...,1,3423_9.txt
12172,"Just got out of an advance screening, and wow ...",1,9706_7.txt


# Data Preprocessing

Here we remove html tags, urls, special characters,Lemmanatize-
which is better than stemming as it gives a proper word after cutting

In [10]:
stopWords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [11]:
def rmvhtmltags(text):
    remreg = re.compile('<.*?>')
    cleartext = re.sub(remreg, '', text)
    return cleartext

def remove_urls(vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return (vTEXT)

def rmvspclcharacter(text):
    clearspcl = re.sub(r'[^A-Za-z0-9\s.]', r'', str(text).lower())
    clearspcl = re.sub(r'\n', r' ', text)
    clearspcl = " ".join([word for word in text.split() if word not in stopWords])
    return clearspcl


def lemmatize_words(text):
    lemmatized_words = [lemmatizer.lemmatize(word, 'v') for word in text.split()]
    return ('  '.join(lemmatized_words))


# A function dataprocessing is defined where all other functions are included
def dataprocessing(x):
    x = rmvhtmltags(x)
    x = remove_urls(x)
    x = x.lower()
    x = rmvspclcharacter(x)
    x = remove_stopwords(x)
    x = strip_punctuation(x)
    x = strip_multiple_whitespaces(x)
    x = lemmatize_words(x)

    x = ' '.join([re.sub(r'\d+', '', i) for i in word_tokenize(x)])
    return x

In [12]:
reviews_train['review'] = reviews_train['review'].map(lambda x: dataprocessing(x))
reviews_test['review'] = reviews_test['review'].map(lambda x: dataprocessing(x))

In [13]:
reviews_train[:5]

Unnamed: 0,review,Label,file
21492,copy vhs think the television network play yea...,-1,6844_1.txt
9488,extremely rat point superb extremely please fi...,1,7290_10.txt
16933,know force sit thing film worth memorex dvd r ...,-1,2740_1.txt
12604,mt little sister self proclaim horror movie bu...,-1,10094_1.txt
8222,personally see disney movies lifetime absolute...,1,6150_7.txt


# Model Training

In [14]:
# separating them into lists
y_train_label = reviews_train['Label'].tolist()
x_train_review = reviews_train['review'].tolist()

y_test_label = reviews_test['Label'].tolist()
x_test_review = reviews_test['review'].tolist()

In [15]:
# Split to train and test data
X_train, X_test, y_train, y_test = train_test_split(x_train_review, y_train_label, test_size=0.3, random_state=42)

## Logistic Regression

In [16]:
# Logistic Regression 
model_lr = Pipeline([('tfidf', TfidfVectorizer()),
                  ('clf', LogisticRegression()), ])
model_lr = model_lr.fit(X_train, y_train)

print("Cross Validation for Logistic Regression on TF-IDF")
cross_val_score(model_lr, X_train, y_train, cv=3)

Cross Validation for Logistic Regression on TF-IDF


array([0.87418581, 0.88033602, 0.87793588])

## Support Vector Machine

In [17]:
# Support Vector Machine 
model_svm_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()), ])
model_svm_tfidf = model_svm_tfidf.fit(X_train, y_train)

print("Cross Validation for Support Vector Machine on TF-IDF")
cross_val_score(model_svm_tfidf, X_train, y_train, cv=3)

Cross Validation for Support Vector Machine on TF-IDF


array([0.87829962, 0.87827876, 0.88033602])

## Naive Bayes 

In [18]:
# Naive Bayes with Bag of Words
model_nb_bow = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Fit the model
model_nb_bow = model_nb_bow.fit(X_train, y_train)

print("Cross Validation for Naive Bayes on Bag of Words")
cross_val_score(model_nb_bow, X_train, y_train, cv=3)

Cross Validation for Naive Bayes on Bag of Words


array([0.8416181 , 0.85376307, 0.85222013])

## LSTM

In [19]:
# Encode the labels
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [20]:
# Tokenize the text data
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(X_train)
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [21]:
# Pad sequences to ensure uniform length
max_len = 200
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_len)
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_len)

In [22]:
# Define the LSTM model
embedding_dim = 100
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(100, dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
batch_size = 64
epochs = 5
model_lstm.fit(X_train_pad, encoded_y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_pad, encoded_y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2881aea2f40>

In [23]:
# Evaluate the model
loss, accuracy = model_lstm.evaluate(X_test_pad, encoded_y_test, verbose=1)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8604000210762024


In [24]:
def model_training(training_model):
    if training_model == 'LR':
        print("Training Logistic regression model using TF-IDF")
        lrbow = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression()), ])
        return lrbow

    elif training_model == 'SVM':
        print("Training SVM model using TF-IDF")
        svmbow = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC()), ])
        return svmbow

    elif training_model == 'NB':
        print("Training NB model using BOW")
        nbbow = Pipeline([('bow', CountVectorizer()), ('clf', MultinomialNB()), ])
        return nbbow
    
def model_training_lstm():
    print("Training LSTM model")
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=100, input_length=200))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(units=100, dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def model_fitting_mix():
    training_models = ['LR', 'SVM', 'NB', 'LSTM']
    for model_name in training_models:
        if model_name == 'LSTM':
            batch_size = 64
            epochs = 5
            lstm_model = model_training_lstm()
            lstm_model.fit(X_train_pad, encoded_y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_pad, encoded_y_test))
            lstm_scores = lstm_model.evaluate(X_test_pad, encoded_y_test, verbose=0)
            
            y_pred = lstm_model.predict(X_test_pad)
            y_pred_binary = (y_pred > 0.5).astype('int32')  # Convert probabilities to binary predictions

            precision = precision_score(encoded_y_test, y_pred_binary)
            recall = recall_score(encoded_y_test, y_pred_binary)
            f1 = f1_score(encoded_y_test, y_pred_binary)

            print(f"Metrics for model 'LSTM':")
            print(f"Accuracy on testing dataset: {lstm_scores[1]:.6f}")
            print(f"Precision on testing dataset: {precision:.6f}")
            print(f"Recall on testing dataset: {recall:.6f}")
            print(f"F1 Score on testing dataset: {f1:.6f}")
        else:
            model = model_training(model_name).fit(X_train, y_train)
            predicted = model.predict(X_test)

            # Calculate metrics
            accuracy = accuracy_score(y_test, predicted)
            precision = precision_score(y_test, predicted, average='weighted')
            recall = recall_score(y_test, predicted, average='weighted')
            f1 = f1_score(y_test, predicted, average='weighted')
            scores = cross_val_score(model, X_train, y_train, cv=3)

            # Print metrics
            print(f"Metrics for model '{model_name}':")
            print(f"Accuracy on testing dataset: {accuracy:.6f}")
            print(f"Precision on testing dataset: {precision:.6f}")
            print(f"Recall on testing dataset: {recall:.6f}")
            print(f"F1 Score on testing dataset: {f1:.6f}")
            print(f"Mean Accuracy on training dataset (cross-validation): {scores.mean():.6f}")
        print("\n")

In [25]:
model_fitting_mix()

Training Logistic regression model using TF-IDF
Metrics for model 'LR':
Accuracy on testing dataset: 0.885467
Precision on testing dataset: 0.885589
Recall on testing dataset: 0.885467
F1 Score on testing dataset: 0.885448
Mean Accuracy on training dataset (cross-validation): 0.877486


Training SVM model using TF-IDF
Metrics for model 'SVM':
Accuracy on testing dataset: 0.879067
Precision on testing dataset: 0.879069
Recall on testing dataset: 0.879067
F1 Score on testing dataset: 0.879064
Mean Accuracy on training dataset (cross-validation): 0.878971


Training NB model using BOW
Metrics for model 'NB':
Accuracy on testing dataset: 0.855867
Precision on testing dataset: 0.856675
Recall on testing dataset: 0.855867
F1 Score on testing dataset: 0.855815
Mean Accuracy on training dataset (cross-validation): 0.849200


Training LSTM model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Metrics for model 'LSTM':
Accuracy on testing dataset: 0.858533
Precision on testing dataset: 0.84176

# Model Opitimization

In [26]:
# Define the parameter grid to search
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],  # Adjust the number of features
    'clf__C': [0.1, 1, 10],  # Regularization parameter for model
}

In [22]:
# Logistic Regression with TfidfVectorizer
model_lr_tuned = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Create GridSearchCV object
grid_search_lr = GridSearchCV(model_lr_tuned, param_grid, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_lr.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for LR model")
print("Best Parameters: ", grid_search_lr.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_lr.best_score_)
print("Accuracy on testing dataset:", grid_search_lr.score(X_test, y_test))

Hyperparameter tuning using grid search for LR model
Best Parameters:  {'clf__C': 1, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8770858897995112
Accuracy on testing dataset: 0.8838666666666667


In [23]:
# SVM with TfidfVectorizer
model_svm_tuned = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Create GridSearchCV object
grid_search_svm = GridSearchCV(model_svm_tuned, param_grid, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_svm.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for SVM model")
print("Best Parameters (SVM): ", grid_search_svm.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_svm.best_score_)
print("Accuracy on testing dataset: ", grid_search_svm.score(X_test, y_test))

Hyperparameter tuning using grid search for SVM model
Best Parameters (SVM):  {'clf__C': 0.1, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8779430620483275
Accuracy on testing dataset:  0.8841333333333333


In [24]:
param_grid_nb = {
    'tfidf__max_features': [1000, 5000, 10000],
    'clf__alpha': [0.1, 0.5, 1.0],
}

# Naive Bayes with Bag of Words
model_nb_tuned = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Create GridSearchCV object
grid_search_nb = GridSearchCV(model_nb_tuned, param_grid_nb, cv=3, scoring='accuracy')

# Fit the grid search to the data
grid_search_nb.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using grid search for NB  model")
print("Best Parameters (Naive Bayes): ", grid_search_nb.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", grid_search_nb.best_score_)
print("Accuracy on testing dataset: ", grid_search_nb.score(X_test, y_test))

Hyperparameter tuning using grid search for NB  model
Best Parameters (Naive Bayes):  {'clf__alpha': 1.0, 'tfidf__max_features': 10000}
Mean Accuracy on training dataset (cross-validation):  0.8527430618837655
Accuracy on testing dataset:  0.8538666666666667


In [47]:
# Define a function to create the LSTM model
def create_lstm_model(units=100, dropout_rate=0.2):
  model = Sequential()
  model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
  model.add(LSTM(units, dropout=dropout_rate))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

# Create a KerasClassifier
lstm_model = KerasClassifier(build_fn=create_lstm_model, verbose=0)

# Define the parameter grid
param_grid_lstm = {
  'units': [50, 100, 150],
  'dropout_rate': [0.2, 0.3, 0.4],
  'epochs': [5, 10, 15],
  'batch_size': [32, 64, 128]
}

# Perform RandomizedSearchCV
random_search_lstm = RandomizedSearchCV(estimator=lstm_model, param_distributions=param_grid_lstm, cv=3, scoring='accuracy', n_iter=10)
random_search_lstm.fit(X_train_pad, encoded_y_train)

# Print the best parameters and their corresponding accuracy
print("Hyperparameter tuning using randomized search for LSTM model")
print("Best Parameters (LSTM): ", random_search_lstm.best_params_)
print("Mean Accuracy on training dataset (cross-validation): ", random_search_lstm.best_score_)
print("Accuracy on testing dataset: ", random_search_lstm.score(X_test_pad, encoded_y_test))

Hyperparameter tuning using randomized search for LSTM model
Best Parameters (LSTM):  {'units': 150, 'epochs': 5, 'dropout_rate': 0.4, 'batch_size': 64}
Mean Accuracy on training dataset (cross-validation):  0.857714412124789
Accuracy on testing dataset:  0.8645333333333334


# Prediction (Deploy)

In [52]:
def map_to_label(prediction, threshold_low=0.4, threshold_high=0.7):
    if prediction >= threshold_high:
        return 'Positive'
    elif prediction <= threshold_low:
        return 'Negative'
    else:
        return 'Neutral'

def predict_sentiment_all_models(user_review):
    # Preprocess the review
    preprocessed_review = dataprocessing(user_review)

    # Tokenize the preprocessed review
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts([preprocessed_review])
    sequences = tokenizer.texts_to_sequences([preprocessed_review])

    # Pad sequences to ensure consistent length
    max_sequence_length = 200
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

    # Predict sentiment using LSTM model
    lstm_predicted_sentiment = model_lstm.predict(padded_sequences)[0]
    lstm_label = map_to_label(lstm_predicted_sentiment)

    # Logistic Regression with TF-IDF
    lr_tfidf_prediction = model_lr.predict_proba([user_review])[0][1]
    lr_tfidf_label = map_to_label(lr_tfidf_prediction)

    # Support Vector Machine with TF-IDF
    svm_tfidf_prediction = model_svm_tfidf.decision_function([user_review])[0]
    svm_tfidf_prediction = 1 / (1 + np.exp(-svm_tfidf_prediction))
    svm_tfidf_label = map_to_label(svm_tfidf_prediction)

    # Naive Bayes with Bag of Words
    nb_bow_prediction = model_nb_bow.predict_proba([user_review])[0][1]
    nb_bow_label = map_to_label(nb_bow_prediction)

    return {
        'Logistic Regression (TF-IDF)': {'score': '{:.3f}'.format(lr_tfidf_prediction), 'label': lr_tfidf_label},
        'Support Vector Machine (TF-IDF)': {'score': '{:.3f}'.format(svm_tfidf_prediction), 'label': svm_tfidf_label},
        'Naive Bayes (Bag of Words)': {'score': '{:.3f}'.format(nb_bow_prediction), 'label': nb_bow_label},
        'LSTM Model': {'score': '{:.3f}'.format(lstm_predicted_sentiment[0]), 'label': lstm_label}
    }

# Example usage
user_review = """
This is a normal movie!
"""
predictions = predict_sentiment_all_models(user_review)

# Print predictions for each model
for model_name, prediction_info in predictions.items():
    print(f"{model_name}: Prediction: {prediction_info['score']}, Label: {prediction_info['label']}")

Logistic Regression (TF-IDF): Prediction: 0.447, Label: Neutral
Support Vector Machine (TF-IDF): Prediction: 0.460, Label: Neutral
Naive Bayes (Bag of Words): Prediction: 0.335, Label: Negative
LSTM Model: Prediction: 0.258, Label: Negative
