In [1]:
import numpy as np
import pandas as pd
import warnings
import os
import pickle

from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the data

In [2]:
# Load the reviews and labels
# 1 for positive reviews and 0 for negative reviews
reviews = []
labels = []
path="aclImdb/train"
for label_type in ["pos", "neg"]:
    dir_path = os.path.join(path, label_type)
    for filename in os.listdir(dir_path):
        if filename.endswith(".txt"):
            with open(os.path.join(dir_path, filename), "r", encoding="utf-8") as f:
                review = f.read()
                reviews.append(review)
                labels.append(1 if label_type == "pos" else 0)

## Data Preprocessing

In [3]:
# Convert the reviews and labels to numpy arrays
reviews = np.array(reviews)
labels = np.array(labels)

In [4]:
reviews[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [5]:
labels[0]

1

In [6]:
#Checking balance of dataset
pd.Series(labels).value_counts()

0    12500
1    12500
dtype: int64

In [7]:
# Define a function to perform text preprocessing
def preprocess_text(text):
    # Remove HTML tags
    cleaned_text = re.sub('<[^>]*>', '', text)

    # Remove punctuation
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    # Convert to lowercase
    cleaned_text = cleaned_text.lower()

    #Lemmatize and remove stopwords
    lemmatizer=WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    cleaned_text=' '.join(lemmatizer.lemmatize(word) for word in cleaned_text.split()
                                                        if word not in stop_words)

    return cleaned_text

# Preprocess the movie reviews
cleaned_reviews = []
for review in reviews:
    cleaned_review = preprocess_text(review)
    cleaned_reviews.append(cleaned_review)

In [8]:
# Split the dataset into training, validation, and testing sets
train_reviews, test_reviews, train_labels, test_labels = train_test_split(cleaned_reviews, labels, test_size=0.15, random_state=42, shuffle=True, stratify=labels)
train_reviews, val_reviews, train_labels, val_labels = train_test_split(train_reviews, train_labels, test_size=0.15, random_state=42, shuffle=True, stratify=train_labels)

## Testing ML models

In [9]:
pipeline = make_pipeline(CountVectorizer(),TfidfTransformer())

X_train = pipeline.fit_transform(train_reviews)
X_val = pipeline.transform(val_reviews)
X_test = pipeline.transform(test_reviews)

In [22]:
import time
models = {
    "MultinomialNB": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(),
    "XGBoost Classifier": XGBClassifier()
}

In [23]:
from sklearn.model_selection import GridSearchCV
model_list=[]

# Define the parameter grid for each model
nb_param_grid = {'alpha': [0.1, 0.01, 0.001, 0.0001]}

lr_param_grid = {'C': [0.1, 1, 10, 100],
                 'penalty': ['l1', 'l2']}

svm_param_grid = {'C': [0.1, 1, 10, 100]}

rf_param_grid = {'n_estimators': [50, 100, 200, 300],
                 'max_depth': [None, 5, 10, 20],
                 'min_samples_split': [2, 5, 10]}

xgb_param_grid = {'n_estimators': [50, 100, 200, 300],
                  'max_depth': [3, 5, 10],
                  'learning_rate': [0.1, 0.01, 0.001]}

# Define the parameter grids for all models in a dictionary
param_grids = {'MultinomialNB': nb_param_grid,
               'Logistic Regression': lr_param_grid,
               'SVM': svm_param_grid,
               'Random Forest': rf_param_grid,
               'XGBoost Classifier': xgb_param_grid}

for model_name,param in param_grids.items():

    start = time.time()
    algo = models[model_name]
    grid_search = GridSearchCV(estimator=algo,param_grid=param,cv=5,n_jobs=-1,scoring='accuracy')
    grid_search.fit(X_train,train_labels)
    best_para = grid_search.best_params_
    algo.set_params(**best_para)
    algo.fit(X_train,train_labels)
    y_pred = algo.predict(X_val)
    end = time.time()

    print(f"{model_name}")
    print(f"Time taken: {end-start}")
    model_list.append([model_name,accuracy_score(val_labels,y_pred),precision_score(val_labels,y_pred),recall_score(val_labels,y_pred),f1_score(val_labels,y_pred),best_para])
    print("----------------------------------------------------")

MultinomialNB
Time taken: 6.968690395355225
----------------------------------------------------
Logistic Regression
Time taken: 20.587644577026367
----------------------------------------------------
SVM
Time taken: 1976.6532769203186
----------------------------------------------------
Random Forest
Time taken: 4465.19069314003
----------------------------------------------------
XGBoost Classifier
Time taken: 5181.1491086483
----------------------------------------------------


In [24]:
models_df = pd.DataFrame(data=model_list,columns=['Model name','Accuracy','Precision','Recall','f1-score','Best Parameters'])
models_df

Unnamed: 0,Model name,Accuracy,Precision,Recall,f1-score,Best Parameters
0,MultinomialNB,0.8601,0.86514,0.853199,0.859128,{'alpha': 0.1}
1,Logistic Regression,0.896801,0.888275,0.907779,0.897921,"{'C': 10, 'penalty': 'l2'}"
2,SVM,0.898996,0.890663,0.909661,0.900062,{'C': 1}
3,Random Forest,0.861669,0.868842,0.851945,0.86031,"{'max_depth': None, 'min_samples_split': 5, 'n..."
4,XGBoost Classifier,0.863237,0.856527,0.872647,0.864512,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est..."


## Choosing Logistic Regression for our analysis

In [10]:
model = LogisticRegression(C=10,penalty='l2')
model.fit(X_train,train_labels)
prediction = model.predict(X_test)

In [11]:
print(classification_report(prediction,test_labels))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88      1825
           1       0.89      0.87      0.88      1925

    accuracy                           0.88      3750
   macro avg       0.88      0.88      0.88      3750
weighted avg       0.88      0.88      0.88      3750



In [12]:
with open('ml_model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [13]:
with open('preprocessor.pickle', 'wb') as f:
    pickle.dump(pipeline, f)