In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries used in the process. 
import seaborn as sns
from matplotlib import pyplot as plt
from pprint import pprint

import os
import requests
import time

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, MaxAbsScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.feature_selection import RFE, SequentialFeatureSelector, SelectKBest, chi2
from sklearn import set_config
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from datetime import date
from datetime import datetime

# Ignore FutureWarning messages for the rest of the session so that deprecation
# notices do not clutter the cell outputs. Other warning categories are unaffected.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# User Defined functions

In [None]:
# Preprocess the date columns
# Convert each date to the number of days before the reference date 2023-07-31
# The two date fields are
# releaseDateTheaters & releaseDateStreaming

def isNaN(string):
    """Report whether ``string`` is NaN-like, i.e. does not compare equal to itself.

    Returns the result of ``string != string``: ``True`` for a float NaN and
    ``False`` for ordinary values such as text or finite numbers.
    """
    differs_from_itself = string != string
    return differs_from_itself

def numOfDays(date, reference_date="2023-07-31"):
    """Return how many days ``date`` lies before ``reference_date``.

    Args:
        date: A date string in ``YYYY-MM-DD`` format, or a missing value
            (``None`` or NaN).
        reference_date: The ``YYYY-MM-DD`` date the difference is measured
            from. Defaults to ``"2023-07-31"``.

    Returns:
        ``(reference_date - date).days`` as an int (negative when ``date`` is
        after the reference date), or ``np.nan`` when ``date`` is missing or
        is a string that does not match the expected format.

    Raises:
        ValueError: If ``reference_date`` itself is not a valid
            ``YYYY-MM-DD`` string.
    """
    date_format = "%Y-%m-%d"

    # Missing values are None or NaN; NaN is the only value that differs from itself.
    if date is None or date != date:
        return np.nan

    # Parsed outside the try block so a bad reference date is reported, not hidden.
    reference = datetime.strptime(reference_date, date_format)
    try:
        parsed = datetime.strptime(date, date_format)
    except ValueError:
        # The string is present but not a valid YYYY-MM-DD date.
        return np.nan
    return (reference - parsed).days
        
# Evaluate the model's performance
def generate_score(y_val, y_pred):
    """Print and return accuracy, precision, recall and F1 for binary predictions.

    Precision, recall and F1 are computed with ``average="binary"`` and
    ``pos_label=1``, i.e. for the class encoded as 1.

    Args:
        y_val: True labels.
        y_pred: Predicted labels, aligned with ``y_val``.

    Returns:
        A dict with the keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"F1 Score"`` holding the values that were printed.
    """
    # Imported here so the helper does not depend on a later notebook cell
    # having already run ``from sklearn import metrics``.
    from sklearn import metrics

    binary_kwargs = {"average": "binary", "pos_label": 1}
    scores = {
        "Accuracy": metrics.accuracy_score(y_val, y_pred),
        "Precision": metrics.precision_score(y_val, y_pred, **binary_kwargs),
        "Recall": metrics.recall_score(y_val, y_pred, **binary_kwargs),
        "F1 Score": metrics.f1_score(y_val, y_pred, **binary_kwargs),
    }

    for label, value in scores.items():
        print(f"{label}:", value)
    return scores
    
# Convert the values of prediction data with the respective labels
def map_pred_labels(pred):
    """Translate numeric predictions into sentiment label strings.

    Every element equal to 1 becomes 'POSITIVE'; any other value becomes
    'NEGATIVE'. The labels are returned as a NumPy array of dtype ``object``.
    """
    labels = []
    for value in pred:
        if value == 1:
            labels.append('POSITIVE')
        else:
            labels.append('NEGATIVE')
    return np.array(labels, dtype=object)

# Read the movies dataset

In [None]:
# Load the movie metadata table and report its dimensions.
movies_csv_path = '/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv'
movies_data = pd.read_csv(movies_csv_path)
movies_shape = movies_data.shape
print("The shape of the movies dataset is: ", movies_shape)
# audienceScore and runtimeMinutes are numeric values and the rest are objects


In [None]:
# Keep only the first row for each 'movieid', then renumber the index from zero.
first_row_per_movie = movies_data.drop_duplicates(subset=['movieid'], keep='first')
movies_data = first_row_per_movie.reset_index(drop=True)
print("The shape of the movies dataset after removing duplicates is: ", movies_data.shape)


In [None]:
#for i in range(len(movies_data)):
#    if isNaN(movies_data['boxOffice'][i]):
#        pass
 #   else:
#        movies_data['boxOffice'][i] = str(movies_data['boxOffice'][i]).replace('$','').replace('M','').replace('K','')

In [None]:
#movies_data['boxOffice'].replace(np.nan, '0', inplace=True)
#movies_data['boxOffice'] = pd.to_numeric(movies_data['boxOffice'])

In [None]:
# Replace both release-date columns with the day counts returned by numOfDays.
for date_column in ('releaseDateTheaters', 'releaseDateStreaming'):
    movies_data[date_column] = movies_data[date_column].apply(numOfDays)


In [None]:
# Count the missing values in each column of the movies table.
movies_data.isnull().sum()

In [None]:
# Display the first row of the movies table.
movies_data.head(1)

# Read the training dataset

In [None]:
# Load the labelled training reviews and report the table dimensions.
train_csv_path = '/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv'
train_loaded = pd.read_csv(train_csv_path)
print("The shape of the train dataset is: ", train_loaded.shape)

# Drop fully duplicated rows, since repeated rows may bias the result.
train_data_raw = train_loaded.drop_duplicates()
print("The shape of the train dataset after removing duplicates is: ", train_data_raw.shape)

In [None]:
# Display the first rows of the de-duplicated training table.
train_data_raw.head()

In [None]:
# Left-join the movie metadata onto the training reviews using 'movieid' as the key,
# so every review row is kept even when no matching movie exists.
train_data = train_data_raw.merge(movies_data, how="left", on="movieid")
print("The shape of the train dataset after merging movie data is: ", train_data.shape)

In [None]:
# Display the first row of the merged training table.
train_data.head(1)

# Read the test dataset

In [None]:
# Load the test reviews and report the table dimensions.
test_csv_path = '/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv'
test_data_raw = pd.read_csv(test_csv_path)
print("The shape of the test dataset is: ", test_data_raw.shape)

In [None]:
# Display the first rows of the raw test table.
test_data_raw.head()

In [None]:
# Left-join the movie metadata onto the test reviews using 'movieid' as the key,
# so every test row is kept even when no matching movie exists.
test_data = test_data_raw.merge(movies_data, how="left", on="movieid")
print("The shape of the test dataset after merging with movie data is: ", test_data.shape)

In [None]:
# Display the first row of the merged test table.
test_data.head(1)

# EDA

In [None]:
# Print the column names, non-null counts and dtypes of the merged training table.
train_data.info()

In [None]:
# Summary statistics for the merged training table.
train_data.describe()

In [None]:
# Count the fully duplicated rows in the merged training table.
train_data.duplicated().sum()

In [None]:
# Report how many distinct values each column of the merged training table holds.
for position, column_name in enumerate(train_data.columns):
    distinct_count = train_data.iloc[:, position].nunique()
    print("The number of unique values in column ", column_name, "", distinct_count)

# List the distinct values of the two low-cardinality columns of interest.
sentiment_values = train_data['sentiment'].unique()
rating_values = train_data['rating'].unique()
print("The unique values in column 'sentiment' is: ", sentiment_values)
print("The unique values in column 'rating' is: ", rating_values)

In [None]:
# Bar chart of how many training rows carry each sentiment label.
fig, ax = plt.subplots(figsize=(6, 4))
train_data['sentiment'].value_counts().plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Counts')
ax.set_title('Distribution of Sentiment')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()

In [None]:
# Bar chart of how many training rows fall under each movie rating.
fig, ax = plt.subplots(figsize=(6, 4))
train_data['rating'].value_counts().plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Rating')
ax.set_ylabel('Counts')
ax.set_title('Distribution of Rating')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()


In [None]:
# Bar chart of how many training rows have each isFrequentReviewer value.
fig, ax = plt.subplots(figsize=(6, 4))
train_data['isFrequentReviewer'].value_counts().plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('isFrequentReviewer')
ax.set_ylabel('Counts')
ax.set_title('Distribution of FrequentReviewer')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()

In [None]:
#plt.figure(figsize=(4,2))
#sns.boxplot(data=train_data, x="sentiment", y="rating")

In [None]:
# Box plot of audienceScore grouped by sentiment label.
plt.figure(figsize=(6, 4))
sns.boxplot(x="sentiment", y="audienceScore", data=train_data)

In [None]:
#Plot the unique values

#plt.figure(figsize=(6,4))
#sns.boxplot(data=train_data, x="sentiment", y="isFrequentReviewer")
#sns.boxplot(train_data['runtimeMinutes'])

In [None]:
# Pairwise correlation of the numeric and boolean columns only. The merged table
# still holds text columns here, and pandas >= 2.0 raises when corr() is given
# non-numeric data instead of silently dropping it.
train_data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation heatmap of the numeric and boolean columns. Text columns are excluded
# explicitly because pandas >= 2.0 raises when corr() is given non-numeric data.
numeric_correlations = train_data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_correlations, annot=True)

# Key Insights

All the below insights are derived based on the train data after the movies data is merged. 
1. Columns movieid and title are directly correlated with each other, so only one of them is used as a feature.
2. 67% of data returns POSITIVE and 33% returns NEGATIVE sentiment.
3. The top three ratings that contribute to the movie reviews are R, PG-13, PG
4. releaseDateTheaters and releaseDateStreaming columns are not adding value to the model prediction. 
5. These columns do not contribute much to model prediction: genre, originalLanguage, director, boxOffice, distributor, soundType.
6. If the audienceScore is greater than 70, the sentiment of the movie review is mostly predicted POSITIVE
7. Most of the columns are categorical. For Data Preprocessing, LabelEncoder can be used first and then OneHotEncoder can be used. 
8. The main column, reviewText, is a text column, so the TfidfVectorizer can be used.

# Data Preprocessing

In [None]:
# The reviewText column in both the train and test tables holds free text. Missing
# values are replaced with a single-space string so the text vectorizer can handle them.
print("The number of rows that have null values in reviewText column of train data is: "
      , train_data['reviewText'].isnull().sum())
print("The number of rows that have null values in reviewText column of test data is: "
      , test_data['reviewText'].isnull().sum())

# Assign the filled column back to the frame: an inplace replace on a column
# selection may act on a temporary copy and leave the frame unchanged.
train_data['reviewText'] = train_data['reviewText'].fillna(' ')
test_data['reviewText'] = test_data['reviewText'].fillna(' ')
# Give the test table the same column name the train table uses for this flag.
test_data = test_data.rename(columns={'isTopCritic': 'isFrequentReviewer'})

print("The number of rows that have null values in reviewText column of train data after preprocessing is: "
      , train_data['reviewText'].isnull().sum())
print("The number of rows that have null values in reviewText column of test data after preprocessing is: "
      , test_data['reviewText'].isnull().sum())

In [None]:
# List the column names of the merged training table.
train_data.columns

In [None]:
# Replace each categorical column of the training table with integer codes.
# One LabelEncoder instance is refitted for every column in turn, so after the
# loop it only remembers the classes of the last column ('rating').
labelenc = LabelEncoder()
train_label_columns = [
    'movieid',
    'reviewerName',
    'isFrequentReviewer',
    'originalLanguage',
    'director',
    'distributor',
    'soundType',
    'rating',
]
for column_name in train_label_columns:
    train_data[column_name] = labelenc.fit_transform(train_data[column_name])

In [None]:
# Replace each categorical column of the test table with integer codes.
# 'soundType' is now encoded on the test table; the original last line re-encoded
# train_data['soundType'] and left the test column as raw text.
#
# NOTE(review): the encoder is refitted on the test values, so the integer given to
# a category here is not guaranteed to match the integer it received in the training
# table. Because the one-hot encoder in the pipeline is fitted on the training codes,
# test categories can be matched to the wrong training category.
# TODO: fit the encoders once on the raw training values and reuse them here.
test_label_columns = [
    'movieid',
    'reviewerName',
    'isFrequentReviewer',
    'originalLanguage',
    'director',
    'distributor',
    'rating',
    'soundType',
]
for column_name in test_label_columns:
    test_data[column_name] = labelenc.fit_transform(test_data[column_name])

In [None]:
# Drop the 'title' column from the training table. errors='ignore' keeps the cell
# re-runnable: without it a second run raises KeyError because the column is gone.
train_data = train_data.drop(columns=['title'], errors='ignore')

In [None]:
# Count the missing values in each column of the training table.
train_data.isnull().sum()

In [None]:
# Replace missing values in the remaining text-like columns with a single-space string,
# in both the train and test tables. The filled column is assigned back to the frame
# because an inplace replace on a column selection may act on a temporary copy.
for frame in (train_data, test_data):
    for column_name in ('ratingContents', 'genre', 'boxOffice'):
        frame[column_name] = frame[column_name].fillna(' ')


In [None]:
# Show the dtype of every column in the training table.
train_data.dtypes

In [None]:
# Display the first row of the training table at this stage of preprocessing.
train_data.head(1)

In [None]:
# Feature groups used by the preprocessing pipeline.
# The release-date columns and the remaining movie attributes (genre, director,
# originalLanguage, boxOffice, soundType) are deliberately left out of these lists.
text_column = 'reviewText'
numerical_columns = [
    'audienceScore',
    'runtimeMinutes',
]
categorical_columns = [
    'movieid',
    'reviewerName',
    'isFrequentReviewer',
    'rating',
    'ratingContents',
]

In [None]:
# Build the preprocessing pipeline for the numeric, categorical and text features.
# The target column is not part of this pipeline.

# Numeric features: fill missing values with the column mean, then min-max scale.
imputer = ColumnTransformer(transformers=[('impute', SimpleImputer(strategy='mean'), numerical_columns)], remainder="drop")
numerical_pipe = Pipeline(steps=[('numeric', imputer),
                                 ('scale', MinMaxScaler())])

# Categorical features: one-hot encode; categories not seen while fitting are ignored.
categorical_pipe = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)])

# Text feature: TF-IDF over the review text. stop_words='english' selects the built-in
# English stop-word list; the previous value ['english'] was a custom list containing
# only the word "english", so no real stop words were removed.
tfid_vectorizer = TfidfVectorizer(stop_words='english')
text_pipe = ColumnTransformer(transformers=[('tfid', tfid_vectorizer, text_column)])

# Concatenate the three feature groups side by side.
complete_pipe = FeatureUnion([('numeric', numerical_pipe),
                              ('cat', categorical_pipe),
                              ('tfid', text_pipe)])

pipe = Pipeline(steps=[('cp', complete_pipe)])

# Render estimators as an HTML diagram in a Jupyter context, then display the pipeline.
set_config(display='diagram')
pipe

In [None]:
# Fit the preprocessing pipeline on the training table and transform it, then
# transform the test table with the already fitted pipeline.
train_data_transformed = pipe.fit_transform(train_data)
train_matrix_shape = train_data_transformed.shape
print("The size of train data after transformtion is : ", train_matrix_shape)

test_data_transformed = pipe.transform(test_data)
test_matrix_shape = test_data_transformed.shape
print("The size of test data after transformtion is : ", test_matrix_shape)

In [None]:
# Encode the 'sentiment' target of the training table as integer class labels.
lbe = LabelEncoder()
sentiment_labels = train_data['sentiment']
y_train_data = lbe.fit_transform(sentiment_labels)
print("The size of the target data after the transformation is: ", y_train_data.shape)
y_train_data

# Feature Selection

In [None]:
# Keep the 30000 features with the highest chi-squared score against the target.
# The selector is fitted on the full training matrix and then reused on the test matrix.
select = SelectKBest(k=30000, score_func=chi2)
train_data_features = select.fit_transform(train_data_transformed, y_train_data)
print("After selecting best k features in train data:", train_data_features.shape)

test_data_features = select.transform(test_data_transformed)
print("After selecting best k features:", test_data_features.shape)

In [None]:
# Hold out 20% of the selected training features as a validation set.
from sklearn import metrics
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    train_data_features,
    y_train_data,
    random_state=64,
    test_size=0.2,
)
X_train, X_val, y_train, y_val = split_parts

print("The shape of the X train dataset: ", X_train.shape)
print("The shape of the X val dataset", X_val.shape)

# Model Selection

# 1. Logistic Regression Model

In [None]:
# Fit an L2-penalised logistic regression (saga solver) on the training split.
# start_time is recorded here and read by a later cell to report the elapsed time.
log_reg = LogisticRegression(
    penalty='l2',
    C=3,
    solver='saga',
    tol=1e-3,
    max_iter=100,
    random_state=64,
)
start_time = time.time()
log_reg.fit(X_train, y_train)

In [None]:
# Score of the fitted logistic regression on the validation split.
log_reg.score(X_val, y_val)

In [None]:
#grid={'C': [1,3, 5, 7], "penalty":["l2"], "max_iter": [100, 500], "solver": ['sag', 'saga'], "tol": [1e-3]}# l1 lasso l2 ridge
#logreg=LogisticRegression(random_state=64)
#logreg_cv=GridSearchCV(logreg,grid,cv=5, scoring = 'accuracy',verbose=0)
#logreg_cv.fit(X_train,y_train)

#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)
#tuned hpyerparameters :(best parameters)  {'C': 7, 'max_iter': 100, 'penalty': 'l2', 'solver': 'sag', 'tol': 0.001}
#accuracy : 0.8380970123912442

In [None]:
# Confusion matrix of the logistic regression predictions on the validation split.
# The result gets its own name so the imported confusion_matrix function is not
# overwritten; overwriting it made this cell fail when it was run a second time.
y_pred = log_reg.predict(X_val)
val_confusion = confusion_matrix(y_val, y_pred)
sns.heatmap(val_confusion, annot=True, fmt='')
# 0 - NEGATIVE, 1 - POSITIVE

In [None]:
# Report the shapes of the validation predictions and of the validation targets.
predicted_shape = y_pred.shape
print("The shape of the predicted value is: ", predicted_shape)
y_val = np.asarray(y_val)
print("The shape of the actual value is: ", y_val.shape)

In [None]:
# Report the elapsed time and validation metrics for the logistic regression model.
# start_time was set in the cell that fits log_reg, so the elapsed time also covers
# whatever ran between that cell and this one.
end_time = time.time()
elapsed_ms = (end_time - start_time) * 10**3
print("The time taken to execute this model is : ", elapsed_ms, "ms")

print("The performance metrics calculated for the Logistic Regression model is: ")
generate_score(y_val, y_pred)

# F1 of the class encoded as 1 (average="binary"), kept for the final comparison.
log_reg_score = metrics.f1_score(y_val, y_pred, pos_label=1, average="binary")

In [None]:
# Predict the sentiment of the test rows with the logistic regression model, then
# convert the numeric predictions into label strings.
y_test = log_reg.predict(test_data_features)
print("The shape of the predicted value on the test data", y_test.shape)
prediction = map_pred_labels(y_test)

# 2. Perceptron Model

In [None]:
# Perceptron: configure, fit on the training split and predict the validation split.
perceptron_settings = dict(
    max_iter=1000,
    tol=1e-3,
    fit_intercept=True,
    shuffle=True,
    random_state=64,
    verbose=False,
    eta0=1.0,
    n_jobs=None,
    penalty=None,
    alpha=0.0001,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=5,
    class_weight=None,
    warm_start=False,
)
perceptron = Perceptron(**perceptron_settings)

# Time the fit together with the validation prediction.
start_time = time.time()
y_pred = perceptron.fit(X_train, y_train).predict(X_val)
end_time = time.time()

elapsed_ms = (end_time - start_time) * 10**3
print("The time taken to execute this model is : ", elapsed_ms, "ms")

print("The performance metrics calculated for the Perceptron model is: ")
generate_score(y_val, y_pred)

# F1 of the class encoded as 1 (average="binary"), kept for the final comparison.
perceptron_score = metrics.f1_score(y_val, y_pred, pos_label=1, average="binary")

# 3. MultinomialNB Model

In [None]:
# Multinomial naive Bayes with default settings: fit on the training split and
# predict the validation split.
multinomial = MultinomialNB()

# Time the fit together with the validation prediction.
start_time = time.time()
y_pred = multinomial.fit(X_train, y_train).predict(X_val)
end_time = time.time()

elapsed_ms = (end_time - start_time) * 10**3
print("The time taken to execute this model is : ", elapsed_ms, "ms")

print("The performance metrics calculated for the MultinomialNB model is: ")
generate_score(y_val, y_pred)

# F1 of the class encoded as 1 (average="binary"), kept for the final comparison.
multinomial_score = metrics.f1_score(y_val, y_pred, pos_label=1, average="binary")

# 4. SGDClassifier Model

In [None]:
# SGD classifier with log loss and an L2 penalty: fit on the training split and
# predict the validation split.
sgd_clf = SGDClassifier(
    loss='log_loss',
    penalty='l2',
    max_iter=10,
    alpha=0.001,
    learning_rate='constant',
    eta0=0.01,
    shuffle=True,
    random_state=64,
    warm_start=False,
)

# Time the fit together with the validation prediction.
start_time = time.time()
y_pred = sgd_clf.fit(X_train, y_train).predict(X_val)
end_time = time.time()

elapsed_ms = (end_time - start_time) * 10**3
print("The time taken to execute this model is : ", elapsed_ms, "ms")

print("The performance metrics calculated for the SGDClassifier model is: ")
generate_score(y_val, y_pred)

# F1 of the class encoded as 1 (average="binary"), kept for the final comparison.
sgd_clf_score = metrics.f1_score(y_val, y_pred, pos_label=1, average="binary")

In [None]:
# Compare all four fitted models: 5-fold cross-validation on the training split plus
# the validation F1 stored by each model's cell. The SGDClassifier is included here;
# it was trained and scored earlier but previously left out of this comparison.
cv_scores_log_reg = cross_val_score(log_reg, X_train, y_train, cv=5)
cv_scores_perceptron = cross_val_score(perceptron, X_train, y_train, cv=5)
cv_scores_multinomial = cross_val_score(multinomial, X_train, y_train, cv=5)
cv_scores_sgd_clf = cross_val_score(sgd_clf, X_train, y_train, cv=5)

# Average cross-validation scores
average_cv_scores_log_reg = cv_scores_log_reg.mean()
average_cv_scores_perceptron = cv_scores_perceptron.mean()
average_cv_scores_multinomial = cv_scores_multinomial.mean()
average_cv_scores_sgd_clf = cv_scores_sgd_clf.mean()

# Print the cross-validation scores
print(f"The average cross-validation score of Logistic Regression model is: {average_cv_scores_log_reg:.4f}")
print(f"The average cross-validation score of Perceptron model is: {average_cv_scores_perceptron:.4f}")
print(f"The average cross-validation score of MultinomialNB model is: {average_cv_scores_multinomial:.4f}")
print(f"The average cross-validation score of SGDClassifier model is: {average_cv_scores_sgd_clf:.4f}")

# Print the validation F1 scores
print(f"The score of Logistic Regression model is: {log_reg_score:.4f}")
print(f"The score of Perceptron model is: {perceptron_score:.4f}")
print(f"The score of MultinomialNB model is: {multinomial_score:.4f}")
print(f"The score of SGDClassifier model is: {sgd_clf_score:.4f}")

In [None]:
# Build the submission table (a running row id plus the predicted sentiment label)
# and write it to submission.csv without the DataFrame index.
row_ids = list(range(len(prediction)))
submission = pd.DataFrame({'id': row_ids, 'sentiment': prediction}, columns=['id', 'sentiment'])
submission.to_csv('submission.csv', index=False)