In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# Read the IMDB review CSV; the relative path is resolved against the notebook's working directory
DATASET_PATH = 'IMDB Dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first five rows (head() default) to confirm the 'review' and
# 'sentiment' columns loaded as expected
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:

# Display the shape of the dataset as a (rows, columns) tuple
data.shape

(50000, 2)

In [5]:
# Count missing entries per column; all zeros means no imputation or row dropping is needed
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [6]:
# Convert the string sentiment labels into integer codes.
# As the previews before and after this cell show, 'negative' becomes 0 and
# 'positive' becomes 1. The fitted encoder is kept so the codes can be mapped
# back to their original labels later.
label_encoder = LabelEncoder()
label_encoder.fit(data['sentiment'])
data['sentiment'] = label_encoder.transform(data['sentiment'])

In [7]:
# Preview the first five rows again to verify that 'sentiment' now holds integer codes
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
# Function to preprocess text by removing HTML tags
def preprocess_text(text):
    """Strip HTML tags from a review and tidy the whitespace left behind.

    Each tag is replaced with a single space rather than the empty string, so
    words separated only by markup (e.g. ``"good<br /><br />movie"``) are not
    fused into one token before vectorisation. Runs of whitespace are then
    collapsed to a single space and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The review with tags removed and whitespace normalised.
    """
    # Non-greedy match so each <...> tag is removed individually
    without_tags = re.sub(r'<.*?>', ' ', text)
    # Collapse the gaps introduced by tag removal (and any pre-existing runs)
    return re.sub(r'\s+', ' ', without_tags).strip()

# Clean every review element-wise with preprocess_text, overwriting the
# 'review' column, then preview the result
data['review'] = data['review'].map(preprocess_text)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:

# Vectorize the text data using TF-IDF, keeping at most 2500 vocabulary terms.
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split further down, so the vocabulary and IDF weights are learned from rows
# that later land in the test set. Fitting on the training reviews only (and
# calling transform on the test reviews) would avoid that leakage.
# NOTE(review): .toarray() densifies the matrix; with the 50,000 rows reported
# by data.shape that is up to 50,000 x 2,500 values held in memory (about 1 GB
# if the default float64 dtype applies -- confirm).
vectorizer = TfidfVectorizer(max_features=2500)
X = vectorizer.fit_transform(data['review']).toarray()

In [10]:
# Define the target variable: the integer-encoded 'sentiment' column
y = data['sentiment']

In [11]:
# Hold out a fixed fraction of the samples for testing; the seed makes the
# shuffle, and therefore the split, reproducible across runs
TEST_FRACTION = 0.2
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [12]:
# Initialize and train the Gaussian Naive Bayes model on the training split.
# fit() is left as the cell's last expression, so the notebook displays the
# fitted estimator.
model = GaussianNB()
model.fit(x_train, y_train)


In [13]:

# Make predictions on the test set with the fitted Gaussian Naive Bayes model;
# the predicted labels are compared against y_test in the evaluation cell
predictions = model.predict(x_test)

In [14]:

# Evaluate the model on the held-out test set
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions)

# f-strings avoid the extra separator space print() inserts between arguments
print(f'Model Accuracy: {accuracy:.4f}')
# Start the matrix on its own line so its rows line up with each other
print(f'Confusion Matrix (rows = true label, columns = predicted label):\n{conf_matrix}')
print(f'Classification Report:\n{report}')

Model Accuracy:  0.8209
Confusion Matrix:  [[4089  872]
 [ 919 4120]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82      4961
           1       0.83      0.82      0.82      5039

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



In [None]:
# Initialize and train the Random Forest model.
# random_state fixes the randomness used while building the trees so the fit
# is reproducible. n_jobs=-1 builds the trees in parallel on all available
# cores, which shortens training on the 2500-feature matrix without changing
# the fitted model.
model1 = RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1)
model1.fit(x_train, y_train)

In [None]:
# Evaluate the Random Forest model trained in the previous cell on the held-out test set.
# (This cell used to repeat the training cell verbatim, refitting an identical
# model; the forest was never scored, so it now reports test-set metrics.)
predictions1 = model1.predict(x_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, predictions1):.4f}')
print(f'Confusion Matrix (rows = true label, columns = predicted label):\n{confusion_matrix(y_test, predictions1)}')
print(f'Classification Report:\n{classification_report(y_test, predictions1)}')