<a href="https://colab.research.google.com/github/sushithadevaraju/sentiment-analysis-of-imdb/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Uploading the dataset file

In [None]:
#Colab-only import: this cell only runs inside a Google Colab runtime.
#NOTE(review): the recorded output shows this upload was saved as
#'IMDB Dataset (1).csv', while the load cell reads 'IMDB Dataset.csv' --
#confirm the intended copy of the file is the one being read.
from google.colab import files

#Keep the return value of the upload call in `uploaded`
#(presumably a mapping of file name -> contents; confirm against the Colab docs).
uploaded = files.upload()


Saving IMDB Dataset.csv to IMDB Dataset (1).csv


Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Load the Dataset

In [None]:
#Load the dataset: read the IMDB reviews CSV from the working directory into
#the DataFrame `mov` (the preview below shows `review` and `sentiment` columns).
mov = pd.read_csv('IMDB Dataset.csv')

#Display the first few rows as a quick sanity check of the parsed columns
print(mov.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Data preprocessing

In [None]:
#Download the punkt tokenizer data that nltk.word_tokenize relies on
nltk.download('punkt')

#Checking for null values: print the number of missing entries per column
print(mov.isnull().sum())

#preprocessing
def preprocess_text(text):
    """Lower-case a review, tokenize it with NLTK and re-join the tokens.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The lower-cased tokens separated by single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(tokens)

#Apply preprocessing to the dataset.
#NOTE: this overwrites mov['review'] in place, so the raw text is no longer
#available afterwards and re-running the cell preprocesses the already
#preprocessed text again.
mov['review'] = mov['review'].apply(preprocess_text)

#Splitting the dataset into training and testing sets (80/20, fixed seed so the
#same rows land in each split every time the split is repeated)
X_train, X_test, y_train, y_test = train_test_split(mov['review'], mov['sentiment'], test_size=0.2, random_state=42)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


review       0
sentiment    0
dtype: int64


Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Create TF-IDF vectors for the text data (vocabulary capped at 5000 terms).
#The vectorizer is fitted on the training split only and then reused to
#transform the test split, so no test-set statistics leak into the features.
#
#The matrices are kept in the sparse format the vectorizer returns. Calling
#.toarray() here would materialise a dense 40000 x 5000 float array (roughly
#1.6 GB) that is almost entirely zeros; LogisticRegression, MultinomialNB, SVC
#and RandomForestClassifier all accept sparse input directly.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#Fetch the NLTK corpora used by the stop-word filter and the WordNet lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Checking for null values (same check as in the earlier preprocessing cell)
print(mov.isnull().sum())

review       0
sentiment    0
dtype: int64


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing: Tokenization, Lemmatization, and Stemming

In [None]:
#Initializing tools for stemming and lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

#Build the stop-word lookup once. The previous version called
#stopwords.words('english') inside the list comprehension, i.e. once per token
#of every review, and then scanned the resulting list; a set built up front
#gives the same membership answers with a constant-time lookup.
STOP_WORDS = frozenset(stopwords.words('english'))

#Preprocessing function
def preprocess_text(text):
    """Normalise one review for bag-of-words style models.

    Steps, in order: lower-case, tokenize, drop English stop words,
    lemmatize each token, stem each lemmatized token, re-join with spaces.

    Parameters
    ----------
    text : str
        Review text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    #Convert to lowercase, then tokenize
    tokens = word_tokenize(text.lower())

    #Removing stopwords
    tokens = [word for word in tokens if word not in STOP_WORDS]

    #Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    #Stemming (applied on top of the lemmatized tokens)
    stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]

    #Joining tokens back into a string
    return ' '.join(stemmed_tokens)

#Apply preprocessing to the dataset (overwrites mov['review'] in place).
#NOTE(review): X_train_tfidf / X_test_tfidf were built in the earlier Feature
#Extraction cell, before this step, and are not recomputed here -- models that
#are fitted on those matrices do not see the effect of this preprocessing.
mov['review'] = mov['review'].apply(preprocess_text)

Model Implementation (Logistic Regression)


In [None]:
#Encode target labels as 1 (positive) / 0 (negative).
#The helper also accepts an already-encoded 1, so re-running this cell leaves
#the labels intact. The previous `x == 'positive'` test mapped every label to
#0 on a second run, because the values were integers by then.
def encode_sentiment(label):
    """Return 1 for 'positive' (or an already-encoded 1), otherwise 0."""
    return 1 if label in ('positive', 1) else 0

y_train = y_train.apply(encode_sentiment)
y_test = y_test.apply(encode_sentiment)

#Train Logistic Regression model on the TF-IDF features
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

#Predicting on the held-out test features
y_pred = lr_model.predict(X_test_tfidf)

#Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8938
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



Model Implementation (LSTM)

In [None]:
#Shared hyper-parameters so the tokenizer, the padding and the Embedding layer
#cannot drift apart. Previously the Embedding layer declared input_length=100
#while every sequence was padded/truncated to 200 tokens.
VOCAB_SIZE = 5000   #keep only the most frequent words
MAX_LEN = 200       #every review is padded/truncated to this many tokens

#Tokenization and padding (the tokenizer is fitted on the training split only)
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

#Defining LSTM model.
#No input_length is passed to Embedding: the sequence length is taken from the
#padded input itself, which removes the old 100-vs-200 mismatch (recent Keras
#releases also deprecate that argument).
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=128))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Train the model.
#y_train / y_test must already be numeric (0/1) here; they are encoded in the
#Logistic Regression cell.
#NOTE(review): the test split doubles as validation data, so the final
#evaluate() call below reports the same numbers as the last epoch's val_*
#metrics -- consider a separate validation split.
model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

#Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/5




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 522ms/step - accuracy: 0.7218 - loss: 0.5413 - val_accuracy: 0.8439 - val_loss: 0.3737
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 531ms/step - accuracy: 0.8240 - loss: 0.4022 - val_accuracy: 0.8325 - val_loss: 0.3869
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 520ms/step - accuracy: 0.8580 - loss: 0.3419 - val_accuracy: 0.8645 - val_loss: 0.3356
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 519ms/step - accuracy: 0.8654 - loss: 0.3258 - val_accuracy: 0.8643 - val_loss: 0.3198
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 527ms/step - accuracy: 0.8850 - loss: 0.2886 - val_accuracy: 0.8754 - val_loss: 0.3069
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 117ms/step - accuracy: 0.8742 - loss: 0.3070
Test Accuracy: 0.8754000067710876


Implementing Naive Bayes Model

In [None]:
#Re-split the current contents of `mov` with the same 80/20 ratio and seed as before.
#This rebinds X_train/X_test/y_train/y_test; X_train_tfidf and X_test_tfidf are
#NOT recomputed here and still hold the features from the Feature Extraction cell.
#NOTE(review): the recorded Naive Bayes report below is keyed by 0/1 although
#no earlier cell converts mov['sentiment'] to integers -- the cells were
#probably executed out of order; verify with Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(mov['review'], mov['sentiment'], test_size=0.2, random_state=42)


In [None]:
#Fit a multinomial Naive Bayes classifier on the TF-IDF features.
#NOTE(review): this rebinds `model`, which previously referred to the LSTM
#network, and the next cell fits an identical classifier as `nb_model` --
#this cell looks redundant.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


In [None]:
#Fit the multinomial Naive Bayes classifier on the TF-IDF training features
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

#Predict sentiment for the held-out test features
y_pred_nb = nb_model.predict(X_test_tfidf)

#Report accuracy, per-class metrics and the confusion matrix
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_nb))
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_nb),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred_nb),
)

Accuracy: 0.852
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4234  727]
 [ 753 4286]]


Implementing SVM Model

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Lower-case the review column (in place on `mov`)
mov['review'] = mov['review'].str.lower()

#Splitting the dataset into training and testing sets (80/20, seed 42)
X_train, X_test, y_train, y_test = train_test_split(
    mov['review'],
    mov['sentiment'],
    test_size=0.2,
    random_state=42,
)

#Training the SVM model with a linear kernel.
#NOTE(review): the model is fitted on X_train_tfidf from the Feature Extraction
#cell; the X_train/X_test produced just above are not vectorized again.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train_tfidf, y=y_train)

#Making predictions on the test features
y_pred = svm_model.predict(X_test_tfidf)

#Evaluating the model
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.8941
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.90      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion Matrix:
 [[4384  577]
 [ 482 4557]]


Building Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import re
#Built once when the cell runs. The previous version rebuilt the stop-word
#list for every token and created a new lemmatizer for every review.
_RF_STOP_WORDS = frozenset(stopwords.words('english'))
_RF_LEMMATIZER = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one review for the Random Forest pipeline.

    Steps, in order: lower-case, strip HTML line breaks, strip digits, strip
    punctuation, tokenize, drop English stop words, lemmatize, re-join.

    Parameters
    ----------
    text : str
        Review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()  # Convert to lowercase
    # Remove HTML line breaks. The pattern tolerates inner whitespace so it
    # also matches text that an earlier tokenization step has already split
    # into "< br / >", which the literal '<br />' pattern could not match.
    text = re.sub(r'<\s*br\s*/?\s*>', ' ', text)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in _RF_STOP_WORDS]  # Remove stopwords
    tokens = [_RF_LEMMATIZER.lemmatize(word) for word in tokens]  # Lemmatize words
    return ' '.join(tokens)  # Rejoin tokens into a string

#Apply preprocessing to the dataset (overwrites mov['review'] in place)
mov['review'] = mov['review'].apply(preprocess_text)

#Preprocess target labels for binary classification.
#Accepting an already-encoded 1 keeps this idempotent: with the previous
#`x == 'positive'` test a second run of the cell turned every label into 0.
mov['sentiment'] = mov['sentiment'].apply(lambda x: 1 if x in ('positive', 1) else 0)

#Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(mov['review'], mov['sentiment'], test_size=0.2, random_state=42)

#Rebuild the TF-IDF features from the text that was just preprocessed.
#Previously the forest was fitted on X_train_tfidf from the Feature Extraction
#cell, which predates this cell's cleaning, so the preprocessing above had no
#effect on the model. The vectorizer is fitted on the training split only.
rf_tfidf = TfidfVectorizer(max_features=5000)
X_train_rf = rf_tfidf.fit_transform(X_train)
X_test_rf = rf_tfidf.transform(X_test)

#Training the Random Forest model (fixed seed for reproducible trees)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train)

#Making predictions
y_pred_rf = rf_model.predict(X_test_rf)

#Evaluating the model
print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest
Accuracy: 0.8502
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      4961
           1       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4249  712]
 [ 786 4253]]
