In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sentence_transformers import SentenceTransformer 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Dense, Embedding


from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


  from .autonotebook import tqdm as notebook_tqdm





In [None]:

# Fetch the NLTK resources used later in the notebook: the English stop-word
# list, the Punkt tokenizer data behind word_tokenize, and WordNet for the
# lemmatizer.
# 'punkt_tab' is the tokenizer data format that recent NLTK releases (3.9 and
# later) load in word_tokenize; without it those versions raise LookupError at
# tokenization time. Older releases still read 'punkt', so both are fetched.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's last expression so its boolean result is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\elora\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elora\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\elora\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data Collection and Analysis

In [None]:
# Load the review dataset; the path is relative to the working directory.
data = pd.read_csv('Book1.csv')

# Preview the first rows to confirm which columns were read.
print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
# Number of rows whose review text repeats an earlier row.
data.duplicated(subset=['review']).sum()

np.int64(0)

In [None]:
# Keep only the rows whose review text repeats an earlier row, then display them.
duplicates = data.loc[data.duplicated(subset=['review'])]

duplicates


Unnamed: 0,review,sentiment


In [6]:
# Missing-value count per column.
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [7]:
# Summary statistics for each column (count, unique values, most frequent value).
data.describe()

Unnamed: 0,review,sentiment
count,1999,1999
unique,1999,2
top,I loved this movie! It was all I could do not ...,positive
freq,1,1005


In [8]:
# Class balance: how many reviews carry each sentiment label.
data['sentiment'].value_counts()

sentiment
positive    1005
negative     994
Name: count, dtype: int64

# Data Cleaning and Preprocessing


In [None]:
# Drop rows whose review text repeats an earlier row (the first occurrence is kept).
data = data.drop_duplicates(subset=['review'])

In [10]:
# Re-check: number of repeated review texts remaining after de-duplication.
data.duplicated(subset=['review']).sum()

np.int64(0)

In [11]:
# Built once when this cell runs rather than on every call: loading the
# stop-word corpus and constructing the stemmer/lemmatizer is identical for
# every review, so repeating it per row is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """Normalize one raw review into a space-separated string of processed tokens.

    Steps, in order: lowercase; replace HTML tags with a space; strip
    http(s) URLs; replace every character that is not an ASCII letter or
    whitespace with a space; drop English stop words; word-tokenize;
    Porter-stem; WordNet-lemmatize.

    Args:
        text: Raw review text (a str).

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lowercase
    text = text.lower()

    # Replace HTML tags with a space (not an empty string) so that the words
    # on either side of a tag such as "<br />" are not fused into one token.
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)

    # Replace punctuation, digits and other non-letters with spaces
    text = re.sub(r"[^a-zA-Z\s]", ' ', text)

    # Remove stopwords; split() also collapses the runs of whitespace left above
    text = ' '.join(word for word in text.split() if word not in _STOP_WORDS)

    # Tokenization (word-level)
    word_tokens = word_tokenize(text)

    # Stemming, then lemmatization of the stems
    stemmed_words = [_STEMMER.stem(word) for word in word_tokens]
    lemmatized_words = [_LEMMATIZER.lemmatize(word) for word in stemmed_words]

    # Combine the final processed words into a single string
    return ' '.join(lemmatized_words)

# Run every raw review through clean_text and store the result next to the original text.
data['cleaned_review'] = data['review'].map(clean_text)


In [12]:
# Preview the frame again to compare raw reviews with their cleaned form.
print(data.head())

                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  
0  one review mention watch oz episod hook right ...  
1  wonder littl product film techniqu unassum old...  
2  thought wonder way spend time hot summer weeke...  
3  basic famili littl boy jake think zombi closet...  
4  petter mattei love time money visual stun film...  


# Text Representation

In [None]:
# Convenience handles: cleaned review text and the sentiment labels.
reviews, sentiments = data['cleaned_review'], data['sentiment']

In [None]:
# Bag-of-words: learn the vocabulary from the cleaned reviews, then count term occurrences.
bow_vectorizer = CountVectorizer().fit(reviews)
bow_features = bow_vectorizer.transform(reviews)

# Report the matrix dimensions and show the first review's (dense) count vector.
print("BoW Shape:", bow_features.shape)
print("Sample BoW Vector:", bow_features[0].toarray())

BoW Shape: (1999, 17155)
Sample BoW Vector: [[0 0 0 ... 0 0 0]]


In [None]:
# TF-IDF: learn vocabulary and document frequencies, then weight each review's terms.
tfidf_vectorizer = TfidfVectorizer().fit(reviews)
tfidf_features = tfidf_vectorizer.transform(reviews)

# Report the matrix dimensions and show the first review's (dense) weight vector.
print("TF-IDF Shape:", tfidf_features.shape)
print("Sample TF-IDF Vector:", tfidf_features[0].toarray())

TF-IDF Shape: (1999, 17155)
Sample TF-IDF Vector: [[0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Bigram counts: ngram_range=(2, 2) restricts the vocabulary to two-word sequences.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(reviews)
ngram_features = ngram_vectorizer.transform(reviews)

# Report the matrix dimensions and show the first review's (dense) count vector.
print("N-Gram Shape:", ngram_features.shape)
print("Sample N-Gram Vector:", ngram_features[0].toarray())

N-Gram Shape: (1999, 177377)
Sample N-Gram Vector: [[0 0 0 ... 0 0 0]]


In [None]:
# Bound to its own name so it is not confused with, or silently replaced by,
# the Keras classifier that is assigned to `model` further down.
sentence_encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode all reviews in a single call so the library can batch them, instead
# of running one forward pass per row through Series.apply.
review_embeddings = np.asarray(
    sentence_encoder.encode(data['cleaned_review'].tolist())
)

# Keep a per-row copy on the frame: one embedding vector per review.
data['embeddings'] = list(review_embeddings)
print("Embeddings Shape:", review_embeddings.shape)

Embeddings Shape: (1999, 384)


# Tokenization and Padding

In [None]:
# Length every encoded review is padded or truncated to.
max_length = 500

# Word-index tokenizer capped at num_words=5000, fitted on the cleaned reviews.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['cleaned_review'])

# Integer-encode each review, then pad / truncate at the end ('post') to max_length.
sequences = tokenizer.texts_to_sequences(data['cleaned_review'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Model Building

In [None]:
# The tokenizer only emits indices below its num_words cap, so the embedding
# table needs at most that many rows. Sizing it from the full word_index
# alone would allocate rows that can never be looked up.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    vocab_size = min(vocab_size, tokenizer.num_words)

# Embedding -> bidirectional LSTM (final state only) -> dropout -> one sigmoid
# unit producing the probability of the positive class.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



# Train the model

In [None]:
# Features are the padded index sequences; labels are 1 for 'positive' and 0 otherwise.
X = padded_sequences
y = data['sentiment'].eq('positive').astype(int)

# Hold out 40% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

print(f"Training Data Shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing Data Shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training Data Shape: X_train=(1199, 500), y_train=(1199,)
Testing Data Shape: X_test=(800, 500), y_test=(800,)


In [None]:
# Configure training: Adam at learning rate 0.001, binary cross-entropy loss
# (matching the single sigmoid output), and accuracy as the reported metric.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop once val_loss has not improved for 3 epochs and restore the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Multiply the learning rate by 0.2 after 2 epochs without val_loss improvement.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)

# validation_split=0.4 holds back 40% of the training rows to compute the
# val_* metrics that both callbacks monitor.
history = model.fit(X_train, y_train,
                    epochs=15,
                    batch_size=32,
                    validation_split=0.4,
                    callbacks=[early_stopping, lr_scheduler],
                    verbose=1)

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Epoch 1/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 439ms/step - accuracy: 0.8906 - loss: 0.3097 - val_accuracy: 0.7729 - val_loss: 0.4833 - learning_rate: 0.0010
Epoch 2/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 448ms/step - accuracy: 0.9618 - loss: 0.1581 - val_accuracy: 0.8021 - val_loss: 0.4788 - learning_rate: 0.0010
Epoch 3/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 448ms/step - accuracy: 0.9868 - loss: 0.0922 - val_accuracy: 0.8062 - val_loss: 0.5250 - learning_rate: 0.0010
Epoch 4/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414ms/step - accuracy: 0.9967 - loss: 0.0386
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 476ms/step - accuracy: 0.9966 - loss: 0.0386 - val_accuracy: 0.8083 - val_loss: 0.5945 - learning_rate: 0.0010
Epoch 5/15
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

None


# Evaluate the model

In [None]:
# Loss and accuracy as computed by Keras on the held-out test set.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

# Threshold the sigmoid outputs at 0.5 to get hard class labels. ravel()
# flattens the single-column prediction array so it matches y_test's 1-D shape.
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int).ravel()

# Recompute accuracy with scikit-learn alongside precision, recall and F1 so
# all four reported numbers come from the same thresholded predictions.
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes)

print(f"Test Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step
Test Accuracy: 0.84
Precision: 0.84
Recall: 0.85
F1-Score: 0.85
