# Download the dataset from the Kaggle link below

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [230]:
# import necessary libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [231]:
# Download the NLTK resources used by the preprocessing step:
#   - tokenizer models for word_tokenize / sent_tokenize
#   - the English stopword list
#   - WordNet for the lemmatizer
#   - the perceptron tagger for pos_tag
# Recent NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' and
# 'averaged_perceptron_tagger' packages, so both generations are requested to
# keep the notebook runnable on a fresh environment with either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to C:\Users\Rajesh
[nltk_data]     Pabbati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rajesh
[nltk_data]     Pabbati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Rajesh
[nltk_data]     Pabbati\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rajesh Pabbati\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [232]:
# Read the IMDB reviews CSV (path is relative to the notebook's working
# directory) and preview the first rows; the frame exposes a free-text
# 'review' column and a 'sentiment' label column.
imdb = pd.read_csv('IMDB Dataset.csv')
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [233]:
# Take an independent copy of the first 300 reviews (a head slice, not a
# random sample) as the working subset. Despite the name, this frame is the
# full working set: it is later split into both training and test partitions.
# NOTE(review): presumably limited to 300 rows to keep NLTK preprocessing
# fast -- confirm before scaling up.
df_test = imdb.head(300).copy()
df_test

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
295,Despite a totally misleading advertising campa...,negative
296,This movie sucks ass. Something about a heatwa...,negative
297,"I was looking forward to The Guardian, but whe...",positive
298,Going into see Seven Pounds i wasn't clearly s...,positive


# Text Preprocessing and Feature Engineering

In [235]:
def _vectorize_to_frame(vectorizer, texts, index):
    """Fit ``vectorizer`` on ``texts`` and return its dense output as a DataFrame.

    Columns are the vectorizer's feature names; rows reuse ``index`` so the
    result stays aligned with the frame the texts came from.
    """
    matrix = vectorizer.fit_transform(texts)
    return pd.DataFrame(
        matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=index,
    )


def preprocessing_and_feature_engineering(df_test):
    """Clean the review text and derive several text feature matrices.

    The input frame is NOT modified: all derived columns are added to a copy,
    so re-running the calling cell always starts from the same input.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a string column named 'review'.

    Returns
    -------
    tuple
        ``(df, tfidf_df, one_hot_df, bow_df, unigram_df, bigram_df, ngram_df)``

        - ``df``: copy of the input with the added columns 'no_stopwords',
          'word_tokens', 'sentence_tokens', 'stemmed', 'lemmatized' and
          'pos_tags'.
        - ``tfidf_df``: TF-IDF weights of the stopword-free text.
        - ``one_hot_df``: one-hot encoding of the whole 'no_stopwords' string
          (one column per distinct cleaned review).
        - ``bow_df``: raw term counts (CountVectorizer defaults).
        - ``unigram_df`` / ``bigram_df`` / ``ngram_df``: term counts for
          n-gram ranges (1, 1), (2, 2) and (1, 3).

        Every feature frame shares the row index of ``df``.

    Raises
    ------
    KeyError
        If ``df_test`` has no 'review' column.
    """
    # Work on a copy so the caller's DataFrame is never mutated in place.
    df_test = df_test.copy()

    # --- Text preprocessing -------------------------------------------------
    # Stopword removal: the comparison is case-insensitive, but the surviving
    # tokens keep their original casing.
    stop_words = set(stopwords.words('english'))

    def _drop_stopwords(text):
        kept = [word for word in word_tokenize(text) if word.lower() not in stop_words]
        return ' '.join(kept)

    df_test['no_stopwords'] = df_test['review'].apply(_drop_stopwords)
    cleaned = df_test['no_stopwords']

    # Tokenization: deliberately re-tokenizes the joined string (as the
    # original pipeline did) instead of reusing the filtered token list;
    # re-tokenizing can normalise some tokens differently (e.g. quote marks).
    df_test['word_tokens'] = cleaned.apply(word_tokenize)
    df_test['sentence_tokens'] = cleaned.apply(sent_tokenize)

    # Stemming and lemmatization, both applied to the same word tokens.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    df_test['stemmed'] = df_test['word_tokens'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    df_test['lemmatized'] = df_test['word_tokens'].apply(
        lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
    )

    # POS tagging: one list of (token, tag) pairs per review.
    df_test['pos_tags'] = df_test['word_tokens'].apply(pos_tag)

    # --- Feature engineering ------------------------------------------------
    row_index = df_test.index

    # TF-IDF vectorization
    tfidf_df = _vectorize_to_frame(TfidfVectorizer(), cleaned, row_index)

    # One-hot encoding of the entire cleaned review string.
    # NOTE(review): this yields one category per distinct review rather than
    # per word -- confirm this is the intended demonstration.
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df_test[['no_stopwords']])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(),
        index=row_index,
    )

    # Bag of words (CountVectorizer defaults)
    bow_df = _vectorize_to_frame(CountVectorizer(), cleaned, row_index)

    # Unigram, bigram and 1-to-3-gram counts
    unigram_df = _vectorize_to_frame(CountVectorizer(ngram_range=(1, 1)), cleaned, row_index)
    bigram_df = _vectorize_to_frame(CountVectorizer(ngram_range=(2, 2)), cleaned, row_index)
    ngram_df = _vectorize_to_frame(CountVectorizer(ngram_range=(1, 3)), cleaned, row_index)

    return df_test, tfidf_df, one_hot_df, bow_df, unigram_df, bigram_df, ngram_df

In [236]:
# Run the full preprocessing / feature pipeline on the working subset and
# unpack the enriched frame plus each feature matrix into its own name.
(
    df_test_res,
    tfidf_df_res,
    one_hot_df_res,
    bow_df_res,
    unigram_df_res,
    bigram_df_res,
    ngram_df_res,
) = preprocessing_and_feature_engineering(df_test)

In [237]:
# Preview the first rows of the enriched frame: the original columns plus the
# derived stopword-free text, tokens, stems, lemmas and POS tags.
df_test_res.head()

Unnamed: 0,review,sentiment,no_stopwords,word_tokens,sentence_tokens,stemmed,lemmatized,pos_tags
0,One of the other reviewers has mentioned that ...,positive,One reviewers mentioned watching 1 Oz episode ...,"[One, reviewers, mentioned, watching, 1, Oz, e...",[One reviewers mentioned watching 1 Oz episode...,"[one, review, mention, watch, 1, oz, episod, '...","[One, reviewer, mentioned, watching, 1, Oz, ep...","[(One, CD), (reviewers, NNS), (mentioned, VBD)..."
1,A wonderful little production. <br /><br />The...,positive,wonderful little production . < br / > < br / ...,"[wonderful, little, production, ., <, br, /, >...","[wonderful little production ., < br / > < br ...","[wonder, littl, product, ., <, br, /, >, <, br...","[wonderful, little, production, ., <, br, /, >...","[(wonderful, JJ), (little, JJ), (production, N..."
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",[thought wonderful way spend time hot summer w...,"[thought, wonder, way, spend, time, hot, summe...","[thought, wonderful, way, spend, time, hot, su...","[(thought, VBN), (wonderful, JJ), (way, NN), (..."
3,Basically there's a family where a little boy ...,negative,Basically 's family little boy ( Jake ) thinks...,"[Basically, 's, family, little, boy, (, Jake, ...",[Basically 's family little boy ( Jake ) think...,"[basic, 's, famili, littl, boy, (, jake, ), th...","[Basically, 's, family, little, boy, (, Jake, ...","[(Basically, NNP), ('s, POS), (family, NN), (l..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,Petter Mattei 's `` Love Time Money '' visuall...,"[Petter, Mattei, 's, ``, Love, Time, Money, ``...",[Petter Mattei 's `` Love Time Money '' visual...,"[petter, mattei, 's, ``, love, time, money, ``...","[Petter, Mattei, 's, ``, Love, Time, Money, ``...","[(Petter, NNP), (Mattei, NNP), ('s, POS), (``,..."


In [238]:
# Preview the TF-IDF matrix: one row per review, one column per vocabulary term.
tfidf_df_res.head()

Unnamed: 0,00,000,00am,10,100,103,11,12,13,13th,...,zingers,ziyi,zombie,zombies,zone,zoo,zoom,zooms,zulu,zwick
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.062025,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.225819,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [239]:
# Preview the bag-of-words matrix: raw term counts, one row per review.
bow_df_res.head()

Unnamed: 0,00,000,00am,10,100,103,11,12,13,13th,...,zingers,ziyi,zombie,zombies,zone,zoo,zoom,zooms,zulu,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Dataset

In [241]:
# Features: TF-IDF weights per review; target: the 'sentiment' label.
# NOTE(review): the TF-IDF vectorizer was fitted on every row before the
# train/test split, so held-out reviews influence the vocabulary and IDF
# weights -- treat the scores as optimistic.
X = tfidf_df_res
y = df_test_res['sentiment']

In [242]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs. No stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Text Classification

In [244]:
# Fit a multinomial Naive Bayes classifier (default settings) on the TF-IDF
# training split.
model = MultinomialNB()
model.fit(X_train, y_train)

In [245]:
# Predict sentiment labels for the held-out TF-IDF rows.
y_pred = model.predict(X_test)

# Model Evaluation

In [247]:
# Score the held-out predictions. Precision, recall and F1 are computed with
# the 'weighted' average across the sentiment classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(
    y_test, y_pred, average='weighted'
)
recall = recall_score(
    y_test, y_pred, average='weighted'
)
f1 = f1_score(
    y_test, y_pred, average='weighted'
)

# One metric per output line.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Accuracy: 0.5253
Precision: 0.7529
Recall: 0.5253
F1 Score: 0.3724
[[51  0]
 [47  1]]


# Bag of Words Dataset

In [249]:
# Features: bag-of-words term counts per review; target: the 'sentiment' label.
# These assignments rebind X and y, replacing the TF-IDF versions used earlier.
X = bow_df_res
y = df_test_res['sentiment']

In [250]:
# Same 33% hold-out and random_state as the TF-IDF experiment, applied to the
# bag-of-words features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Text Classification

In [252]:
# Fit a fresh multinomial Naive Bayes classifier on the bag-of-words training
# split (rebinds `model`).
model = MultinomialNB()
model.fit(X_train, y_train)

In [253]:
# Predict sentiment labels for the held-out bag-of-words rows.
y_pred = model.predict(X_test)

# Model Evaluation

In [255]:
# Score the bag-of-words predictions. Precision, recall and F1 are computed
# with the 'weighted' average across the sentiment classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(
    y_test, y_pred, average='weighted'
)
recall = recall_score(
    y_test, y_pred, average='weighted'
)
f1 = f1_score(
    y_test, y_pred, average='weighted'
)

# One metric per output line.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Accuracy: 0.6667
Precision: 0.6991
Recall: 0.6667
F1 Score: 0.6485
[[45  6]
 [27 21]]
