# **Sentiment Analysis Case Study**

Dataset: IMDB Dataset of 50K Movie Reviews (Kaggle) — https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
import kagglehub

# Download latest version
# `path` is the location kagglehub reports for the downloaded dataset files;
# it is printed below so the reader can see where the CSV lives.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the reviews CSV from the directory kagglehub reported (`path`) instead of
# a hard-coded /kaggle/input location, so the notebook also runs in environments
# where the dataset is downloaded somewhere else.
df = pd.read_csv(f"{path}/IMDB Dataset.csv")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.shape

(50000, 2)

# **Text Cleaning**

In [None]:
# Taking only 10K rows
# A fixed random_state makes the subsample -- and every number derived from it
# further down -- reproducible across kernel restarts.
df = df.sample(10000, random_state=42)

In [None]:
df.shape

(10000, 2)

In [None]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,5036
positive,4964


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 24323 to 42661
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [None]:
# Encode the text labels as integers: 'positive' -> 1, 'negative' -> 0.
# NOTE(review): the column is overwritten in place, so this cell is meant to run
# once per fresh load; re-running it on already-encoded values would presumably
# no longer match the dict keys -- confirm before re-executing out of order.
df['sentiment'] = df['sentiment'].map({'positive':1 , 'negative':0})

In [None]:
df.head()

Unnamed: 0,review,sentiment
24323,Note: I've tried not to give away any importan...,1
12246,"Reese Witherspoon plays Dani, a young country ...",1
47266,"This is ""realism""? If Rivette was seeking to g...",0
861,"It isn't TOO bad, but ultimately it lacks the ...",0
12520,Jacques Audiard's directorial debut See How Th...,1


In [None]:
# Function to clean html tags
def clean_html(text):
    """Strip HTML tags (e.g. ``<br />``) from ``text``.

    Each tag is replaced by a single space rather than removed outright, so
    words that were separated only by a tag ("good<br />movie") are not fused
    into one token ("goodmovie"). Extra spaces are harmless downstream because
    the later tokenisation step splits on any whitespace.

    Parameters
    ----------
    text : str
        Raw review text that may contain HTML markup.

    Returns
    -------
    str
        The text with every ``<...>`` span replaced by one space.
    """
    # Non-greedy match so each tag is consumed on its own instead of swallowing
    # everything between the first '<' and the last '>'.
    return re.sub(r'<.*?>', ' ', text)

In [None]:
# Step 1 of the cleaning pipeline: drop HTML markup from every review.
df['review'] = df['review'].apply(clean_html)

In [None]:
# Lower-case the whole review
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [None]:
# Step 2 of the cleaning pipeline: normalise case.
df['review'] = df['review'].apply(convert_lower)

In [None]:
# function to remove special characters
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged;
    everything else -- punctuation, whitespace, symbols -- becomes a single
    space, so the result has the same length as the input.

    Parameters
    ----------
    text : str
        Text to sanitise.

    Returns
    -------
    str
        The sanitised text.
    """
    # Build the result in one join instead of growing a string one character
    # at a time inside a loop.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [None]:
# Step 3 of the cleaning pipeline: replace punctuation and other non-alphanumeric characters with spaces.
df['review'] = df['review'].apply(remove_special)

In [None]:
# Remove the stop words
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
def remove_stopwords(text):
    """Split ``text`` on whitespace and drop English stop words.

    Parameters
    ----------
    text : str
        Cleaned review text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Build the stop-word collection once per call (not once per token) and
    # keep it in a set so each membership test is a hash lookup instead of a
    # scan over the whole list.
    stop_set = set(stopwords.words('english'))
    return [token for token in text.split() if token not in stop_set]

In [None]:
# Step 4 of the cleaning pipeline: tokenise and drop stop words.
# NOTE: after this cell the 'review' column holds lists of tokens, not strings.
# remove_stopwords calls .split() on its input, so this cell cannot be re-run on
# the already-tokenised column -- restart from the load cell instead.
df['review'] = df['review'].apply(remove_stopwords)

In [None]:
df

Unnamed: 0,review,sentiment
24323,"[note, tried, give, away, important, plot, twi...",1
12246,"[reese, witherspoon, plays, dani, young, count...",1
47266,"[realism, rivette, seeking, give, us, ground, ...",0
861,"[bad, ultimately, lacks, quality, australian, ...",0
12520,"[jacques, audiard, directorial, debut, see, fa...",1
...,...,...
28308,"[gave, 2, instead, 1, think, wild, women, wong...",0
691,"[first, realize, 1, rating, supposed, reserved...",0
44296,"[deanna, durbin, really, save, universal, bank...",1
26072,"[know, film, version, jeff, saw, entire, film,...",1


In [None]:
# Perform stemming

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem_words(text):
    """Return the stem of every item in ``text`` as a new list.

    ``text`` is iterated item by item and each item is passed to the
    notebook-level stemmer ``ps``; the stems are returned in the same order.
    """
    return [ps.stem(token) for token in text]

In [None]:
# Step 5 of the cleaning pipeline: stem each token.
# Expects the token lists produced by the stop-word step above.
df['review'] = df['review'].apply(stem_words)

In [None]:
# Re-assemble tokens into one string
def join_back(list_input):
    """Concatenate the strings in ``list_input`` with single spaces between them."""
    separator = " "
    joined = separator.join(list_input)
    return joined

In [None]:
# Step 6 of the cleaning pipeline: turn the stemmed token lists back into
# space-separated strings, the form passed to the vectorizer later on.
df['review'] = df['review'].apply(join_back)

In [None]:
df

Unnamed: 0,review,sentiment
24323,note tri give away import plot twist end conce...,1
12246,rees witherspoon play dani young countri girl ...,1
47266,realism rivett seek give us ground level studi...,0
861,bad ultim lack qualiti australian seri joke fa...,0
12520,jacqu audiard directori debut see fall aka reg...,1
...,...,...
28308,gave 2 instead 1 think wild women wongo wors e...,0
691,first realiz 1 rate suppos reserv worst worst ...,0
44296,deanna durbin realli save univers bankruptci e...,1
26072,know film version jeff saw entir film awesom p...,1


# **Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer. max_features=2500 caps the vocabulary, which is why
# the feature matrix built from it has 2500 columns.
cv = CountVectorizer(max_features=2500)

In [None]:
# Learn the vocabulary from the cleaned reviews and materialise the counts as a
# dense array.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# further down, so held-out reviews take part in vocabulary selection -- consider
# fitting on the training split only and calling transform() on the test split.
X = cv.fit_transform(df['review']).toarray()

In [None]:
X.shape

(10000, 2500)

In [None]:
# Target vector: the sentiment column (encoded as 1 / 0 earlier in the notebook).
y = df['sentiment']
y.shape

(10000,)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split -- and therefore the reported accuracies -- reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape , X_test.shape

((8000, 2500), (2000, 2500))

# **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# Three Naive Bayes variants, compared on the same features further down.
# NOTE(review): the features here are word counts; GaussianNB is presumably
# included as a baseline for comparison -- confirm that is the intent.
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [None]:
# Train all three classifiers on the same training split.
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [None]:
# Predict the held-out split with each fitted model, in the same order as the
# classifiers were created (clf1, clf2, clf3).
y_pred1, y_pred2, y_pred3 = (model.predict(X_test) for model in (clf1, clf2, clf3))

In [None]:
from sklearn.metrics import accuracy_score

# Report test-set accuracy for each Naive Bayes variant.
print("Gaussian Naive Bayes Accuracy" , accuracy_score(y_test,y_pred1))
print("Multinomial Naive Bayes Accuracy" , accuracy_score(y_test,y_pred2))
# Label spelling corrected: the model is Bernoulli Naive Bayes.
print("Bernoulli Naive Bayes Accuracy" , accuracy_score(y_test,y_pred3))

Gaussian Naive Bayes Accuracy 0.7445
Multinomial Naive Bayes Accuracy 0.858
Bernaulli Naive Bayes Accuracy 0.863


# **Prediction on Single Text**

In [None]:
# Reload the untouched CSV so a raw (uncleaned) review can be pushed through the
# whole pipeline. The file is read from the directory kagglehub reported
# (`path`) rather than a hard-coded /kaggle/input location.
temp_df = pd.read_csv(f"{path}/IMDB Dataset.csv")

In [None]:
# Pick one raw review (row label 555) to use as a single-text prediction example.
review = temp_df['review'][555]
review

'I went to see "Passion of Mind" because I usually get a kick out of the genre of alternate reality romances, i.e. "Sliding Doors," "Me, Myself, I," etc. <br /><br />But this was the worst one I\'ve ever seen! I had to force myself to sit through it. I didn\'t even stay through the credits which is unheard of for me.<br /><br />The magical realism was completely missing because Demi Moore was grim and the lovers she was two-timing were guys who usually play villains, though each was kind of sexy and appealing.<br /><br />There was actually a psychological explanation provided for the dual lives, with a distasteful frisson of The Elektra Complex; maybe the magic shouldn\'t be explained for this genre to work.<br /><br />(originally written 5/28/2000)'

In [None]:
# Run the single review through the same cleaning steps, in the same order,
# that were applied to the training data.
for step in (clean_html, convert_lower, remove_special, remove_stopwords, stem_words, join_back):
    review = step(review)

In [None]:
review

'went see passion mind usual get kick genr altern realiti romanc e slide door etc worst one ever seen forc sit even stay credit unheard magic realism complet miss demi moor grim lover two time guy usual play villain though kind sexi appeal actual psycholog explan provid dual live distast frisson elektra complex mayb magic explain genr work origin written 5 28 2000'

In [None]:
# Vectorise with the already-fitted vectorizer (transform, not fit_transform) so
# the single review is encoded against the same vocabulary as the training data.
vector_rev = cv.transform([review]).toarray()
vector_rev.shape

(1, 2500)

In [None]:
# predict() returns one label per input row. Only one row was passed in, so read
# that single label explicitly instead of relying on the truthiness of an
# array-vs-scalar comparison.
sentiment = clf3.predict(vector_rev)
if sentiment[0] == 1:
    print("Positive Review")
else:
    print("Negative Review")

Negative Review
