In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy 

In [25]:
# Read the raw IMDB reviews CSV into the working DataFrame.
df = pd.read_csv(
    'Dataset/IMDB Dataset.csv',
)

In [26]:
# Quick look at the first five rows of the raw frame.
print(df.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [27]:
# Column names of the loaded frame.
print(df.columns)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Info about the dataset:")
df.info()

# Summary statistics (count / unique / top / freq for object columns).
print("Descriptive statistics:")
print(df.describe())

# Per-column count of missing values.
print("Missing values in each column:")
print(df.isnull().sum())

Index(['review', 'sentiment'], dtype='object')
Info about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
Descriptive statistics:
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000
Missing values in each column:
review       0
sentiment    0
dtype: int64


In [28]:
# Class balance of the sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Show the first review of each class, selected with boolean masks.
positive_mask = df['sentiment'] == 'positive'
negative_mask = df['sentiment'] == 'negative'
print("Sample positive review:")
print(df.loc[positive_mask, 'review'].iloc[0])
print("\nSample negative review:")
print(df.loc[negative_mask, 'review'].iloc[0])

# Character count of every raw review, stored as a new column on df.
df['review_length'] = df['review'].str.len()
review_lengths = df['review_length']
print("Average review length:", review_lengths.mean())
print("Shortest review length:", review_lengths.min())
print("Longest review length:", review_lengths.max())

# Peek at the first 200 characters of the first three reviews to spot HTML
# remnants or other special patterns.
for position, review_text in enumerate(df['review'].iloc[:3]):
    print(f"Review {position}:", review_text[:200], "...")



sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Sample positive review:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agr

In [29]:
# Load the small English spaCy pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

In [42]:
# Work on a copy so the raw frame `df` is left untouched by later cleaning.
df_cleaned = df.copy()

# Inspect one row (review text followed by its label) as a sanity check.
i = 5
sample_row = df_cleaned.iloc[i]
print(sample_row['review'])
print(sample_row['sentiment'])

Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother's slow awakening to what's happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they'd all be "up" for this movie.
positive


In [43]:
import re

def remove_html_tags(text):
    """Strip HTML tags from ``text``, leaving one space where each run of tags was.

    Replacing tags with nothing would glue the neighbouring words together
    (``"me.<br /><br />The"`` -> ``"me.The"``), so every run of consecutive
    tags, together with the whitespace around it, collapses to a single space.

    Only tag-shaped spans are removed: ``<`` followed by an optional ``/`` and
    a letter, up to the next ``>``. Stray comparison signs or emoticons such as
    ``"a < b"`` or ``"<3"`` are therefore left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with tag runs replaced by a single space. Text without tags is
        returned unchanged.
    """
    return re.sub(r'\s*(?:</?[A-Za-z][^<>]*>\s*)+', ' ', text)

def preprocess_text(doc):
    """Reduce a parsed document to a space-separated string of lowercase lemmas.

    Tokens flagged as stop words, punctuation or whitespace are dropped; every
    remaining token contributes its lowercased ``lemma_``.

    Args:
        doc: An iterable of tokens exposing ``lemma_``, ``is_stop``,
            ``is_punct`` and ``is_space``.

    Returns:
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    kept_lemmas = []
    for token in doc:
        # Skip anything that carries no content for downstream modelling.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

# Start from the untouched raw column in `df` rather than from df_cleaned, so
# re-running this cell never feeds already-lemmatized text back through the
# pipeline.
reviews_without_html = df['review'].apply(remove_html_tags)

# Consume nlp.pipe lazily: each Doc is reduced to its cleaned string as soon as
# it is produced, instead of keeping every parsed Doc alive in a list first.
df_cleaned['review'] = [
    preprocess_text(doc) for doc in nlp.pipe(reviews_without_html)
]

# review_length was copied over from the raw frame; recompute it so the column
# describes the cleaned text that is stored next to it.
df_cleaned['review_length'] = df_cleaned['review'].str.len()


In [None]:
# Display the cleaned review column (pandas truncates the long Series).
cleaned_reviews = df_cleaned['review']
print(cleaned_reviews)


0        reviewer mention watch 1 oz episode hook right...
1        wonderful little production filming technique ...
2        think wonderful way spend time hot summer week...
3        basically family little boy jake think zombie ...
4        petter mattei love time money visually stunnin...
                               ...                        
49995    think movie right good job creative original e...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic teach parochial elementary school nun...
49998    go disagree previous comment maltin second rat...
49999    expect star trek movie high art fan expect mov...
Name: review, Length: 50000, dtype: object


In [45]:
# Persist the cleaned frame next to the raw data, without the index column.
df_cleaned.to_csv(
    'Dataset/IMDB_Dataset_Cleaned.csv',
    index=False,
)

In [None]:
# Before/after comparison: first rows of the raw frame, then the cleaned one.
for frame in (df, df_cleaned):
    print(frame.head())


                                              review sentiment  review_length
0  One of the other reviewers has mentioned that ...  positive           1761
1  A wonderful little production. <br /><br />The...  positive            998
2  I thought this was a wonderful way to spend ti...  positive            926
3  Basically there's a family where a little boy ...  negative            748
4  Petter Mattei's "Love in the Time of Money" is...  positive           1317
                                              review sentiment  review_length
0  reviewer mention watch 1 oz episode hook right...  positive            935
1  wonderful little production filming technique ...  positive            572
2  think wonderful way spend time hot summer week...  positive            489
3  basically family little boy jake think zombie ...  negative            379
4  petter mattei love time money visually stunnin...  positive            768
