In [1]:
import pandas as pd

df_imdb = pd.read_csv('IMDB Dataset.csv')
df_imdb = df_imdb.iloc[:len(df_imdb) // 5]
print(df_imdb.head())
print(df_imdb.shape)
print(df_imdb.dtypes)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
(10000, 2)
review       object
sentiment    object
dtype: object


In [2]:
nan_info = df_imdb.isnull().sum()

print("NaN values in the DataFrame:")
print(nan_info)

df_imdb_cleaned = df_imdb.dropna()

print("\nSize of the cleaned DataFrame:")
print(df_imdb_cleaned.shape)

NaN values in the DataFrame:
review       0
sentiment    0
dtype: int64

Size of the cleaned DataFrame:
(10000, 2)


In [3]:
print("First 5 reviews and their sentiment classifications:")
print(df_imdb_cleaned[['review', 'sentiment']].head())

First 5 reviews and their sentiment classifications:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [4]:
def count_words(text):
    words = text.split()
    return len(words)

df_imdb_cleaned['words count'] = df_imdb_cleaned['review'].apply(count_words)

print("DataFrame with 'words count' column:")
print(df_imdb_cleaned.head())

DataFrame with 'words count' column:
                                              review sentiment  words count
0  One of the other reviewers has mentioned that ...  positive          307
1  A wonderful little production. <br /><br />The...  positive          162
2  I thought this was a wonderful way to spend ti...  positive          166
3  Basically there's a family where a little boy ...  negative          138
4  Petter Mattei's "Love in the Time of Money" is...  positive          230


In [12]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def simple_preprocessing(text):
    text = text.lower()
    text = re.sub(r'<br\s*?/?>', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[@#]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

text_to_preprocess = "Sample text with <br/> HTML tags, URLs http://example.com, #hashtags, and @mentions. It needs cleaning!"
preprocessed_text = simple_preprocessing(text_to_preprocess)
print("Original Text:", text_to_preprocess)
print("Preprocessed Text:", preprocessed_text)

Original Text: Sample text with <br/> HTML tags, URLs http://example.com, #hashtags, and @mentions. It needs cleaning!
Preprocessed Text: sample text html tags urls hashtags mentions needs cleaning


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
df_imdb['review_cleaned'] = df_imdb['review'].apply(simple_preprocessing)
df_imdb

In [16]:
for index, row in df_imdb.head(5).iterrows():
    original_review = row['review']
    cleaned_review = row['review_cleaned']
    print(f"Original Review {index + 1}:\n{original_review}\n")
    print(f"Cleaned Review {index + 1}:\n{cleaned_review}\n")
    print("-----------------------------------------------------")

Original Review 1:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the sho

In [17]:
duplicated_reviews = df_imdb[df_imdb.duplicated(subset=['review_cleaned'])]

print("Duplicated Reviews:")
print(duplicated_reviews[['review', 'review_cleaned']])

df_imdb_cleaned = df_imdb.drop_duplicates(subset=['review_cleaned'])

print(f"Shape of DataFrame before removing duplicates: {df_imdb.shape}")
print(f"Shape of DataFrame after removing duplicates: {df_imdb_cleaned.shape}")

Duplicated Reviews:
                                                 review  \
3537  Quite what the producers of this appalling ada...   
3769  My favourite police series of all time turns t...   
4391  Beautiful film, pure Cassavetes style. Gena Ro...   
6352  If you liked the Grinch movie... go watch that...   
6479  I want very much to believe that the above quo...   
6672  Sigh. I'm baffled when I see a short like this...   
7221  I have always been a huge fan of "Homicide: Li...   
7222  There are plenty of comments already posted sa...   
7425  The movie was excellent, save for some of the ...   
7555  This movie has made me upset! When I think of ...   
8040  Contains spoilers. <br /><br />The British dir...   
8801  Hilarious, clean, light-hearted, and quote-wor...   
8868  Ik know it is impossible to keep all details o...   
8874  From director Barbet Schroder (Reversal of For...   
9057  So, I'm wondering while watching this film, di...   
9503  This is one of those star-fill

In [19]:
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

def stemming(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

df_imdb_cleaned['review_stemmed'] = df_imdb_cleaned['review_cleaned'].apply(stemming)

for index, row in df_imdb_cleaned.head(5).iterrows():
    cleaned_review = row['review_cleaned']
    stemmed_review = row['review_stemmed']
    print(f"Cleaned Review {index + 1}:\n{cleaned_review}\n")
    print(f"Stemmed Review {index + 1}:\n{stemmed_review}\n")
    print("-----------------------------------------------------")

Cleaned Review 1:
one reviewers mentioned watching 1 oz episode youll hooked right exactly happened first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home manyaryans muslims gangstas latinos christians italians irish moreso scuffles death stares dodgy dealings shady agreements never far away would say main appeal show due fact goes shows wouldnt dare forget pretty pictures painted mainstream audiences forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards wholl sold nickel inmates wholl kill order get away well mannered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imdb_cleaned['review_stemmed'] = df_imdb_cleaned['review_cleaned'].apply(stemming)


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

label_binarizer = LabelBinarizer()
df_imdb_cleaned['sentiment_binary'] = label_binarizer.fit_transform(df_imdb_cleaned['sentiment'])

X = df_imdb_cleaned['review_stemmed']
Y = df_imdb_cleaned['sentiment_binary']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

X_train shape: (7986,)
X_test shape: (1997,)
Y_train shape: (7986,)
Y_test shape: (1997,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imdb_cleaned['sentiment_binary'] = label_binarizer.fit_transform(df_imdb_cleaned['sentiment'])


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_test_tfidf shape: {X_test_tfidf.shape}")

X_train_tfidf shape: (7986, 5000)
X_test_tfidf shape: (1997, 5000)


In [23]:
from sklearn.model_selection import train_test_split

X_train_final, X_test_final, Y_train_final, Y_test_final = train_test_split(
    X_train_tfidf, Y_train, test_size=0.3, random_state=42
)

print(f"X_train_final shape: {X_train_final.shape}")
print(f"X_test_final shape: {X_test_final.shape}")
print(f"Y_train_final shape: {Y_train_final.shape}")
print(f"Y_test_final shape: {Y_test_final.shape}")

X_train_final shape: (5590, 5000)
X_test_final shape: (2396, 5000)
Y_train_final shape: (5590,)
Y_test_final shape: (2396,)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

logistic_regression_model = LogisticRegression(random_state=42)

logistic_regression_model.fit(X_train_final, Y_train_final)

Y_pred = logistic_regression_model.predict(X_test_final)

accuracy = accuracy_score(Y_test_final, Y_pred)
print(f"Accuracy Score: {accuracy}")

conf_matrix = confusion_matrix(Y_test_final, Y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy Score: 0.8647746243739566
Confusion Matrix:
[[1003  186]
 [ 138 1069]]


In [25]:
review1 = simple_preprocessing("I loved this movie!")
review2 = simple_preprocessing("This movie was a bad comedy movie!")

review1_tfidf = tfidf_vectorizer.transform([review1])
review2_tfidf = tfidf_vectorizer.transform([review2])

sentiment1 = logistic_regression_model.predict(review1_tfidf)[0]
sentiment2 = logistic_regression_model.predict(review2_tfidf)[0]

print(f"Review 1 Sentiment: {'Positive' if sentiment1 == 1 else 'Negative'}")
print(f"Review 2 Sentiment: {'Positive' if sentiment2 == 1 else 'Negative'}")

Review 1 Sentiment: Positive
Review 2 Sentiment: Negative
