# Data preprocessing

In [23]:
# Import necessary libraries
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import os
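# One-time setup: uncomment to download the NLTK data this notebook relies on.
# nltk.download('punkt')      # tokenizer models for word_tokenize ('punkt_tab' on newer NLTK releases)
# nltk.download('stopwords')  # stop-word lists
# nltk.download('wordnet')    # lexical database used by WordNetLemmatizer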

In [24]:
# Directory the kernel is currently running in; used below as the base
# directory when locating the input CSV.
pwd = os.getcwd()

In [25]:
# Patterns compiled once at definition time rather than looked up on each call.
_HTML_TAG_RE = re.compile('<.*?>')
_DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s!?.,]")

# Lazily-filled cache for the NLTK objects that do not change between calls.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLTK_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _NLTK_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a single review and return it as a space-joined token string.

    Steps, in order: strip HTML tags, lowercase, drop every character that
    is not alphanumeric, whitespace or one of ``! ? . ,``, tokenize with
    NLTK, remove English stop words, and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str or None or float
        The raw review. ``None`` and float NaN (what pandas produces for an
        empty CSV cell) are treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing values would otherwise crash in re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Replace HTML tags with a space so adjacent words do not get glued together.
    text = _HTML_TAG_RE.sub(' ', text)

    text = text.lower()

    # Drop special characters. Note that '!', '?', '.' and ',' are kept, so
    # they survive as separate tokens in the output.
    # NOTE(review): apostrophes are removed here, before stop-word filtering,
    # so contractions such as "haven't" become "havent" and are no longer
    # recognised as stop words. Left unchanged to keep the output stable.
    text = _DISALLOWED_CHARS_RE.sub('', text)

    tokens = word_tokenize(text)

    # The stop-word set and lemmatizer are built once and reused; rebuilding
    # them for every review is pure overhead when applied to a whole column.
    stop_words, lemmatizer = _get_nltk_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [26]:
# Build the path with os.path.join instead of concatenating a hard-coded "//"
# separator, so the correct separator is used on every platform.
# NOTE(review): the loaded frame shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; consider index_col=0 after confirming
# nothing downstream relies on that column.
df_train = pd.read_csv(os.path.join(pwd, "final_dataset_train.csv"))

In [27]:
# Display the loaded frame to inspect its columns before any preprocessing.
df_train

Unnamed: 0,movie_id,movie_rating,review,label
0,tt0064354,3,Story of a man who has unnatural feelings for ...,bad
1,tt0100680,1,Robert DeNiro plays the most unbelievably inte...,bad
2,tt0100680,1,"I saw the capsule comment said ""great acting.""...",bad
3,tt0100680,4,If I had not read Pat Barker's 'Union Street' ...,bad
4,tt0047200,4,This fanciful horror flick has Vincent Price p...,bad
...,...,...,...,...
24995,tt0433360,7,What's Good About It: Some inventive and genui...,good
24996,tt0433360,8,For years we've been watching every horror fil...,good
24997,tt0324532,10,If you haven't already seen this movie of Mary...,good
24998,tt0324532,10,this movie is the best movie ever it has a lot...,good


In [28]:
# Run every raw review through preprocess_text and overwrite the 'review'
# column with the cleaned text.
df_train['review'] = df_train['review'].map(preprocess_text)

In [29]:
# Preview the first rows to check what the cleaned review text looks like.
df_train.head()

Unnamed: 0,movie_id,movie_rating,review,label
0,tt0064354,3,story man unnatural feeling pig . start openin...,bad
1,tt0100680,1,robert deniro play unbelievably intelligent il...,bad
2,tt0100680,1,saw capsule comment said great acting . opinio...,bad
3,tt0100680,4,"read pat barker union street seeing film , wou...",bad
4,tt0047200,4,fanciful horror flick vincent price playing ma...,bad


In [30]:
# Encode the sentiment labels as integers: 'bad' -> 0, 'good' -> 1.
label = {'bad': 0, 'good': 1}
# Series.map(dict) turns every value missing from the dict into NaN, so
# re-running this cell on the already-encoded column would wipe all labels.
# Passing unmapped values through unchanged makes the cell safe to re-run.
df_train['label'] = df_train['label'].map(lambda value: label.get(value, value))

In [31]:
# Display the frame again to confirm the 'label' column now holds integers.
df_train

Unnamed: 0,movie_id,movie_rating,review,label
0,tt0064354,3,story man unnatural feeling pig . start openin...,0
1,tt0100680,1,robert deniro play unbelievably intelligent il...,0
2,tt0100680,1,saw capsule comment said great acting . opinio...,0
3,tt0100680,4,"read pat barker union street seeing film , wou...",0
4,tt0047200,4,fanciful horror flick vincent price playing ma...,0
...,...,...,...,...
24995,tt0433360,7,whats good inventive genuinely creepy little e...,1
24996,tt0433360,8,"year weve watching every horror film come , du...",1
24997,tt0324532,10,"havent already seen movie marykate ashleys , s...",1
24998,tt0324532,10,movie best movie ever lot live action great ev...,1


In [32]:

# Persist the preprocessed frame to 'preprocessed_train.csv' (relative path);
# index=False keeps the DataFrame index out of the file.
df_train.to_csv('preprocessed_train.csv', index=False)

In [33]:
# Final preview of the frame that was just written to preprocessed_train.csv.
df_train.head()

Unnamed: 0,movie_id,movie_rating,review,label
0,tt0064354,3,story man unnatural feeling pig . start openin...,0
1,tt0100680,1,robert deniro play unbelievably intelligent il...,0
2,tt0100680,1,saw capsule comment said great acting . opinio...,0
3,tt0100680,4,"read pat barker union street seeing film , wou...",0
4,tt0047200,4,fanciful horror flick vincent price playing ma...,0
