In [90]:
import numpy as np
import pandas as pd
import re

In [91]:
# Load the review dataset from a CSV file (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [92]:
# Peek at one raw review (index label 5) before any cleaning is applied.
df['review'][5]

'Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas\' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they\'d all be "up" for this movie.'

# Data Preprocessing

### Convert to lower case

In [93]:
# Normalise case so that later steps (stop-word matching, stemming) operate
# on lower-case text. The vectorised `.str` accessor replaces a per-row
# Python lambda and passes missing values through instead of raising
# AttributeError on them.
df['review'] = df['review'].str.lower()

### Remove html tags

In [94]:

def clean_text_remove_html_tags(text):
    """Strip HTML-like tags from ``text``.

    Every shortest ``<...>`` span is deleted. Because ``.`` does not match
    a newline here, a ``<`` whose closing ``>`` sits on a later line is
    left untouched.
    """
    return re.sub('<.*?>', '', text)
    
    

In [95]:
# Strip HTML tags from every review; the cleaned text overwrites the column.
df['review'] = df['review'].apply(clean_text_remove_html_tags)

In [96]:
# Re-inspect the same review (index label 5) after lower-casing and tag removal.
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

In [97]:
def remove_slash(text):
    """Delete every backslash that is immediately followed by an apostrophe.

    Both characters of the pair are removed; a lone backslash or a lone
    apostrophe is kept.

    NOTE(review): the ``\\'`` sequences visible in the notebook's displayed
    review look like Python repr escaping rather than literal backslashes
    in the data - confirm this step actually matches anything.
    """
    return re.sub(r"\\'", '', text)

In [98]:
# Drop backslash-apostrophe pairs from every review; overwrites the column.
df['review'] = df['review'].apply(remove_slash)

### Remove URLs

In [99]:
def remove_url(text):
    """Remove URLs from ``text``.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Whitespace around the URL is kept.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every matched URL replaced by the empty string.
    """
    # Raw string: in a plain literal, "\S" and "\." are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return re.sub(r'http[s]?://\S+|www\.\S+', '', text)

In [100]:
df['review'] = df['review'].apply(remove_url)

### Remove punctuation

In [101]:
import string
# Characters to strip from the reviews: the standard library's ASCII
# punctuation set. The bare name on the last line displays it.
exclude = string.punctuation 
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [102]:
def remmove_punctuation(text):
    """Return ``text`` with every character found in ``exclude`` removed.

    Args:
        text: String to clean.

    Returns:
        A copy of ``text`` without any of the characters listed in the
        module-level ``exclude`` string.
    """
    # One pass with a deletion table instead of calling str.replace once per
    # punctuation character encountered (which rescans the whole string each
    # time). ``exclude`` is read at call time, as before.
    return text.translate(str.maketrans('', '', exclude))

In [103]:
# Strip punctuation characters from every review; overwrites the column.
df['review'] = df['review'].apply(remmove_punctuation)

### Spelling correction

In [104]:
# from textblob import TextBlob
# 
# df['review'].apply(lambda x: TextBlob(x).correct().string)

### Remove stop words

In [105]:
from nltk.corpus import stopwords
# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): presumably requires the NLTK 'stopwords' data to be
# downloaded beforehand - confirm for a fresh environment.
stop_words = stopwords.words('english')

def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is split on whitespace, every token that appears in the
    module-level ``stop_words`` collection is discarded, and the remaining
    tokens are re-joined with single spaces.

    Args:
        text: String to filter.

    Returns:
        The surviving tokens joined by one space each (empty string if none
        survive).
    """
    # Set membership is constant-time; scanning the stop-word list for every
    # token is not. Built per call so the function still honours whatever
    # ``stop_words`` holds at call time.
    blocked = set(stop_words)
    return ' '.join(token for token in text.split() if token not in blocked)

In [106]:
# Remove stop words from every review; overwrites the column.
# NOTE(review): punctuation has already been stripped by this point, so any
# stop word that contains an apostrophe can no longer match its
# apostrophe-free form in the text - confirm this ordering is intended.
df['review'] = df['review'].apply(remove_stopwords)

### Stemming

In [109]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared across all reviews.
ps = PorterStemmer()

# Replace each whitespace-separated token by its stem and re-join the stems
# with single spaces; the result overwrites the column.
df['review'] = df['review'].apply(
    lambda x: ' '.join(ps.stem(token) for token in x.split())
)

In [116]:
# NOTE(review): this looks like a leftover scratch cell - the nested list of
# zeros saved as its output is not what this literal evaluates to, and the
# execution counter skips ahead here. TODO confirm, then restore the
# intended code or delete the cell.
0000

[[0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0]]