In [141]:
import pandas as pd

Preprocessing Data

In [142]:
import re
import nltk
import string
from nltk.tokenize import TreebankWordTokenizer

In [143]:
# Read the reviews CSV into a DataFrame; the path is relative to the notebook's working directory.
dataset = pd.read_csv('train_dataset/final_train_dataset.csv')

Change column names to lowercase.

In [144]:
# Normalise header case so later cells can address the columns as `reviews` and `class`.
dataset.columns = dataset.columns.str.lower()

Checking for null values in the dataset

In [145]:
dataset.isnull().any()

reviews    False
class      False
dtype: bool

In [146]:
dataset

Unnamed: 0,reviews,class
0,"Sorry everyone,,, I know this is supposed to b...",1
1,When I was little my parents took me along to ...,1
2,This film is mediocre at best. Angie Harmon is...,1
3,This film is one giant pant load. Paul Schrade...,1
4,This movie must be in line for the most boring...,1
...,...,...
24995,...this is a classic with so many great dialog...,10
24996,The most hillarious and funny Brooks movie I e...,10
24997,"Life stinks is a parody of life and death, hap...",10
24998,This is the kind of film you want to see with ...,10


In [147]:
dataset.reviews.iloc[0]

'Sorry everyone,,, I know this is supposed to be an "art" film,, but wow, they should have handed out guns at the screening so people could blow their brains out and not watch. Although the scene design and photographic direction was excellent, this story is too painful to watch. The absence of a sound track was brutal. The loooonnnnng shots were too long. How long can you watch two people just sitting there and talking? Especially when the dialogue is two people complaining. I really had a hard time just getting through this film. The performances were excellent, but how much of that dark, sombre, uninspired, stuff can you take? The only thing i liked was Maureen Stapleton and her red dress and dancing scene. Otherwise this was a ripoff of Bergman. And i\'m no fan f his either. I think anyone who says they enjoyed 1 1/2 hours of this is,, well, lying.'

Lowercase all review data

In [148]:
def lower_casing(text):
    """Return ``text`` converted to lowercase with ``str.lower``."""
    return text.lower()


def round1(x):
    """Preprocessing round 1: forward ``x`` to ``lower_casing``."""
    return lower_casing(x)
# Round 1 applied to every review; the resulting Series is turned back into a one-column frame.
lower_case_dataset = dataset.reviews.apply(round1).to_frame()

Removing all punctuation

In [149]:
def remove_punctuations(text):
    """Delete every ``string.punctuation`` character from ``text``.

    The text is split on whitespace, punctuation is removed from each piece and
    the pieces are re-joined with single spaces. A piece consisting only of
    punctuation becomes an empty string that still takes part in the join, so
    two consecutive spaces can appear in the result.
    """
    punctuation_eraser = str.maketrans("", "", string.punctuation)
    return " ".join(piece.translate(punctuation_eraser) for piece in text.split())


def round2(x):
    """Preprocessing round 2: forward ``x`` to ``remove_punctuations``."""
    return remove_punctuations(x)

In [150]:
# Round 2 applied to the lowercased reviews; result kept as a one-column frame.
removed_punctuation_dataset = lower_case_dataset.reviews.apply(round2).to_frame()

In [151]:
removed_punctuation_dataset

Unnamed: 0,reviews
0,sorry everyone i know this is supposed to be a...
1,when i was little my parents took me along to ...
2,this film is mediocre at best angie harmon is ...
3,this film is one giant pant load paul schrader...
4,this movie must be in line for the most boring...
...,...
24995,this is a classic with so many great dialogs a...
24996,the most hillarious and funny brooks movie i e...
24997,life stinks is a parody of life and death happ...
24998,this is the kind of film you want to see with ...


Removing words with numbers

In [152]:
def remove_words_with_numbers(text):
    """Delete every run of word characters that contains at least one digit.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with each such run replaced by the empty string. Whitespace
        around a removed word is left untouched, so consecutive spaces can
        remain where a word was deleted.
    """
    # Raw string: in a normal string literal the regex escapes are invalid
    # Python escape sequences and trigger a warning on current interpreters.
    return re.sub(r'\w*\d\w*', "", text)


def round3(x):
    """Preprocessing round 3: forward ``x`` to ``remove_words_with_numbers``."""
    return remove_words_with_numbers(x)

In [153]:
# Round 3 applied to the punctuation-free reviews; result kept as a one-column frame.
remove_words_with_numbers_dataset = removed_punctuation_dataset.reviews.apply(round3).to_frame()

In [154]:
remove_words_with_numbers_dataset

Unnamed: 0,reviews
0,sorry everyone i know this is supposed to be a...
1,when i was little my parents took me along to ...
2,this film is mediocre at best angie harmon is ...
3,this film is one giant pant load paul schrader...
4,this movie must be in line for the most boring...
...,...
24995,this is a classic with so many great dialogs a...
24996,the most hillarious and funny brooks movie i e...
24997,life stinks is a parody of life and death happ...
24998,this is the kind of film you want to see with ...


Remove HTML tags

In [155]:
import re

In [156]:
def cleanhtml(text):
    """Remove every ``<...>`` span from ``text``.

    The pattern is non-greedy, so each match stops at the nearest ``>``. The
    dot does not match a newline, so a span broken across lines is left in
    place.
    """
    # NOTE(review): this notebook applies the step after remove_punctuations, which
    # has already deleted "<" and ">" - by then there is presumably nothing left to match.
    return re.sub('<.*?>', '', text)


def round4(x):
    """Preprocessing round 4: forward ``x`` to ``cleanhtml``."""
    return cleanhtml(x)

In [157]:
# Round 4 applied to the digit-free reviews; result kept as a one-column frame and displayed.
remove_words_without_html = remove_words_with_numbers_dataset.reviews.apply(round4).to_frame()
remove_words_without_html

Unnamed: 0,reviews
0,sorry everyone i know this is supposed to be a...
1,when i was little my parents took me along to ...
2,this film is mediocre at best angie harmon is ...
3,this film is one giant pant load paul schrader...
4,this movie must be in line for the most boring...
...,...
24995,this is a classic with so many great dialogs a...
24996,the most hillarious and funny brooks movie i e...
24997,life stinks is a parody of life and death happ...
24998,this is the kind of film you want to see with ...


Tokenization

In [158]:
import nltk

In [159]:
# Treebank-style tokenizer instance used by tokenize_words.
# The `reviews_tokenized` column is created by the tokenization cell that follows,
# so no placeholder copy of `reviews` is set up here.
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [160]:
def tokenize_words(text):
    """Split ``text`` into tokens with the notebook-level ``tokenizer``.

    Returns whatever ``tokenizer.tokenize`` produces for ``text``.
    """
    return tokenizer.tokenize(text)

# Tokenize each cleaned review; the function is handed to apply directly.
remove_words_without_html['reviews_tokenized'] = remove_words_without_html['reviews'].apply(tokenize_words)

In [161]:
remove_words_without_html

Unnamed: 0,reviews,reviews_tokenized
0,sorry everyone i know this is supposed to be a...,"[sorry, everyone, i, know, this, is, supposed,..."
1,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ..."
2,this film is mediocre at best angie harmon is ...,"[this, film, is, mediocre, at, best, angie, ha..."
3,this film is one giant pant load paul schrader...,"[this, film, is, one, giant, pant, load, paul,..."
4,this movie must be in line for the most boring...,"[this, movie, must, be, in, line, for, the, mo..."
...,...,...
24995,this is a classic with so many great dialogs a...,"[this, is, a, classic, with, so, many, great, ..."
24996,the most hillarious and funny brooks movie i e...,"[the, most, hillarious, and, funny, brooks, mo..."
24997,life stinks is a parody of life and death happ...,"[life, stinks, is, a, parody, of, life, and, d..."
24998,this is the kind of film you want to see with ...,"[this, is, the, kind, of, film, you, want, to,..."


Removing Stopwords

In [162]:
# English stopword list taken from the NLTK corpus reader; consumed by removing_stopwords.
# NOTE(review): presumably needs the NLTK `stopwords` corpus to be downloaded beforehand - confirm.
stopwords = nltk.corpus.stopwords.words('english')

In [163]:
def removing_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: iterable of tokens (the notebook passes the token lists stored in
            the ``reviews_tokenized`` column).

    Returns:
        A new list holding the surviving tokens in their original order.

    Relies on the notebook-level ``stopwords`` collection.
    """
    # Testing membership against a list rescans it for every token; a set keeps
    # each lookup constant-time on average and filters exactly the same tokens.
    stopword_lookup = set(stopwords)
    return [word for word in text if word not in stopword_lookup]

In [164]:
# Filter stopwords out of every token list; the function is handed to apply directly.
remove_words_without_html['removed_stopwords'] = remove_words_without_html['reviews_tokenized'].apply(removing_stopwords)

In [165]:
remove_words_without_html

Unnamed: 0,reviews,reviews_tokenized,removed_stopwords
0,sorry everyone i know this is supposed to be a...,"[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, art, film, w..."
1,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i..."
2,this film is mediocre at best angie harmon is ...,"[this, film, is, mediocre, at, best, angie, ha...","[film, mediocre, best, angie, harmon, funny, b..."
3,this film is one giant pant load paul schrader...,"[this, film, is, one, giant, pant, load, paul,...","[film, one, giant, pant, load, paul, schrader,..."
4,this movie must be in line for the most boring...,"[this, movie, must, be, in, line, for, the, mo...","[movie, must, line, boring, movie, years, even..."
...,...,...,...
24995,this is a classic with so many great dialogs a...,"[this, is, a, classic, with, so, many, great, ...","[classic, many, great, dialogs, scenes, nobody..."
24996,the most hillarious and funny brooks movie i e...,"[the, most, hillarious, and, funny, brooks, mo...","[hillarious, funny, brooks, movie, ever, seen,..."
24997,life stinks is a parody of life and death happ...,"[life, stinks, is, a, parody, of, life, and, d...","[life, stinks, parody, life, death, happiness,..."
24998,this is the kind of film you want to see with ...,"[this, is, the, kind, of, film, you, want, to,...","[kind, film, want, see, glass, wine, fire, fee..."


In [166]:
# Attach the `class` column from the original frame; DataFrame.join aligns it on the index.
removed_stopwords_dataset = remove_words_without_html.join(dataset['class']) 
# Currently disabled; enabling it would drop the intermediate token column:
# del removed_stopwords_dataset['reviews_tokenized']
removed_stopwords_dataset

Unnamed: 0,reviews,reviews_tokenized,removed_stopwords,class
0,sorry everyone i know this is supposed to be a...,"[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, art, film, w...",1
1,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i...",1
2,this film is mediocre at best angie harmon is ...,"[this, film, is, mediocre, at, best, angie, ha...","[film, mediocre, best, angie, harmon, funny, b...",1
3,this film is one giant pant load paul schrader...,"[this, film, is, one, giant, pant, load, paul,...","[film, one, giant, pant, load, paul, schrader,...",1
4,this movie must be in line for the most boring...,"[this, movie, must, be, in, line, for, the, mo...","[movie, must, line, boring, movie, years, even...",1
...,...,...,...,...
24995,this is a classic with so many great dialogs a...,"[this, is, a, classic, with, so, many, great, ...","[classic, many, great, dialogs, scenes, nobody...",10
24996,the most hillarious and funny brooks movie i e...,"[the, most, hillarious, and, funny, brooks, mo...","[hillarious, funny, brooks, movie, ever, seen,...",10
24997,life stinks is a parody of life and death happ...,"[life, stinks, is, a, parody, of, life, and, d...","[life, stinks, parody, life, death, happiness,...",10
24998,this is the kind of film you want to see with ...,"[this, is, the, kind, of, film, you, want, to,...","[kind, film, want, see, glass, wine, fire, fee...",10


Part-of-speech tags

In [167]:
def pos_tagging(text):
    """Tag the tokens in ``text`` using NLTK's ``pos_tag`` with the universal tagset.

    Returns the result of ``nltk.pos_tag`` unchanged; the notebook output shows
    it as a list of ``(token, tag)`` pairs.
    """
    return nltk.pos_tag(text, tagset='universal')

In [168]:
# Tag every stopword-free token list; the function is handed to apply directly.
removed_stopwords_dataset['POS_Tags'] = removed_stopwords_dataset['removed_stopwords'].apply(pos_tagging)

In [169]:
removed_stopwords_dataset

Unnamed: 0,reviews,reviews_tokenized,removed_stopwords,class,POS_Tags
0,sorry everyone i know this is supposed to be a...,"[sorry, everyone, i, know, this, is, supposed,...","[sorry, everyone, know, supposed, art, film, w...",1,"[(sorry, ADJ), (everyone, NOUN), (know, VERB),..."
1,when i was little my parents took me along to ...,"[when, i, was, little, my, parents, took, me, ...","[little, parents, took, along, theater, see, i...",1,"[(little, ADJ), (parents, NOUN), (took, VERB),..."
2,this film is mediocre at best angie harmon is ...,"[this, film, is, mediocre, at, best, angie, ha...","[film, mediocre, best, angie, harmon, funny, b...",1,"[(film, NOUN), (mediocre, NOUN), (best, ADJ), ..."
3,this film is one giant pant load paul schrader...,"[this, film, is, one, giant, pant, load, paul,...","[film, one, giant, pant, load, paul, schrader,...",1,"[(film, NOUN), (one, NUM), (giant, NOUN), (pan..."
4,this movie must be in line for the most boring...,"[this, movie, must, be, in, line, for, the, mo...","[movie, must, line, boring, movie, years, even...",1,"[(movie, NOUN), (must, VERB), (line, NOUN), (b..."
...,...,...,...,...,...
24995,this is a classic with so many great dialogs a...,"[this, is, a, classic, with, so, many, great, ...","[classic, many, great, dialogs, scenes, nobody...",10,"[(classic, ADJ), (many, ADJ), (great, ADJ), (d..."
24996,the most hillarious and funny brooks movie i e...,"[the, most, hillarious, and, funny, brooks, mo...","[hillarious, funny, brooks, movie, ever, seen,...",10,"[(hillarious, ADJ), (funny, ADJ), (brooks, NOU..."
24997,life stinks is a parody of life and death happ...,"[life, stinks, is, a, parody, of, life, and, d...","[life, stinks, parody, life, death, happiness,...",10,"[(life, NOUN), (stinks, NOUN), (parody, VERB),..."
24998,this is the kind of film you want to see with ...,"[this, is, the, kind, of, film, you, want, to,...","[kind, film, want, see, glass, wine, fire, fee...",10,"[(kind, NOUN), (film, NOUN), (want, VERB), (se..."
