In [1]:
#import packages
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import string
import re
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [2]:
#data
df = pd.read_csv("imdb/IMDB_Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
#new column based on sentiment
df['sent_value'] = np.where(df['sentiment'] == 'positive', 1, 0)

In [5]:
#new column with the string length
df['text_length'] = [len(i) for i in df.review]

In [6]:
#deleting html tags
df['review'] = [BeautifulSoup(i).get_text() for i in df['review'] ]

In [7]:
#1st type of dicitionary for negations
negations = {"isn\'t":"is not", "aren\'t":"are not", "wasn\'t":"was not", "weren\'t":"were not",
                "haven\'t":"have not","hasn\'t":"has not","hadn\'t":"had not","won\'t":"will not",
                "wouldn\'t":"would not", "don\'t":"do not", "doesn\'t":"does not","didn\'t":"did not",
                "can\'t":"can not","couldn\'t":"could not","shouldn\'t":"should not","mightn\'t":"might not",
                "mustn\'t":"must not"}

In [8]:
#2nd type of dicitionary for negations
negations_2 = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

In [9]:
#negations function 1st type
def negations_func(review):
    for word in review.split():
        if word.lower() in negations:
            review = review.replace(word, negations[word.lower()])
    return review

In [10]:
#negations function 2nd type
def negations_func_2(review):
    for word in review.split():
        if word.lower() in negations_2:
            review = review.replace(word, negations_2[word.lower()])
    return review

In [11]:
#replaced the negations with the form from above 
df['review_new'] = [negations_func(i) for i in df['review']]
df['review_new'] = [negations_func_2(i) for i in df['review']]

In [12]:
#Delete everything besides letters
df['review_new'] = [re.sub("[^a-zA-Z]", " ", i) for i in df['review_new'] ]
#Lowercase
df['review_new'] = [i.lower() for i in df['review_new']]

In [13]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Removing stopwords

In [14]:
stop_words = set(stopwords.words('english')) 

In [15]:
stop_words;

In [16]:
word_tokenize(df.review_new[3]);

In [17]:
#function for filtering stopwords
def wordTokenize(review):
    tokens_no_stopwords = [w for w in word_tokenize(review) if not w in stop_words]
    filtered_sentence = (" ").join(tokens_no_stopwords)
    return filtered_sentence

In [18]:
df['tokenized'] = [wordTokenize(i) for i in df['review_new']]

In [19]:
df = df.drop(['review','review_new'],axis=1)

In [20]:
df['tokenized_backup'] = df['tokenized']

In [21]:
df = df.drop(['tokenized_backup'],axis=1)

In [22]:
df

Unnamed: 0,sentiment,sent_value,text_length,tokenized
0,positive,1,1761,one reviewers mentioned watching oz episode ho...
1,positive,1,998,wonderful little production filming technique ...
2,positive,1,926,thought wonderful way spend time hot summer we...
3,negative,0,748,basically family little boy jake thinks zombie...
4,positive,1,1317,petter mattei love time money visually stunnin...
...,...,...,...,...
49995,positive,1,1008,thought movie right good job creative original...
49996,negative,0,642,bad plot bad dialogue bad acting idiotic direc...
49997,negative,0,1280,catholic taught parochial elementary schools n...
49998,negative,0,1234,going disagree previous comment side maltin on...


In [24]:
x = df.tokenized
y = df.sent_value

In [211]:
#splitting data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.1, random_state = 100)

Term Frequency Calculation

In [212]:
cvec = CountVectorizer()
cvec.fit(x_train)

CountVectorizer()

In [213]:
negative_terms = cvec.transform(x_train[y_train == 0])
positive_words = cvec.transform(x_train[y_train == 1])

In [214]:
positive_words

<22503x96902 sparse matrix of type '<class 'numpy.int64'>'
	with 2199673 stored elements in Compressed Sparse Row format>

In [215]:
neg_tf = np.sum(negative_terms,axis=0)
pos_tf = np.sum(positive_words,axis=0)

In [216]:
pos_tf

matrix([[6, 4, 0, ..., 0, 0, 0]], dtype=int64)

In [217]:
#convert matrix into array and reduce the dimension of it
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))

In [218]:
pos

array([6, 4, 0, ..., 0, 0, 0], dtype=int64)

In [219]:
df_tf = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()

In [220]:
df_tf.columns = ['negative', 'positive']

In [221]:
df_tf['total'] = df_tf['negative'] + df_tf['positive']

In [222]:
#df_tf = df_tf.sort_values(by='total', ascending=False)

In [223]:
df_tf

Unnamed: 0,negative,positive,total
aa,10,6,16
aaa,9,4,13
aaaaaaaaaaaahhhhhhhhhhhhhh,1,0,1
aaaaaaaargh,0,1,1
aaaaaaah,1,0,1
...,...,...,...
zzzzzzzzzzzzpop,1,0,1
zzzzzzzzzzzzz,2,0,2
zzzzzzzzzzzzzzzzzz,1,0,1
zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,1,0,1


In [224]:
df_tf_backup = df_tf

In [225]:
df_tf['positive_rate'] = df_tf['positive']/df_tf['total']
df_tf['negative_rate'] = df_tf['negative']/df_tf['total']

In [226]:
df_tf = df_tf.sort_values(by='positive_rate', ascending=False)

In [227]:
df_tf

Unnamed: 0,negative,positive,total,positive_rate,negative_rate
finisterre,0,1,1,1.0,0.0
signboard,0,1,1,1.0,0.0
gdr,0,1,1,1.0,0.0
signage,0,1,1,1.0,0.0
geart,0,1,1,1.0,0.0
...,...,...,...,...,...
hannay,1,0,1,0.0,1.0
hanneke,1,0,1,0.0,1.0
secretions,4,0,4,0.0,1.0
hannelore,1,0,1,0.0,1.0


In [228]:
positive_rate_mean = df_tf.positive_rate
y_predicted = []

In [229]:
for p in x_test:
    p_score = [positive_rate_mean[w] for w in p.split() if w in positive_rate_mean.index]
    if len(p_score) > 0:
        prob_score = np.mean(p_score)
    else:
        prob_score = np.random.random()
    y_predicted.append(prob_score)

In [230]:
#the prediction itself
prediction = [1 if p > 0.51 else 0 for p in y_predicted]

In [231]:
#accuracy score
accuracy_score(y_test,prediction)

0.863