# Preprocessing Data

In [1]:
import re
import nltk
import string
from nltk.tokenize import TreebankWordTokenizer
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style

In [2]:
train_data = pd.read_csv('./data/train_dataset/final_train_dataset.csv')
test_data = pd.read_csv('./data/test_dataset/final_test_dataset.csv')

In [3]:
train_data.head()

Unnamed: 0,Reviews,Class
0,"Sorry everyone,,, I know this is supposed to b...",1
1,When I was little my parents took me along to ...,1
2,This film is mediocre at best. Angie Harmon is...,1
3,This film is one giant pant load. Paul Schrade...,1
4,This movie must be in line for the most boring...,1


# Cleaning of data

 Change column names to lowercase.

In [4]:
train_data.columns = train_data.columns.str.lower()
test_data.columns = test_data.columns.str.lower()

 Checking for null values in the dataset

In [5]:
test_data.isnull().any()

reviews    False
class      False
dtype: bool

 Lowercase all review data,  Removing all punctuations, Removing words with numbers,  Remove Tags and Html Tags, Tokenization, Removing Stopwords and  Parts of speech Tags

In [6]:
%%time

def data_cleaning(text):
    text = text.lower()

    words = text.split()
    table = str.maketrans("","",string.punctuation)
    stripped = [w.translate(table) for w in words]
    text = " ".join(stripped)

    text = re.sub('\w*\d\w*', "",text)

    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, '', text)

    tokenizer = nltk.tokenize.TreebankWordTokenizer()

    text = tokenizer.tokenize(text)

    stopwords = nltk.corpus.stopwords.words('english')

    text = [word for word in text if word not in stopwords]
    
    text = ' '.join(text)
    
    return text

Wall time: 0 ns


In [7]:
%%time

train_data['clean_reviews'] = train_data['reviews'].apply(data_cleaning)
test_data['clean_reviews'] = test_data['reviews'].apply(data_cleaning)

Wall time: 1min 9s


Check scores and Entropy

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()
from scipy.stats import entropy

In [9]:
%%time

def sentimentAnalysis(text):
    score = SentimentIntensityAnalyzer().polarity_scores(text)
    return(score)   

Wall time: 0 ns


In [10]:
%%time
    
def text_entropy(text):
    # we only consider UTF8 characters to compute the text entropy
    pk = [text.count(chr(i)) for i in range(256)]
    if sum(pk) == 0:
        text_entropy = None
    else:
        text_entropy = entropy(pk, base=2)
    return text_entropy 

Wall time: 0 ns


In [11]:
%%time

train_data['scores'] =  train_data['clean_reviews'].apply(sentimentAnalysis)
train_data =pd.concat([train_data.drop(['scores'], axis=1), train_data['scores'].apply(pd.Series)], axis=1)


test_data['scores'] =  test_data['clean_reviews'].apply(sentimentAnalysis)
test_data =pd.concat([test_data.drop(['scores'], axis=1), test_data['scores'].apply(pd.Series)], axis=1)

Wall time: 7min 24s


In [12]:
%%time

train_data['entropy'] =  train_data['clean_reviews'].apply(text_entropy)
test_data['entropy'] =  test_data['clean_reviews'].apply(text_entropy)

Wall time: 8.21 s


In [13]:
#Below done to re arrange the columns

# only run it when its the first time the model 

cols = list(train_data.columns)
cols2 = list(test_data.columns)
cols = [cols[0]]+[cols[2]]+[cols[3]]+[cols[4]]+[cols[5]]+[cols[6]]+[cols[7]]+[cols[1]]
cols2 = [cols2[0]]+[cols2[2]]+[cols2[3]]+[cols2[4]]+[cols2[5]]+[cols2[6]]+[cols2[7]]+[cols2[1]]
train_data = train_data[cols]
test_data = test_data[cols2]
del train_data['clean_reviews']
del train_data['reviews']
del test_data['clean_reviews']
del test_data['reviews']

In [14]:
test_data

Unnamed: 0,neg,neu,pos,compound,entropy,class
0,0.250,0.627,0.123,-0.9655,4.221264,1
1,0.207,0.633,0.160,-0.5143,4.115131,1
2,0.297,0.503,0.200,-0.9427,4.169845,1
3,0.095,0.688,0.217,0.8033,4.219201,1
4,0.190,0.628,0.182,-0.1027,4.150576,1
...,...,...,...,...,...,...
24995,0.028,0.583,0.390,0.9928,4.201828,10
24996,0.018,0.596,0.386,0.9896,4.132807,10
24997,0.044,0.682,0.274,0.9969,4.235828,10
24998,0.044,0.613,0.343,0.9718,4.149199,10


Filling up all the missing values in the procssed dataset

In [15]:
train_data['entropy'] = train_data['entropy'].fillna(0)
test_data['entropy'] = test_data['entropy'].fillna(0)

In [16]:
#When you need to save data to file uncomment the following line of code

# train_data.to_csv('./train_dataset/scoreAndPolarity_train_data.csv', header=True, index = False)
# test_data.to_csv('./test_dataset/scoreAndPolarity_test_data.csv', header=True, index = False)

In [17]:
train_data.isnull().any()

neg         False
neu         False
pos         False
compound    False
entropy     False
class       False
dtype: bool

Load Numeric data

In [18]:
df1 = pd.read_csv('./data/train_dataset/scoreAndPolarity_train_data.csv')
df2 = pd.read_csv('./data/test_dataset/scoreAndPolarity_test_data.csv')

## Loading and Implementing model

In [19]:
def printResult(y_pred, y_prob):
    acc = accuracy_score(df2["class"], y_pred)
    # Result
    print("Accuracy: {:.2f}".format(acc*100),end='\n\n')

In [20]:
train_features = df1.iloc[0:,0:5]
train_labels = df1.iloc[0:,-1]

In [21]:
test_features = df2.iloc[0:,0:5]
test_labels = df2.iloc[0:,-1]

In [22]:
model = RandomForestClassifier(n_estimators=500)
model.fit(train_features,train_labels)

RandomForestClassifier(n_estimators=500)

In [23]:
predict = model.predict(test_features)

Calculating mean sqaured error of the model computing the test label values to the test predicted values

In [24]:
mean_squared_error(test_labels,predict)

16.59252

In [25]:
dataset = pd.read_csv('./data/test_dataset/cleaned_test_data.csv')

Prediciton of a Review from the test dataset and spitting out a value from the model.

In [26]:
dataset.iloc[1000,0]

'I like to think of myself as a bad movie connoisseur. I like to think that the films most people label as the worst of all time I can easily withstand.<br /><br />But...there are exceptions. I can only recall three movies I have had the misfortune to see that I have repeatedly used the fast-forward button for large chunks of the story. Those movies are The Mighty Gorga, Night of the Seagulls, and this little crap, Deep Blood.<br /><br />In the world of Jaws ripoffs, this falls off the scale. Deep Blood doesn\'t have the realistic storyline of the original Crocodile, nor the incredible effects of The Sea Serpent, nor the commentary of Tintorera. No, instead we are treated to a handful of teens from any random failed \'80s public access sitcom battling bullies and the local sheriff.<br /><br />Shark attacks are realized by quick cuts of documentary footage with actors thrashing about in the water, occasionally with a bit of what appears to be orange-ish paint thrown into the water. Not 

In [45]:
test_labels.iloc[14526]

7

In [46]:
a= np.array(test_features.iloc[14526].values.tolist())
predict = model.predict(a.reshape(1, -1))

In [47]:
predict[0]

9

In [31]:
model.feature_importances_

array([0.18527441, 0.18066728, 0.18542106, 0.23268639, 0.21595087])