# Required Packages

In [1]:
import pandas as pd
from pandas import DataFrame
import re
import nltk
import string
import pickle
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import mean_squared_error

In [None]:
# Read the train and test CSV files (in that order) into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train_dataset/final_train_dataset.csv',
        'test_dataset/final_test_dataset.csv',
    )
)

5 rows of train dataset

# Clean DataSet

Convert the column names to lowercase

In [None]:
# Normalise header casing on both frames so later cells can use lowercase column names.
for frame in (train_data, test_data):
    frame.columns = frame.columns.str.lower()

In [None]:
%%time


# Characters that are turned into a space after stop-word removal.
_PUNCTUATION_FILTERS = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
_PUNCTUATION_TO_SPACE = str.maketrans(_PUNCTUATION_FILTERS,
                                      ' ' * len(_PUNCTUATION_FILTERS))

# Stop-word set shared by every cleanText call; built on first use.
_REVIEW_STOP_WORDS = None


def _review_stop_words():
    """Return the stop-word set used by cleanText, building it only once."""
    global _REVIEW_STOP_WORDS
    if _REVIEW_STOP_WORDS is None:
        words = set(stopwords.words('english'))
        # These five words are deliberately kept in the text.
        words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        # Contraction fragments produced by the tokenizer plus a few modal verbs.
        words.update({"'m", "n't", "'d", "'re", "'s",
                      'would', 'must', "'ve", "'ll", 'may'})
        _REVIEW_STOP_WORDS = frozenset(words)
    return _REVIEW_STOP_WORDS


def cleanText(text):
    """Clean one raw review and return it as a space-separated string.

    Steps: lowercase, strip ``<...>`` tags, expand common contractions,
    replace digits and hyphens with spaces, tokenize, drop stop words,
    replace punctuation with spaces, and drop one-character tokens.

    Parameters
    ----------
    text : str
        Raw review. ``None`` and NaN yield an empty string; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned review, tokens joined by single spaces.
    """
    # Missing values (None / float NaN) would otherwise raise inside re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase first so that "Won't" / "Can't" match the contraction rules below.
    text = str(text).lower()

    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'re", " are", text)

    text = re.sub(r"[0-9]+", ' ', text)
    text = re.sub(r"-", ' ', text)
    text = text.strip()

    stop_words = _review_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # The filter string contains the apostrophe, so one translate pass covers it too.
    text = ' '.join(kept_tokens).translate(_PUNCTUATION_TO_SPACE)

    # split()/join also collapses any run of whitespace into a single space.
    return ' '.join(word for word in text.split() if len(word) > 1)

In [None]:
%%time

def NormalizeWithPOS(text):
    """Lemmatize each token according to its POS tag, then stem it.

    Tokens whose tag starts with J, V, N or R are lemmatized as adjective,
    verb, noun or adverb respectively; all other tokens skip lemmatization.
    Every token is then passed through the Porter stemmer.

    Returns the processed tokens joined by single spaces.
    """
    # First letter of the POS tag -> WordNet part-of-speech code.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    normalized_tokens = []
    for token, tag in pos_tag(word_tokenize(text)):
        wordnet_pos = wordnet_pos_by_prefix.get(tag[:1])
        if wordnet_pos is None:
            lemma = token
        else:
            lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
        normalized_tokens.append(stemmer.stem(lemma))

    return ' '.join(normalized_tokens)

In [None]:
%%time
# Stage 1: cleaned text for both frames (train first, then test).
for frame in (train_data, test_data):
    frame['clean_reviews'] = frame['reviews'].apply(cleanText)

# Stage 2: POS-aware lemmatized + stemmed text derived from the cleaned column.
for frame in (train_data, test_data):
    frame['clean_reviews_normalized'] = frame['clean_reviews'].apply(NormalizeWithPOS)

In [None]:
# Checkpoint both frames before low-frequency word removal.
for frame, pickle_path in (
    (train_data, './pickle_data/train_data_preLowFreq.pkl'),
    (test_data, './pickle_data/test_data_preLowFreq.pkl'),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(frame, pickle_file)

Load the pickled data for low-frequency word removal

In [2]:
# Restore the checkpointed frames (train first, then test).
# pickle.load must only be used on files produced by this notebook.
loaded_frames = []
for pickle_path in (
    './pickle_data/train_data_preLowFreq.pkl',
    './pickle_data/test_data_preLowFreq.pkl',
):
    with open(pickle_path, 'rb') as pickle_file:
        loaded_frames.append(pickle.load(pickle_file))
train_data, test_data = loaded_frames

### Low Frequency Words of Train Data for BOW 

In [3]:
# Token counts over every normalized training review, most frequent first.
freq_train1 = pd.Series(
    ' '.join(train_data['clean_reviews_normalized']).split()
).value_counts()
# Keep only the tokens seen fewer than five times.
less_five_freq_train1 = freq_train1.loc[freq_train1.lt(5)]
print(less_five_freq_train1)

wopat       4
gorier      4
briberi     4
ashleigh    4
crossbre    4
           ..
chancho     1
unredem     1
helix       1
alto        1
ãlvaro     1
Length: 31145, dtype: int64


In [4]:
# Token counts over every cleaned (non-normalized) training review.
freq_train2 = pd.Series(
    ' '.join(train_data['clean_reviews']).split()
).value_counts()
# Keep only the tokens seen fewer than five times.
less_five_freq_train2 = freq_train2.loc[freq_train2.lt(5)]
print(less_five_freq_train2)

wheezing        4
salina          4
relished        4
unrecognized    4
lemoine         4
               ..
callousness     1
tarkosvky       1
numerical       1
brennanâ…       1
herâ…but        1
Length: 45829, dtype: int64


### Low Frequency Words of Test Data for BOW

In [5]:
# Token counts over every normalized test review.
freq_test3 = pd.Series(
    ' '.join(test_data['clean_reviews_normalized']).split()
).value_counts()
# Keep only the tokens seen fewer than five times.
less_five_freq_test3 = freq_test3.loc[freq_test3.lt(5)]
print(less_five_freq_test3)

bavaria       4
roxann        4
firehous      4
elmo          4
malaya        4
             ..
emtpi         1
jewess        1
lowerclass    1
beligium      1
dogey         1
Length: 30688, dtype: int64


In [6]:
# Token counts over every cleaned (non-normalized) test review.
freq_test4 = pd.Series(
    ' '.join(test_data['clean_reviews']).split()
).value_counts()
# Keep only the tokens seen fewer than five times.
less_five_freq_test4 = freq_test4.loc[freq_test4.lt(5)]
print(less_five_freq_test4)

senility     4
rosenthal    4
wack         4
unfazed      4
tarkowsky    4
            ..
achenbach    1
ugghhhh      1
mussing      1
iistening    1
envies       1
Length: 45428, dtype: int64


### Remove words with frequency less than 5 

In [7]:
%%time

# The rare-word lists are taken from the TRAINING set only and applied to both
# frames. Pruning the test set with its own frequencies would delete words that
# are common in training (and therefore part of the model's vocabulary) merely
# because they happen to be rare in the test split.
rare_words_normalized = set(less_five_freq_train1.index)
rare_words_clean = set(less_five_freq_train2.index)


def _drop_words(review, unwanted):
    """Return `review` without the whitespace-separated tokens found in `unwanted`."""
    return ' '.join(word for word in review.split() if word not in unwanted)


for frame in (train_data, test_data):
    frame['clean_reviews_normalized'] = frame['clean_reviews_normalized'].apply(
        _drop_words, args=(rare_words_normalized,)
    )
    frame['clean_reviews'] = frame['clean_reviews'].apply(
        _drop_words, args=(rare_words_clean,)
    )

Wall time: 10.1 s


In [8]:
# Checkpoint both frames after low-frequency word removal.
for frame, pickle_path in (
    (train_data, './pickle_data/train_data_final.pkl'),
    (test_data, './pickle_data/test_data_final.pkl'),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(frame, pickle_file)

### A visual comparison of different cleaned data

In [None]:
# Fetch the first training row once and show it at each preprocessing stage.
first_row = train_data.iloc[0]

print("A review example of dataset before cleaning:", end="\n\n")
print(first_row['reviews'], end='\n\n')

for heading, column in (
    ("clean_text:", 'clean_reviews'),
    ("clean_text_normalized:", 'clean_reviews_normalized'),
):
    print(heading)
    print(first_row[column], end="\n\n")

# Create BOW

In [None]:
# Stop words are already removed by cleanText, which deliberately keeps
# 'no', 'not', 'nor', 'too' and 'any'. scikit-learn's built-in 'english' list
# contains those words, so applying it here would strip them again.
vectorizer = CountVectorizer(stop_words=None)

In [None]:
%%time

# A fitted CountVectorizer holds a single vocabulary, so the normalized column
# gets its own instance (same parameters as `vectorizer`). Re-fitting one shared
# object would discard the normalized vocabulary as soon as the second fit runs.
vectorizer_normalized = CountVectorizer(**vectorizer.get_params())
training_features_normalized = vectorizer_normalized.fit_transform(train_data['clean_reviews_normalized'])
testing_features_normalized = vectorizer_normalized.transform(test_data['clean_reviews_normalized'])

# `vectorizer` keeps the vocabulary of the cleaned (non-normalized) text.
training_features = vectorizer.fit_transform(train_data['clean_reviews'])
testing_features = vectorizer.transform(test_data['clean_reviews'])

Pickling count vectorized data for further use.

In [None]:
# with open('./pickle_data/training_features_normalized.pkl', 'wb') as pickle_file:
#     pickle.dump(training_features_normalized, pickle_file)
# with open('./pickle_data/testing_features_normalized.pkl', 'wb') as pickle_file:
#     pickle.dump(testing_features_normalized, pickle_file)
# with open('./pickle_data/training_features.pkl', 'wb') as pickle_file:
#     pickle.dump(training_features, pickle_file)
# with open('./pickle_data/testing_features.pkl', 'wb') as pickle_file:
#     pickle.dump(testing_features, pickle_file)

In [None]:
# with open('./pickle_data/training_features_normalized.pkl', 'rb') as pickle_file:
#     training_features_normalized = pickle.load(pickle_file)
# with open('./pickle_data/testing_features_normalized.pkl', 'rb') as pickle_file:
#     testing_features_normalized = pickle.load(pickle_file)
# with open('./pickle_data/training_features.pkl', 'rb') as pickle_file:
#     training_features = pickle.load(pickle_file)
# with open('./pickle_data/testing_features.pkl', 'rb') as pickle_file:
#     testing_features = pickle.load(pickle_file)

In [None]:
# Shape of the bag-of-words matrix built from the training `clean_reviews` column.
print(training_features.shape)

In [None]:
# Shape of the bag-of-words matrix built from the test `clean_reviews` column.
print(testing_features.shape)

In [None]:
# Shape of the bag-of-words matrix built from the training `clean_reviews_normalized` column.
print(training_features_normalized.shape)

In [None]:
# Shape of the bag-of-words matrix built from the test `clean_reviews_normalized` column.
print(testing_features_normalized.shape)

In [None]:
def printResult(y_pred, y_prob, y_true=None):
    """Print the accuracy of `y_pred` as a percentage with two decimals.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like
        Accepted so existing calls keep working; it is not used.
    y_true : array-like, optional
        True class labels. Defaults to the notebook-level ``test_data["class"]``
        so that the two-argument call scores the test set as before.
    """
    if y_true is None:
        y_true = test_data["class"]
    acc = accuracy_score(y_true, y_pred)
    # Result
    print("Accuracy: {:.2f}".format(acc*100),end='\n\n')

# RandomForest Model 

In [None]:
# with open('./pickle_data/train_data.pkl', 'rb') as pickle_file:
#     train_data = pickle.load(pickle_file)
# with open('./pickle_data/test_data.pkl', 'rb') as pickle_file:
#     test_data = pickle.load(pickle_file)

In [None]:
# freq_train = pd.Series(' '.join(train_data['clean_reviews_normalized']).split()).value_counts()
# less_five_freq_train = freq_train[(freq_train <5)]
# print(less_five_freq_train)

In [None]:
# freq_test = pd.Series(' '.join(test_data['clean_reviews_normalized']).split()).value_counts()
# less_five_freq_test = freq_test[(freq_test <5)]
# print(less_five_freq_test)

In [None]:
%%time

# train_data['clean_reviews_normalized'] = train_data['clean_reviews_normalized'].apply(lambda x: ' '.join(x for x in x.split() if x not in less_five_freq_train))
# test_data['clean_reviews_normalized'] = test_data['clean_reviews_normalized'].apply(lambda x: ' '.join(x for x in x.split() if x not in less_five_freq_test))

In [None]:
# Two identically configured forests: `model` for the cleaned text,
# `model_normalized` for the lemmatized + stemmed text.
model, model_normalized = (
    RandomForestClassifier(n_estimators=100, random_state=0)
    for _ in range(2)
)

In [None]:
%%time 
# Train each forest on its own feature matrix; both use the training `class` column as target.
model.fit(training_features, train_data["class"])
model_normalized.fit(training_features_normalized, train_data["class"])

In [None]:
# Test-set predictions from the model trained on normalized text.
predict_normalized = model_normalized.predict(testing_features_normalized)

In [None]:
# Test-set predictions from the model trained on cleaned (non-normalized) text.
predict = model.predict(testing_features)

In [None]:
# Spot check on a single test review: row 999 of each test feature matrix
# is passed to the model trained on the matching representation.
predict_normalized_1 = model_normalized.predict(testing_features_normalized[999])
predict_1 = model.predict(testing_features[999])

In [None]:
# Prediction of `model_normalized` for test row 999.
predict_normalized_1[0]

In [None]:
# Prediction of `model` for test row 999.
predict_1[0]

In [None]:
# Column 1 of predict_proba from `model_normalized` for every test review.
# Presumably the probability of the positive class; confirm against model_normalized.classes_.
predict_prob = model_normalized.predict_proba(testing_features_normalized)[:,1]

In [None]:
# Reports the accuracy of `predict` (from `model`, non-normalized text).
# NOTE(review): `predict_prob` was computed from `model_normalized`, not `model`;
# printResult does not read its second argument, so the printed accuracy is unaffected.
printResult(predict, predict_prob)

In [None]:
# Mean squared error between the true test labels and the predictions of `model`.
# NOTE(review): assumes `class` is numeric (e.g. 0/1), in which case this equals the error rate; TODO confirm.
mean_squared_error(test_data['class'],predict)

In [None]:
# Mean squared error between the true test labels and the predictions of `model_normalized`.
# NOTE(review): assumes `class` is numeric (e.g. 0/1), in which case this equals the error rate; TODO confirm.
mean_squared_error(test_data['class'],predict_normalized)

## User Input and modifications

Take the user's input, wrap it in a list, and then convert that list to a data frame.

In [None]:
# Read one free-text review from the user; `Input` holds the raw string.
Input = input('Please enter a review: ')

In [None]:
# Wrap the raw string only once: re-running this cell must not nest the list
# further, otherwise the `reviews` cell would hold a list instead of a string.
if not isinstance(Input, list):
    Input = [Input]

# One-row frame that goes through the same cleaning steps as the datasets.
input_df = DataFrame(Input, columns=['reviews'])
input_df['clean_reviews'] = input_df['reviews'].apply(cleanText)
input_df['clean_reviews_normalized'] = input_df['clean_reviews'].apply(NormalizeWithPOS)

In [None]:
# Display the user's review next to its cleaned and normalized versions.
input_df

In [None]:
%%time

# training_features_normalized = vectorizer.fit_transform(train_data['clean_reviews_normalized'])
# testing_features_normalized = vectorizer.transform(test_data['clean_reviews_normalized'])


# training_features = vectorizer.fit_transform(train_data['clean_reviews'])
# testing_features = vectorizer.transform(test_data['clean_reviews'])

In [None]:
%%time

# NOTE(review): `vectorizer` was most recently fitted on train_data['clean_reviews'],
# so this transform maps the normalized (lemmatized + stemmed) text onto the
# non-normalized vocabulary. Confirm this is intended before passing the result
# to a model trained on normalized features.
input_testing_features_normal = vectorizer.transform(input_df['clean_reviews_normalized'])


In [None]:
# Vectorize the cleaned (non-normalized) user review with `vectorizer`.
input_testing_features = vectorizer.transform(input_df['clean_reviews'])

In [None]:
# Shape of the feature matrix produced from the user's normalized review.
print(input_testing_features_normal.shape)

In [None]:
# Shape of the feature matrix produced from the user's cleaned review.
print(input_testing_features.shape)

In [None]:
# The normalized review must be scored by `model_normalized`, using the same
# vocabulary that model was trained on. `vectorizer` was last fitted on the
# non-normalized `clean_reviews` column, so a vectorizer with identical
# parameters is fitted on the normalized training text here to reproduce the
# matching feature space.
normalized_vectorizer = CountVectorizer(**vectorizer.get_params())
normalized_vectorizer.fit(train_data['clean_reviews_normalized'])
predict_normal = model_normalized.predict(
    normalized_vectorizer.transform(input_df['clean_reviews_normalized'])
)

# The cleaned (non-normalized) review goes to `model`, whose vocabulary `vectorizer` holds.
predict = model.predict(input_testing_features)

In [None]:
# Predicted class of the user's cleaned review from `model`.
predict[0]

In [None]:
# Prediction stored in `predict_normal` for the user's normalized review.
predict_normal[0]