# **Upload Dataset here**

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving small_formatted_dataset.csv to small_formatted_dataset.csv
User uploaded file "small_formatted_dataset.csv" with length 4355321 bytes


# **All the imports**

In [2]:
import re
import io
import nltk
from nltk import pos_tag, word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, svm
from sklearn.preprocessing import Binarizer

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import SparsePCA
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import warnings

import pandas as pd
from sklearn.model_selection import KFold
import random
from datetime import datetime


nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# **Discard URLs, Digits and Special Symbols in the reviews**

URLs, Digits and Special-Symbols add noise  to the data. Better get rid of them.

In [3]:
def clean_data(strList):
    URL_regex = re.compile('https?\S+')
    specialSym_regex = re.compile('[,\.@_!#$%^&*()"`<>?/\|}{~:]')
    digits_regex = re.compile('\d+')

    for i in range(len(strList)):
        strList.iloc[i] = re.sub(URL_regex, '', strList[i])
        strList.iloc[i] = re.sub(specialSym_regex, '', strList[i])
        strList.iloc[i] = re.sub(digits_regex, '', strList[i])

    return strList

# **Lemmatize the words in the Reviews**

This process tags the words with thier parts-of-speech. 
With the help of these tags, words are converted into their **root form**. 

1.   Plurals will be converted to singulars
2.   Verbs will be converted to root-form(i.e, verbs of the form **ran, running, run** will be converted to **run**)
3.   Converts all the words to lowercase to overcome redundancy



In [4]:
def lemmatize_with_pos_tag(strList):
    lemmatizer = nltk.stem.WordNetLemmatizer()

    for i in range(len(strList)):
        # tag the words with parts of speech to obtain root_word
        review = ""

        for word, tag in pos_tag(word_tokenize(strList[i])):
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            if wntag is None:
                review += word.lower() + " "
            else:
                review += lemmatizer.lemmatize(word, wntag).lower() + " "

        strList.iloc[i] = review

    return strList

# **Cleaning and Simplyfying Reviews**

In [5]:
def refine_reviews(reviews_list):
    clean_data(reviews_list)
    lemmatize_with_pos_tag(reviews_list)

    return reviews_list

# **Calculate accuracy of given model**

In [6]:
def get_accuracy(classifier, train_X, train_Y, test_X, test_Y):
    classifier.fit(train_X, train_Y)
    predictions = classifier.predict(test_X)

    return metrics.accuracy_score(predictions, test_Y)*100

# **Do K-Fold testing to get average accuracy of the given model on the given dataset**

In [7]:
def get_avg_accuracy(model_name, classifier_obj, X, Y):
    # 'kfold' object is  useful in doing kfold testing
    kfold = KFold(n_splits=5, shuffle=True, random_state=random.randint(1, 2000000))

    trail_num = 0
    individual_test_accuracies = []

    print()
    for train_index, test_index in kfold.split(X):
        train_x = X[train_index]
        train_y = Y[train_index]
        test_x = X[test_index]
        test_y = Y[test_index]

        trail_num += 1

        accuracy_for_curr_test = get_accuracy(classifier_obj, train_x, train_y, test_x, test_y)
        print(model_name,":  KFold Test No = ", trail_num, ",   Accuracy = ", accuracy_for_curr_test)
        individual_test_accuracies.append(accuracy_for_curr_test)

    # calculate average accuracy from the accuracies of all the individual tests
    avg_accuracy = sum(individual_test_accuracies) / len(individual_test_accuracies)
    print("---------------------------------------------------------------------------------------")
    print(model_name+" Average accuracy = ", avg_accuracy)
    print("---------------------------------------------------------------------------------------\n")
    print()

    return avg_accuracy

# **Load Dataset**

In [8]:
# df = pd.read_csv(io.StringIO(uploaded['small_formatted_dataset.csv'].decode('utf-8')))
df = pd.read_csv('small_formatted_dataset.csv')
print("File loaded")

File loaded


# **Prepare the DataFrame**

In [9]:
df.drop(columns=['charLen'])
df['Review'] = refine_reviews(df['Review'].copy())
col_names = df.columns.to_list()
df.dropna(subset=col_names, inplace=True)  # drop the row which has 'nan' in one of its column

# **Extract Features From the Dataset by vectorising the Reviews**

In [10]:
# TfidfVectorizer is better than CountVectoriser, because TfidfVectorizer is inherently capable of removing words that are too rare or
# or are too frequent in the reviews. Such words don't contribute as much prediction

vectorizer = TfidfVectorizer(min_df=5 , max_df=0.3 , ngram_range=(1,3)) 
vectorizer.fit(df['Review'])

X = df['Review']
X = vectorizer.transform(X)
Y = df['Sentiment']

In [11]:
print(X.shape)

(19999, 35499)


# **Models used for Training**

## Model 1: Decision Tree

In [28]:
decision_tree = DecisionTreeClassifier(criterion="gini",max_depth=15 ,min_samples_split=30)
std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)

get_avg_accuracy("Decision Tree", decision_tree, scaled_X, Y)


Decision Tree :  KFold Test No =  1 ,   Accuracy =  75.85
Decision Tree :  KFold Test No =  2 ,   Accuracy =  75.775
Decision Tree :  KFold Test No =  3 ,   Accuracy =  77.125
Decision Tree :  KFold Test No =  4 ,   Accuracy =  76.225
Decision Tree :  KFold Test No =  5 ,   Accuracy =  76.31907976994249
---------------------------------------------------------------------------------------
Decision Tree Average accuracy =  76.2588159539885
---------------------------------------------------------------------------------------




76.2588159539885

## Model 2: Linear SVM

In [13]:
linear_SVM = svm.LinearSVC(max_iter=3000)
std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)

get_avg_accuracy("Linear SVM", linear_SVM, scaled_X, Y)


Linear SVM :  KFold Test No =  1 ,   Accuracy =  86.45
Linear SVM :  KFold Test No =  2 ,   Accuracy =  84.775
Linear SVM :  KFold Test No =  3 ,   Accuracy =  85.925
Linear SVM :  KFold Test No =  4 ,   Accuracy =  86.52499999999999
Linear SVM :  KFold Test No =  5 ,   Accuracy =  86.04651162790698
---------------------------------------------------------------------------------------
Linear SVM Average accuracy =  85.94430232558139
---------------------------------------------------------------------------------------




85.94430232558139

## Model 3: Logistic Regression

In [14]:
logistic_reg = LogisticRegression(C = 0.05,max_iter = 1000,tol = 0.0001,solver = 'sag',fit_intercept = True,penalty = 'l2',dual = False,verbose = 0)
std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)

get_avg_accuracy("Logicstic Reg..", logistic_reg, scaled_X, Y)


Logicstic Reg.. :  KFold Test No =  1 ,   Accuracy =  88.02499999999999
Logicstic Reg.. :  KFold Test No =  2 ,   Accuracy =  86.875
Logicstic Reg.. :  KFold Test No =  3 ,   Accuracy =  87.125
Logicstic Reg.. :  KFold Test No =  4 ,   Accuracy =  88.075
Logicstic Reg.. :  KFold Test No =  5 ,   Accuracy =  87.97199299824956
---------------------------------------------------------------------------------------
Logicstic Reg.. Average accuracy =  87.6143985996499
---------------------------------------------------------------------------------------




87.6143985996499

## Model 4: Naive Bayes

In [15]:
navie_bayes = naive_bayes.MultinomialNB()

#binarized_X = Binarizer().fit_transform(X)
#get_avg_accuracy("Naive Bayes", navie_bayes, binarized_X, Y)

get_avg_accuracy("Naive Bayes", navie_bayes, X, Y)


Naive Bayes :  KFold Test No =  1 ,   Accuracy =  90.45
Naive Bayes :  KFold Test No =  2 ,   Accuracy =  89.45
Naive Bayes :  KFold Test No =  3 ,   Accuracy =  90.05
Naive Bayes :  KFold Test No =  4 ,   Accuracy =  90.425
Naive Bayes :  KFold Test No =  5 ,   Accuracy =  89.472368092023
---------------------------------------------------------------------------------------
Naive Bayes Average accuracy =  89.96947361840459
---------------------------------------------------------------------------------------




89.96947361840459

## Model 5: Neural Network

In [16]:
neural_net = MLPClassifier(solver='adam', hidden_layer_sizes=(100,),early_stopping=True,max_iter=100,batch_size=2000,tol=0.001,n_iter_no_change=2,alpha=1000)

std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)
get_avg_accuracy("Neural Network", neural_net, scaled_X, Y)


Neural Network :  KFold Test No =  1 ,   Accuracy =  87.7
Neural Network :  KFold Test No =  2 ,   Accuracy =  87.47500000000001
Neural Network :  KFold Test No =  3 ,   Accuracy =  88.35
Neural Network :  KFold Test No =  4 ,   Accuracy =  87.925
Neural Network :  KFold Test No =  5 ,   Accuracy =  88.69717429357338
---------------------------------------------------------------------------------------
Neural Network Average accuracy =  88.02943485871467
---------------------------------------------------------------------------------------




88.02943485871467

# **Manual Testing of the Models**

## Decision Tree

In [None]:
decision_tree = DecisionTreeClassifier(criterion="gini",max_depth=15 ,min_samples_split=30)
std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)

# Training Happening here
print("Training has started ", datetime.time(datetime.now()))
decision_tree.fit(scaled_X, Y)
print("Training Successful ", datetime.time(datetime.now()))

print("\n Model is ready for testing. Enter your reivew to check the sentiment")
test_review =  input("Enter Review : ")

test_review =  pd.DataFrame([test_review], columns=["Review"])
test_review["Review"]  = refine_reviews(test_review["Review"])

test_review_vec = vectorizer.transform(test_review["Review"])

models_guess = decision_tree.predict(test_review_vec)
sentiment = "Positive Sentiment" if models_guess[0] == 1 else "Negative Sentiment"
print("\n",sentiment)


## Linear SVM

In [None]:
linear_SVM = svm.LinearSVC(max_iter=3000)
std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)


# Training Happening here
print("Training has started ", datetime.time(datetime.now()))
linear_SVM.fit(scaled_X, Y)
print("Training Successful ", datetime.time(datetime.now()))

print("\n Model is ready for testing. Enter your reivew to check the sentiment")
test_review =  input("Enter Review : ")

test_review =  pd.DataFrame([test_review], columns=["Review"])
test_review["Review"]  = refine_reviews(test_review["Review"])

test_review_vec = vectorizer.transform(test_review["Review"])
models_guess = linear_SVM.predict(test_review_vec)
sentiment = "Positive Sentiment" if models_guess[0] == 1 else "Negative Sentiment"
print("\n",sentiment)

## Navie Bayes

In [27]:
nb =  naive_bayes.MultinomialNB()

binarized_X = Binarizer().fit_transform(X)


# Training Happening here
print("Training has started ", datetime.time(datetime.now()))
nb.fit(binarized_X, Y)
print("Training Successful ", datetime.time(datetime.now()))

print("\n Model is ready for testing. Enter your reivew to check the sentiment")
test_review =  input("Enter Review : ")

test_review =  pd.DataFrame([test_review], columns=["Review"])
test_review["Review"]  = refine_reviews(test_review["Review"])

test_review_vec = vectorizer.transform(test_review["Review"])
models_guess = nb.predict(test_review_vec)
sentiment = "Positive Sentiment" if models_guess[0] == 1 else "Negative Sentiment"
print("\n",sentiment)

Training has started  04:14:01.031621
Training Successful  04:14:01.047991

 Model is ready for testing. Enter your reivew to check the sentiment
Enter Review : When I start after shutdown then mouse doesn't work and while updating OS also it failed after 99 % every time. Processing speed is too slow. They mentioned it has i5 processor but it doesn't work like that. Webcam quality is too bad.

 Negative Sentiment


## Neural Network

In [None]:
neural_net = MLPClassifier(solver='adam', hidden_layer_sizes=(100,),early_stopping=True,max_iter=100,batch_size=2000,tol=0.001,n_iter_no_change=2,alpha=1000)

std_scaler = StandardScaler(with_mean=False)
scaled_X = std_scaler.fit_transform(X)

# Training Happening here
print("Training has started ", datetime.time(datetime.now()))
neural_net.fit(scaled_X, Y)
print("Training Successful ", datetime.time(datetime.now()))

print("\n Model is ready for testing. Enter your reivew to check the sentiment")
test_review =  input("Enter Review : ")

test_review =  pd.DataFrame([test_review], columns=["Review"])
test_review["Review"]  = refine_reviews(test_review["Review"])

test_review_vec = vectorizer.transform(test_review["Review"])

models_guess = neural_net.predict(test_review_vec)
sentiment = "Positive Sentiment" if models_guess[0] == 1 else "Negative Sentiment"
print("\n",sentiment)