# Improved Approaches based on a total of 160K reviews:
*   Training Data: 120K reviews
*   Testing Data: 40K reviews



---



# Packages & Modules

In [1]:
# # For installation via Anaconda: preferred using "conda".
# # For installation via Google Colab or packages not available in conda: using "pip".

# # Install the Pandas library (Python Data Analysis Library) via Pip:
# # The exclamation mark ! means run it as a shell command rather than a notebook command.
# # Google Colab wants an exclamation point before most commands.
# 
# # Install the Pandas library for data manipulation and analysis:
# # Use Pandas especially for manipulating numerical tables and time series.
# !pip install pandas
# 
# # Install the Numpy library for scientific & mathematical operation:
# # Numpy allows to work with multi-dimensional arrays & matrices.
# !pip install numpy
# 
# # Install the Beautiful Soup library for scraping data from HTML and XML files:
# !pip install beautifulsoup4
# 
# # Install the NLTK library for NLP:
# !pip install nltk
# # scipy needs to be installed prior to gensim installation.
# !pip3 install scipy
# !pip install gensim
# 
# # Install the Scikit-learn library for supervised & unsupervised learning algorithms:
# !pip install scikit-learn
# 
# # Install the Matplotlib library for visualization:
# !pip install matplotlib
# # Install the Seaborn library for pretty plot:
# !pip install seaborn

# 
# Install the Vader Lexicon for Sentiment Analysis:
!pip install vaderSentiment



## Import necessary libraries and modules:

In [2]:
# Import libraries for data preprocessing:

from pandas import DataFrame
import pandas as pd
import numpy as np


# Import NLTK modules:
import nltk

# Download the NLTK Punkt Sentence Tokenizer that divides a text into a list of sentences:
nltk.download('punkt')
# Import tokenizer for word and sentence:
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the NLTK stopword corpus:
nltk.download('stopwords')
# Import the Stopwords module:
from nltk.corpus import stopwords

# Import PorterStemmer for stemming words:
from nltk.stem.porter import PorterStemmer

# Download the NLTK wordnet corpus:
nltk.download('wordnet')
# Import WordNetLemmatizer for lemmatization:
from nltk.stem.wordnet import WordNetLemmatizer


# Download the NLTK vader_lexicon:
nltk.download('vader_lexicon')
# Import SentimentIntensityAnalyzer from NLTK Vader lexicon:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# Import CountVectorizer from Scikit-learn for converting a collection of text documents to a matrix of token counts:
from sklearn.feature_extraction.text import CountVectorizer

# Import train_test_split from Scikit-learn for splitting the dataset:
from sklearn.model_selection import train_test_split

# Import Gaussian Naive Bayes model from Scikit-learn for classification:
from sklearn.naive_bayes import GaussianNB

# Import RandomForestClassifier from Scikit-learn for classification:
from sklearn.ensemble import RandomForestClassifier

# Import confusion_matrix from Scikit Learn for accuracy evaluation:
from sklearn.metrics import confusion_matrix

# Import accuracy_score from Scikit Learn for computing the prediction accuracy:
from sklearn.metrics import accuracy_score

# Import TfidfVectorizer from Scikit Learn for converting a collection of raw documents to a matrix of TF-IDF features (in percentage):
from sklearn.feature_extraction.text import TfidfVectorizer

# Import KMeans from Scikit-learn for K-means clustering:
from sklearn.cluster import KMeans

# Import LinearRegression, LogisticRegression from Scikit Learn for Regression modeling:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics

# Import SVM from Scikit-learn for classification:
from sklearn import svm


# Import the Python built-in RegEx (Regular Expression) module for text cleaning & text search:
import re


# Import Matplotlib modules:
import matplotlib.pyplot as plt

# Set the output of plotting commands to be displayed inline in the notebook document (e.g., Jupyter, Colab):
%matplotlib inline

# # Set the interactive plots embedded within the notebook that allow zoom and resize:
# %matplotlib notebook

# Import the Seaborn library for pretty statistical visualization:
import seaborn as sns

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Louise\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Louise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Louise\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Louise\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
import pandas as pd
import numpy as np
import csv
import sys
import nltk
from nltk.corpus import stopwords
import glob
import os
import string

# Data Preprocessing

## Preprocess the training data:

In [4]:
# Load dataset from a file; the context manager closes the handle as soon as the text is read:
with open(r"train1.ft.txt", "r", encoding="utf8") as text_file:
    # Split texts in the file into lines.
    # split('\n') (rather than splitlines()) is kept on purpose: the later cells
    # inspect and pop the trailing '' element that it produces.
    lines = text_file.read().split('\n')

In [5]:
print(len(lines))
print(type(lines))

3600001
<class 'list'>


In [6]:
# Get the last element in the list "lines":
lines[-1]

''

In [7]:
# Remove the last element in the list with pop() by default or pop(-1) and return the element that is removed:
lines.pop(-1)

''

In [8]:
lines[-1]

"__label__2 Makes My Blood Run Red-White-And-Blue: I agree that every American should read this book -- and everybody else for that matter. I don't agree that it's scholarly. Rather, it's a joy to read -- easy to understand even for a person with two master's degrees! Between McElroy's chapter on How American Culture was Formed and Ken Burns' Lewis & Clark, I don't know which makes my blood run red-white-and-bluer. And as a child of the anti-establishment `60s, it's done a lot toward helping me understand why we Americans do what we do. It's the best history book I've ever read, the best history course I've ever taken or taught. I'm buying it for my home library for my grandchildren to use as a resource. We're also using it as a resource for a book on urban planning."

In [9]:
len(lines)

3600000

-------------------------------------------------------------------------

In [10]:
# Define the function for replacing label strings with integer 0 or 1:
def replace_label(text):
    """Translate the label prefix of each line into an integer class.

    Parameters
    ----------
    text : iterable of str
        Lines whose first ten characters are expected to be '__label__1'
        or '__label__2'.

    Returns
    -------
    list of int
        0 for every '__label__1' line (negative review) and 1 for every
        '__label__2' line (positive review), in input order. Lines whose
        first ten characters match neither prefix contribute nothing, so
        the result can be shorter than the input.
    """
    negative_tag = '__label__1'   # Negative Review -> 0
    positive_tag = '__label__2'   # Positive Review -> 1

    return [
        0 if head == negative_tag else 1
        for head in (item[:10] for item in text)
        if head == negative_tag or head == positive_tag
    ]

labels = replace_label(lines)

In [11]:
print(len(labels))
print(labels[0])
print(labels[-1])
print(type(labels))

3600000
1
1
<class 'list'>


In [12]:
# Define the function for removing label strings in lines:
def remove_label(s):
    """Return `s` with its first 11 characters dropped.

    The 11 characters are the 10-character label tag (e.g. '__label__1')
    plus the single separator character that follows it; strings shorter
    than that yield an empty string.
    """
    label_prefix_width = 11   # The text review starts at index 11.
    review_text = s[label_prefix_width:]
    return review_text

lines = [remove_label(s) for s in lines]

In [13]:
print(len(lines))
print()
print(lines[0])

3600000

Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [14]:
# Pair each review text with its integer label, then keep a reproducible
# random sample of 120,000 rows (fixed random_state) as the training set:
dataTrain = pd.DataFrame({'text': lines, 'label': labels}).sample(
    n = 120000,
    random_state = 123,
)

print(len(dataTrain))
dataTrain.head()

120000


Unnamed: 0,text,label
2725661,this movie sucks: This movie supposedly about ...,0
1798719,Good Entertainment: This program a well edited...,0
1242154,Does the job: This hamper does the job in my k...,1
3373098,"Buffett Mails it In: Being a huge Buffett fan,...",0
1663895,Sharp as a razor... almost.: Wow! My replaceme...,1


### Main function for cleaning texts:

In [15]:
# English stopword list from the NLTK stopwords corpus (consulted by clean_text):
stop = stopwords.words('english')
# Whitespace tokenizer and WordNet lemmatizer instances used by lemmatize_text:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Define the function for lemmatizing texts:
def lemmatize_text(text):
    """Tokenize `text` with `w_tokenizer` and lemmatize every token.

    Returns a list with one `lemmatizer.lemmatize(...)` result per token,
    in the order the tokenizer produced them.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Define the function for changing to lowercase, removing punctuation and stopwords:
def clean_text(text):
    """Normalize a pandas Series of review strings.

    Steps, in order: lowercase; turn hyphens into spaces; split on single
    spaces; drop tokens found in the module-level `stop` list; strip
    punctuation; lemmatize each token with `lemmatize_text`; strip
    punctuation again; turn backslashes into spaces.

    Every `str.replace` call states `regex=` explicitly. The default of
    that argument differs between pandas versions, and when the
    punctuation pattern is treated as a literal string nothing is removed.

    Parameters
    ----------
    text : pandas.Series of str
        Raw review texts.

    Returns
    -------
    pandas.Series of str
        Cleaned texts with the same index as the input.
    """
    # Character class built from string.punctuation. Inside the brackets the
    # backslash escapes the following ']' instead of standing for itself, so
    # backslashes survive this pattern and are handled by the final literal
    # replace below.
    punctuation_pattern = '[{}]'.format(string.punctuation)

    # Set membership is O(1) per token; results match the list lookup.
    stop_words = set(stop)

    text = text.str.lower()
    text = text.str.replace('-', ' ', regex=False)
    text = text.str.split(' ')
    text = text.apply(lambda x: [item for item in x if item not in stop_words])
    # Joining with ', ' and then stripping punctuation leaves single spaces
    # between tokens.
    text = text.apply(', '.join)
    text = text.str.replace(punctuation_pattern, '', regex=True)
    text = text.apply(lemmatize_text)
    text = text.apply(', '.join)
    text = text.str.replace(punctuation_pattern, '', regex=True)
    text = text.str.replace('\\', ' ', regex=False)
    return text

In [16]:
dataTrain['text'] = clean_text(dataTrain['text'])

In [17]:
print("Training Data: 120K Reviews")
dataTrain.head()

Training Data: 120K Reviews


Unnamed: 0,text,label
2725661,movie suck movie supposedly michael suck crapp...,0
1798719,good entertainment program well edited visuall...,0
1242154,job hamper job kid room hold two three load de...,1
3373098,buffett mail in huge buffett fan bought this u...,0
1663895,sharp razor almost wow replacement sharp cut s...,1


## Preprocess the testing data:

In [18]:
# Load dataset from a file; the context manager closes the handle as soon as the text is read:
with open(r"test1.ft.txt", "r", encoding="utf8") as text_file:
    # Split texts in the file into lines.
    # split('\n') (rather than splitlines()) is kept on purpose: the later cells
    # inspect and pop the trailing '' element that it produces.
    lines = text_file.read().split('\n')

In [19]:
print(len(lines))
print(type(lines))

400001
<class 'list'>


In [20]:
# Get the last element in the list "lines":
lines[-1]

''

In [21]:
# Remove the last element in the list with pop() by default or pop(-1) and return the element that is removed:
lines.pop(-1)

''

In [22]:
lines[-1]

"__label__1 Comedy Scene, and Not Heard: This DVD will be a disappointment if you get it hoping to see some substantial portion of the acts of the various comics listed on the cover. All you get here are snippets of performance, at best. The rest is just loose-leaf reminiscence about the good old days in Boston, in the early 80's, when a lot of comics were hanging out together and getting their start.It's like a frat house reunion. There's a lot of lame nostalgia. There are quite a few guffaws recalling jokes (practical and otherwise)perpetrated - back then. But you had to have been there to appreciate all the basically good ol' boy camaraderie. If you weren't actually a part of that scene, all this joshing and jostling will fall flat.If you want to actually hear some of these comics' routines - you will have to look elsewhere."

In [23]:
len(lines)

400000

-------------------------------------------------------------------------

In [24]:
# # Define the function for replacing label strings with integer 0 or 1:
# def replace_label(text):
#     labels = []
    
#     for item in text:
#         first_ten_chars = item[:10]
#         if first_ten_chars == '__label__1':
#             labels.append(int(0))   # 0 for Negative Review: '__label__1'
#         elif first_ten_chars == '__label__2':
#             labels.append(int(1))   # 1 for Positive Review: '__label__2'
            
#     return labels

labels = replace_label(lines)

In [25]:
print(len(labels))
print(labels[0])
print(labels[-1])
print(type(labels))

400000
1
0
<class 'list'>


In [26]:
# Remove the label strings from the test lines.
# remove_label() is already defined in the training-data section, so it is
# reused here rather than being defined a second time with an identical body.
lines = [remove_label(s) for s in lines]

In [27]:
print(len(lines))
print()
print(lines[0])

400000

Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"


In [28]:
# Pair each review text with its integer label, then keep a reproducible
# random sample of 40,000 rows (fixed random_state) as the testing set:
dataTest = pd.DataFrame({'text': lines, 'label': labels}).sample(
    n = 40000,
    random_state = 456,
)

print(len(dataTest))
dataTest.head()

40000


Unnamed: 0,text,label
333305,Confused: I have been a science fiction/fantas...,0
27936,What a SORRY A$$ way to go out!: Since this is...,0
17999,"If I had my way, I'd have all of you shot: I l...",1
124332,Super Fun for My Super Heroes!: You cannot eve...,1
303110,Extremely Poor Quality: This bit set is absolu...,0


In [29]:
# stop = stopwords.words('english')
# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# lemmatizer = nltk.stem.WordNetLemmatizer()

# # Define the function for lemmatizing texts:
# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

# # Define the function for changing to lowercase, removing punctuation and stopwords:  
# def clean_text(text):
#     text = text.str.lower()
#     text = text.str.replace('-', ' ')
#     text = text.str.split(' ')
#     text = text.apply(lambda x: [item for item in x if item not in stop])
#     text = text.apply(', '.join)
#     text = text.str.replace('[{}]'.format(string.punctuation), '')
#     text = text.apply(lemmatize_text)
#     text = text.apply(', '.join)
#     text = text.str.replace('[{}]'.format(string.punctuation), '')
#     text = text.str.replace('\\', ' ')
#     return text

In [30]:
dataTest['text'] = clean_text(dataTest['text'])

In [31]:
# dataTest holds the 40K sampled *testing* reviews, so label the preview accordingly:
print("Testing Data: 40K Reviews")
dataTest.head()

Training Data: 40K Reviews


Unnamed: 0,text,label
333305,confused science fictionfantasy book fan long ...,0
27936,sorry a way go out since supposedly master p l...,0
17999,way id shot love the wall movie album story cl...,1
124332,super fun super hero cannot even imagine excit...,1
303110,extremely poor quality bit set absolutely awfu...,0


# Sentiment Analysis via Lexicon: VADER

In [32]:
# Import SentimentIntensityAnalyzer from NLTK Vader lexicon:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [33]:
# Define the Vader sentiment analyzer:
vaderAnalyzer = SentimentIntensityAnalyzer()

# Define the function of review rate:
def sentiment_analyzer_scores(text):
    """Print the VADER polarity scores of `text` followed by a verdict.

    The verdict is taken from the 'compound' score returned by
    `vaderAnalyzer.polarity_scores`: -0.05 or below is reported as
    negative, 0.05 or above as positive, and anything in between as
    neutral. Nothing is returned; all results are written with print().
    """
    score = vaderAnalyzer.polarity_scores(text)

    # Header: the review, its raw scores, a blank line and the verdict heading.
    print(text, score, "", "Overall, the review is rated as: ", sep="\n")

    compound = score['compound']
    if compound <= -0.05:
        verdict = "Negative (Label: 0)"
    elif compound >= 0.05:
        verdict = "Positive (Label: 1)"
    else:
        verdict = "Neutral (No Label)"

    # Footer: the verdict, a blank line, a separator rule and a closing blank line.
    print(verdict, "", "-------------------------------------", "", sep="\n")
  

In [34]:
text1 = "Lmao, it's so cute~"
text2 = "im very disappointed"
text3 = "The price is good but the quality is trash!"

sentiment_analyzer_scores(text1)
sentiment_analyzer_scores(text2)
sentiment_analyzer_scores(text3)


Lmao, it's so cute~
{'neg': 0.0, 'neu': 0.435, 'pos': 0.565, 'compound': 0.5994}

Overall, the review is rated as: 
Positive (Label: 1)

-------------------------------------

im very disappointed
{'neg': 0.629, 'neu': 0.371, 'pos': 0.0, 'compound': -0.5256}

Overall, the review is rated as: 
Negative (Label: 0)

-------------------------------------

The price is good but the quality is trash!
{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'compound': 0.3054}

Overall, the review is rated as: 
Positive (Label: 1)

-------------------------------------



In [35]:
# print("=> Original Review:")
# sentiment_analyzer_scores(dataTrain.iloc[500, 0])
# print("Actual Label: ", dataTrain.iloc[500, 1])
# print(df1.iloc[0, 1])
# print("=> Sentiment Analysis of the Original Review:")
# print(vaderAnalyzer.polarity_scores(df1.iloc[0, 1]))
# print()

# print("=> Cleaned Text of the review:")
# sentiment_analyzer_scores(reviewCorpus[500])
# # print(reviewCorpus[0])
# # print("=> Sentiment Analysis of the Cleaned Text:")
# # print(vaderAnalyzer.polarity_scores(reviewCorpus[0]))

# print("Actual Label: ",labelArray[500])


## Create the function for VADER analysis:

In [36]:
vaderAnalyzer = SentimentIntensityAnalyzer()

# Define the function for analyzing sentiment using Vader Lexicon:
def analyze_sentiment_label(text):
    """Assign a sentiment class to every review using VADER's compound score.

    Parameters
    ----------
    text : iterable of str
        Review texts, e.g. a list, a NumPy array or a pandas Series.

    Returns
    -------
    list of int
        One entry per review, in iteration order:
        0 when compound <= -0.05 (negative),
        1 when compound >= 0.05 (positive),
        2 otherwise (neutral; this class does not exist in the original dataset).
    """
    sentimentAnalysisResult = []

    # Iterate over the values themselves instead of indexing with text[i]:
    # on a pandas Series, text[i] is a label lookup, so a Series whose index
    # is not 0..n-1 (such as a sampled frame's column) would raise or return
    # rows in the wrong order. Lists and arrays behave as before.
    for review in text:
        compound = vaderAnalyzer.polarity_scores(review)['compound']

        if compound <= -0.05:
            sentimentAnalysisResult.append(0)   # Negative Reviews
        elif compound >= 0.05:
            sentimentAnalysisResult.append(1)   # Positive Reviews
        else:
            sentimentAnalysisResult.append(2)   # Neutral Reviews (Do not exist in the original dataset)

    return sentimentAnalysisResult

### VADER sentiment analysis on the training data:

In [37]:
len(dataTrain)

120000

In [38]:
# Convert the pandas Series to a NumPy array with ".values":
reviewTrain = dataTrain['text'].values

print(len(reviewTrain))
print(type(reviewTrain))

120000
<class 'numpy.ndarray'>


In [39]:
# Get Vader analysis results:
vaderLabels_train = analyze_sentiment_label(reviewTrain)

In [40]:
print(type(vaderLabels_train))

<class 'list'>


In [42]:
print("Confusion Matrix:")
print(confusion_matrix(dataTrain['label'].tolist(), vaderLabels_train))

Confusion Matrix:
[[23596 34030  2483]
 [ 3238 55846   807]
 [    0     0     0]]


In [43]:
print("Accuracy on Training Data: " + str(accuracy_score(dataTrain['label'].tolist(), vaderLabels_train) * 100) + "%")

Accuracy on Training Data: 66.20166666666667%


### VADER sentiment analysis on the testing data:

In [44]:
len(dataTest)

40000

In [45]:
# Convert the pandas Series to a NumPy array with ".values":
reviewTest = dataTest['text'].values

print(len(reviewTest))
print(type(reviewTest))

40000
<class 'numpy.ndarray'>


In [46]:
# Get Vader analysis results:
vaderLabels_test = analyze_sentiment_label(reviewTest)

In [47]:
print(type(vaderLabels_test))

<class 'list'>


In [48]:
print("Confusion Matrix:")
print(confusion_matrix(dataTest['label'].tolist(), vaderLabels_test))

Confusion Matrix:
[[ 7911 11438   771]
 [ 1108 18505   267]
 [    0     0     0]]


In [49]:
print("Accuracy on Testing Data: " + str(accuracy_score(dataTest['label'].tolist(), vaderLabels_test) * 100) + "%")

Accuracy on Testing Data: 66.03999999999999%
