## Reviews Sentiment Analyzer
Xuan Chew

May 8, 2019

### Prep

In [32]:
# Configure Paths
# Movie Reviews
POS_FILE = 'Data/pos'
NEG_FILE = 'Data/neg'
NRC_FILE = 'Data/NRC.txt'

In [33]:
# Import Libraries
from os import listdir
import os

import string
from collections import Counter
import math
import random

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

import nltk

In [34]:
# NLTK Setup
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from warnings import filterwarnings
filterwarnings('ignore')
import sys
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xuanchew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuanchew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Movie File Reading Helper Functions

In [35]:
# Input a file and read the text
def readFile(file):
    reader = open(file, 'r')
    inputText = reader.read()
    reader.close()
    
    return inputText

# Read and clean text as needed
def readAndClean(path, counter):
    for file in listdir(path):
        fullPath = path + '/' + file
        text = readFile(fullPath)
        improved = cleanText(text)
        counter.update(improved)
        
#Helper function that gets a filename, reads and process the text
def getTokens(filename):
    text = readFile(filename)
    tokens = cleanText(text)
    return tokens

def prepFiles(pathDict, polarity):
    result = []
    
    for filename in os.listdir(pathDict):
        if filename.endswith(".txt"):
            path = pathDict + "/" + filename
            tokens = getTokens(path)
            item = (tokens, polarity)
            result.append(item)
            
    return result

#### Text Cleaning Functions

In [36]:

# Remove stopwords
def removeStopWords(text):
    tokens = text.split()
    stopWords = set(stopwords.words('english'))
    return ([ word for word in tokens if word not in stopWords])

# Removing single character words
def removeSingleChar(text):
    tokens = text.split()
    return ([word for word in tokens if len(word) > 1])

# Removing punctuation
def removePunctuation(text):
    tokens = text.split()
    transTable = str.maketrans('','', string.punctuation)
    return ([word.translate(transTable) for word in tokens])

# Removing non alphabetic characters
def removeNonAlpha(text):
    tokens = text.split()
    return ([ word for word in tokens if word.isalpha()])

# Primary cleaning function
def cleanText(text):
    lemmatizer = WordNetLemmatizer()

    # Prepare tokens by splitting with text
    tokens = text.split()
    
    # Lemmatize
    for token in tokens:
        token = lemmatizer.lemmatize(token)
    
    # Cleanup by removing stopwords based on stop words corpora
    stopWords = set(stopwords.words('english'))
    tokens = [ word for word in tokens if word not in stopWords]
    
    # Cleanup by removing single char words
    tokens = [word for word in tokens if len(word) > 1]

    # Cleanup data by removing punctuation
    punctuation = string.punctuation
    transTable = str.maketrans('','',punctuation)
    tokens = [word.translate(transTable) for word in tokens]

    # Filter by only keeping alphabets
    tokens = [ word for word in tokens if word.isalpha()]

    return tokens

### Baseline : Basic Score Based Model
This baseline model utilizes the NRC EmoLex Lexicon in a very simple way that naively assumes that text with more positive words will most likely carry an overall positive sentiment and vice versa.

In [37]:
def emotionDataScore():
    scoreDict = {}
    
    #NRC Prep
    emoLines = open(NRC_FILE).readlines()
    strippedEmoLines = [x.strip() for x in emoLines]
    cleanedEmoLines = [x.replace('\t',',') for x in strippedEmoLines]

    splitList = []
    allToken = []
    scoreDict = {}


    for i in range(len(cleanedEmoLines)):
        splitList.append(cleanedEmoLines[i].split(','))

    for k in range(len(splitList)):
        if splitList[k][2] != "0":
            allToken.append(splitList[k])


    # Score system
    for j in allToken:
            key = j[0]
            if key not in scoreDict:
                value = [j[1]]
                scoreDict[key] = value
            else:
                value = j[1]
                scoreDict[key].append(value)
    for words in scoreDict.keys():
        score = 0
        for item in scoreDict[words]:
            if (item == "anger" or item == "disgust"):
                score -= 90
            if (item == "fear" or item == "sadness"):
                score -= 80
            if (item == "anticipation" or item == "surprise"):
                score += 25
            if (item == "joy"):
                score += 40
            if (item == "trust"):
                score += 15
            if (item == "negative"):
                score -= 100
            if (item == "positive"):
                score += 100
            scoreDict[words] = score

    return scoreDict
            
def baselineClassify(file, scoreDict):
    token = cleanText(readFile(file))
    scoreList = []
    
    for v in token:
        if v in scoreDict.keys():
            scoreList.append(scoreDict[v])  
            
    count = 0
    
    for num in scoreList:
        if num > 0 :
            count += 1
            
    if count / len(scoreList) > 0.5:
        return 1
    else:
        return 0
    
# Accuracy calculation
def baselineAcc(directory):
    acc = []
    scoreDict = emotionDataScore()
    
    for file in listdir(directory):
        path = directory + '/' + file
        acc.append(baselineClassify(path, scoreDict))
        
    accuracy = round(sum(acc)/len(acc),3)
    
    return(accuracy)

def runBaseline():
    print("Negative Review Baseline Accuracy: "+ str((baselineAcc(NEG_FILE))*100)+" %")
    print("Positive Review Baseline Accuracy: " + str((baselineAcc(POS_FILE)*100)) +" %")

#### Logistic Regression and Machine Learning Methods
The following function uses more elaborate and complex methods to compute the sentiment. Summary results will also be printed out.

In [38]:
# Locate features
def findFeatures(tokens):
    words = set(tokens)
    
    features = {}
    
    for word in words:
        features[word] = (word in words)
        
    return features

def get_precision(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(training_set)
    precision = (nltk.classify.accuracy(classifier, testing_set))*100
    return precision

def runTest(k, split):
    print("==============================================")
    print("Running for " +str(k) +" most common words.")
    print("----------------------------------------------")
    posCounter = Counter()
    negCounter = Counter()

    readAndClean(POS_FILE, posCounter)
    readAndClean(NEG_FILE, negCounter)

    allCounters = posCounter + negCounter
    mostCommon = allCounters.most_common(k)
    mostFrequentWords = []

    #Drop freq
    for i in range (0, len(mostCommon)):
        mostFrequentWords.append(mostCommon[i][0])

    positiveFiles = prepFiles(POS_FILE, 'pos')
    negativeFiles = prepFiles(NEG_FILE, 'neg')
    posNegFiles = positiveFiles + negativeFiles
    
    random.shuffle(posNegFiles)

    featureSet = [(findFeatures(tokens), polarity) for (tokens, polarity) in posNegFiles]
    trainingSet = featureSet[:split]
    testingSet = featureSet[split:]
    
    #Naive Bayes
    NaiveBayesClassifier = nltk.NaiveBayesClassifier.train(trainingSet)
    print("NaiveBayes Accuracy:", (nltk.classify.accuracy(NaiveBayesClassifier, testingSet)) * 100, '%')

    #Logistic Regression
    logisticRegressionClassifier = SklearnClassifier(LogisticRegression(solver = 'liblinear'))
    logisticRegressionClassifier.train(trainingSet)
    print("Logistic Regression(liblinear) Accuracy:", (nltk.classify.accuracy(logisticRegressionClassifier, testingSet)) * 100, '%')

    #Modifying Solver
    logisticRegressionClassifier = SklearnClassifier(LogisticRegression(solver = 'lbfgs'))
    logisticRegressionClassifier.train(trainingSet)
    print("Logistic Regression(lbfgs) Accuracy:", (nltk.classify.accuracy(logisticRegressionClassifier, testingSet)) * 100, '%')

    #Linear SVC
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(trainingSet)
    print("Linear SVC Accuracy:", (nltk.classify.accuracy(LinearSVC_classifier, testingSet))*100, '%')

    print("\nNB Classifier: ")
    NaiveBayesClassifier.show_most_informative_features(15)

def runAll():
    print("==== Sentiment Analysis For Movie Review Data ====")
    runBaseline()
    runTest(1000, 1600)
    runTest(3000, 1600)
    runTest(6000, 1600)
    runTest(10000, 1600)



### Results - Movie Data

#### Baseline Score Based Method


In [39]:
runBaseline()

Negative Review Baseline Accuracy: 79.7 %
Positive Review Baseline Accuracy: 92.10000000000001 %


#### Advanced Methods with Various Parameters

In [40]:
# Top 1000 Words, 80:20 Train-Test Split
runTest(1000, 1600)

Running for 1000 most common words.
----------------------------------------------
NaiveBayes Accuracy: 70.5 %
Logistic Regression(liblinear) Accuracy: 88.5 %
Logistic Regression(lbfgs) Accuracy: 87.5 %
Linear SVC Accuracy: 85.5 %

NB Classifier: 
Most Informative Features
               ludicrous = True              neg : pos    =     20.9 : 1.0
                   sucks = True              neg : pos    =     14.7 : 1.0
                   inept = True              neg : pos    =     13.8 : 1.0
                feelgood = True              pos : neg    =     12.0 : 1.0
               affecting = True              pos : neg    =     11.4 : 1.0
               atrocious = True              neg : pos    =     10.6 : 1.0
           unintentional = True              neg : pos    =      9.9 : 1.0
                religion = True              pos : neg    =      9.9 : 1.0
              chronicles = True              pos : neg    =      9.4 : 1.0
                  minnie = True              pos : 

In [41]:
# Top 3000 Words, 80:20 Train-Test Split
runTest(3000, 1600)

Running for 3000 most common words.
----------------------------------------------
NaiveBayes Accuracy: 75.0 %
Logistic Regression(liblinear) Accuracy: 85.25 %
Logistic Regression(lbfgs) Accuracy: 84.25 %
Linear SVC Accuracy: 84.0 %

NB Classifier: 
Most Informative Features
            breathtaking = True              pos : neg    =     13.5 : 1.0
                  debate = True              pos : neg    =     12.5 : 1.0
               maintains = True              pos : neg    =     11.8 : 1.0
             outstanding = True              pos : neg    =     11.6 : 1.0
               ludicrous = True              neg : pos    =     11.3 : 1.0
             fascination = True              pos : neg    =     11.1 : 1.0
                captures = True              pos : neg    =     11.0 : 1.0
                 miscast = True              neg : pos    =     10.9 : 1.0
                  avoids = True              pos : neg    =     10.4 : 1.0
                  doubts = True              pos 

In [42]:
# Top 6000 Words, 80:20 Train-Test Split
runTest(6000, 1600)

Running for 6000 most common words.
----------------------------------------------
NaiveBayes Accuracy: 68.0 %
Logistic Regression(liblinear) Accuracy: 90.25 %
Logistic Regression(lbfgs) Accuracy: 89.75 %
Linear SVC Accuracy: 89.0 %

NB Classifier: 
Most Informative Features
             outstanding = True              pos : neg    =     11.8 : 1.0
                chilling = True              pos : neg    =     10.7 : 1.0
             fascination = True              pos : neg    =     10.1 : 1.0
               performed = True              pos : neg    =     10.1 : 1.0
                    slip = True              pos : neg    =     10.1 : 1.0
               atrocious = True              neg : pos    =      9.9 : 1.0
                  seagal = True              neg : pos    =      9.9 : 1.0
              astounding = True              pos : neg    =      9.4 : 1.0
                  avoids = True              pos : neg    =      9.4 : 1.0
              chronicles = True              pos 

In [43]:
# Top 10000 Words, 80:20 Train-Test Split
runTest(10000, 1600)

Running for 10000 most common words.
----------------------------------------------
NaiveBayes Accuracy: 67.75 %
Logistic Regression(liblinear) Accuracy: 86.75 %
Logistic Regression(lbfgs) Accuracy: 87.25 %
Linear SVC Accuracy: 86.5 %

NB Classifier: 
Most Informative Features
                captures = True              pos : neg    =     17.3 : 1.0
            effortlessly = True              pos : neg    =     12.1 : 1.0
                 offbeat = True              pos : neg    =     12.1 : 1.0
                  avoids = True              pos : neg    =     11.4 : 1.0
               strongest = True              pos : neg    =     11.4 : 1.0
                depicted = True              pos : neg    =     10.8 : 1.0
               atrocious = True              neg : pos    =     10.6 : 1.0
                  seagal = True              neg : pos    =     10.6 : 1.0
             outstanding = True              pos : neg    =     10.3 : 1.0
              astounding = True              po