In [188]:
# Constant variables definitions
# Directory holding the review text files; read_file() prefixes every filename with it
# (trailing slash included).
DIR_PATH = "sample_texts/Fullmetal Alchemist: Brotherhood/"
# Number of reviews to process; main() builds the names review1.txt .. review<NUM_FILES>.txt.
NUM_FILES = 8

In [2]:
import os

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np

In [4]:
import nltk
import nltk.data
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
import re

In [190]:
from sklearn import svm

In [27]:
# split_into_sentences()
# Parameters: text - string of text
# Description: Split text into sentences while retaining each sentence's punctuation
# (Note: This function was taken from StackOverflow: https://stackoverflow.com/questions/4576077/python-split-text-on-sentences)

# Regex fragments for periods that do NOT mark the end of a sentence.
# Raw strings are used so that escapes such as \s reach the regex engine untouched
# instead of being treated as (invalid) Python string escapes.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"

def split_into_sentences(text):
    """Split a block of text into a list of sentences.

    Periods that are not sentence boundaries (titles, initials, acronyms,
    decimals, website suffixes, ellipses) are temporarily replaced by a
    ``<prd>`` marker, real boundaries are tagged with ``<stop>``, and the text
    is then split on ``<stop>``.

    Parameters:
        text (str): the text to split; newlines are treated as spaces.

    Returns:
        list[str]: the sentences, stripped of surrounding whitespace. A trailing
        fragment with no closing '.', '!' or '?' is kept as the last sentence;
        empty input yields an empty list.
    """
    # Pad so that patterns anchored on a leading/trailing space also match at the edges.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that belong to abbreviations, initials, numbers and URLs.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    text = text.replace("...", "<prd><prd><prd>")

    # Every remaining terminator is a real sentence boundary.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # The piece after the final terminator is normally just the padding; drop it
    # only when it is empty so an unterminated last sentence is not lost.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

In [138]:
# read_file()
# Parameters: filename - name of the text file inside the reviews directory
#             dir_path - optional directory to read from (defaults to DIR_PATH)
# Description: Read a text file and return its whole content as one string
def read_file(filename, dir_path=None):
    """Return the full content of a text file as a single string.

    Parameters:
        filename (str): file name relative to the directory being read.
        dir_path (str, optional): directory containing the file. When omitted,
            the module-level DIR_PATH is used.

    Returns:
        str: the file content, line breaks included.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    if dir_path is None:
        dir_path = DIR_PATH

    file_path = os.path.join(dir_path, filename)

    # Read-only mode is enough here, and the context manager guarantees the
    # handle is closed even if reading fails.
    with open(file_path, "r") as file:
        return file.read()

In [184]:
# analyze_sentence_sentiment()
# Parameters: text - string containing the full review
# Description: Split the review into sentences, score each one with VADER, and
#              return the list of per-sentence 'compound' scores
def analyze_sentence_sentiment(text):
    """Score every sentence of ``text`` with VADER.

    Parameters:
        text (str): the review text.

    Returns:
        list: the 'compound' value from VADER's polarity scores for each
        sentence, in sentence order.
    """
    review_sentences = split_into_sentences(text)

    # Run VADER once per sentence and keep the full score dictionaries
    vader = SentimentIntensityAnalyzer()
    polarity_per_sentence = [vader.polarity_scores(one_sentence) for one_sentence in review_sentences]

    # Keep only the 'compound' entry of each score dictionary
    return [
        polarity.get('compound')
        for polarity in polarity_per_sentence
        if 'compound' in polarity
    ]

In [197]:
# rate_review()
# Parameters: sentence_ratings - list with the rating of each sentence in the review
# Description: Count the positive-, neutral-, and negative-rated sentences and rate the
#              whole review by whichever category strictly outnumbers the other two
def rate_review(sentence_ratings):
    """Classify a review from its per-sentence ratings.

    A rating (rounded to 3 decimals) of at least 0.25 counts as positive, at
    most -0.25 as negative, and anything in between as neutral.

    Parameters:
        sentence_ratings (list): numeric rating of each sentence.

    Returns:
        str: "+" if positive sentences strictly outnumber both other groups,
        "-" if negative sentences do, otherwise "0" (ties included).
    """
    tally = {"+": 0, "-": 0, "0": 0}

    # Bucket every sentence by its rounded rating
    for score in sentence_ratings:
        rounded = round(score, 3)
        if rounded >= 0.25:
            bucket = "+"
        elif rounded <= -0.25:
            bucket = "-"
        else:
            bucket = "0"
        tally[bucket] += 1

    positives, negatives, neutrals = tally["+"], tally["-"], tally["0"]

    # A polarity wins only when it beats both of the other counts
    if positives > max(negatives, neutrals):
        return "+"
    if negatives > max(positives, neutrals):
        return "-"
    return "0"

In [199]:
def main():
    """Rate every review file and print one summary line per file.

    The files are review1.txt .. review<NUM_FILES>.txt; each is read, scored
    sentence by sentence, and reduced to a single "+", "-" or "0" rating.
    """
    review_names = ["review" + str(number) + ".txt" for number in range(1, NUM_FILES + 1)]

    review_scores = []

    for review_name in review_names:
        per_sentence = analyze_sentence_sentiment(read_file(review_name))
        verdict = rate_review(per_sentence)
        review_scores.append(verdict)
        print("Overall rating of {}: {}".format(review_name, verdict))
    
# Run the rating pipeline over all review files and print the results.
main()

Overall rating of review1.txt: +
Overall rating of review2.txt: +
Overall rating of review3.txt: +
Overall rating of review4.txt: +
Overall rating of review5.txt: -
Overall rating of review6.txt: +
Overall rating of review7.txt: +
Overall rating of review8.txt: +


In [52]:
# One-time setup: uncomment to download the NLTK resources used below.
# nltk.download('stopwords')
# nltk.download('punkt')

# English stop words extended with punctuation tokens that should also be ignored.
eng_stops = nltk.corpus.stopwords.words('english')
symbolic_stops = ['.', ',', '!', '?', ';', ':', '[', ']']

eng_stops += symbolic_stops

# TODO: 
# Implement stop words reduction.

In [45]:
# sample_index = 0
# file_chosen = files[sample_index]

# words = process_file(file_chosen)
# nltk.word_tokenize(words)
# nltk.stem.snowball.SnowballStemmer("english")