# Sentiment Analysis
Author: Joel Yin

In [1]:
# Library Imports
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
import copy
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Tenkichi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tenkichi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load training data
def load_corpus(corpus_path):
    file = open(corpus_path, encoding="ISO-8859-1", mode="r")
    lines = file.readlines()
    dataset = list()
    for line in lines:
        snippet, label = line.split("\t")
        label = int(label.strip())
        dataset.append(tuple([snippet, label]))
    return dataset

re_lrquotes = r"^'(.+)'$"
re_lquotes = r"^'(.+)$"
re_rquotes = r"^(.+)'$"

def tokenize(snippet):
    blurb = copy.deepcopy(snippet)
    blurb = re.sub(re_lrquotes, r"' \1 '", blurb)
    blurb = re.sub(re_lquotes, r"' \1", blurb)
    blurb = re.sub(re_rquotes, r"\1 '", blurb)
    return blurb.split()
        
def tag_edits(tokenized_snippet):
    tokens = list()
    edited = False;
    for token in tokenized_snippet:
        temp = token
        if not edited:
            if temp.find("[") > -1:
                edited = True
                
        if edited:
            temp = re.sub(r"\[?(\w+)", r"EDIT_\1", temp)
            if temp.find("]") > -1:
                temp = re.sub(r"\]", "", temp)
                edited = False
        
        tokens.append(temp)
                
    return tokens
                
re_negation = r"^not$|^no$|^never$|^cannot$|^nothing$|^neither$|^nor$|^nobody$|^nowhere$|^except$|n't$"
re_negation_stop = r"^but$|^yet$|^however$|^nevertheless$|^though$|^except$|^\.$|^\?$|^\!$"
re_negation_quickstops = r"^only$|^just$"

def tag_negation(tokenized_snippet):
    # first remove any meta tags to make it "pure" again
    tokens = copy.deepcopy(tokenized_snippet)
    metas = list()
    for i, token in enumerate(tokens):
        pattern = r"(\w+)_(\w+)"
        match = re.search(pattern, token)
        if match:
            metas.append(tuple([i, match.group(1)]))
            tokens[i] = match.group(2)
            
    if "" in tokens:
        tokens.remove("")
    tokens = nltk.pos_tag(tokens)
    
    for i, edit in metas:
        word, pos = tokens[i]
        tokens[i] = tuple([edit + "_" + word, pos])
    # Using a variable that switches states for NOT/not_NOT sections
    negated = False
    indexes = range(0, len(tokens))
    for i in indexes:
        word, pos = tokens[i]
        if not negated:
            if re.search(re_negation, word):
                # When we find a not word, we need to check the next word for stop word
                if(i+1 < len(tokens)):
                    next_word = tokens[i+1][0]
                else:
                    next_word = ""
                if re.search(re_negation_quickstops, next_word):
                    i = i + 1
                else:
                    negated = True
        else:
            if re.search(re_negation_stop, word) or pos=="JJR" or pos=="RBR":
                negated = False
                # checking for the negation connection words
            elif not re.search(r"^or$|^nor$", word):
                tokens[i] = tuple(["NOT_" + word, pos])
                
    return tokens

In [3]:
def preprocess(corpus_path):
    dataset = load_corpus(corpus_path)
    for i, pair in enumerate(dataset):
        y = pair[1]
        X = pair[0]
        X = tag_negation(tag_edits(tokenize(X)))
        dataset[i] = tuple([X, y])
        
    return dataset
        

trn = preprocess("train.txt")
tst = preprocess("test.txt")


In [4]:
dictionary = dict()
dictionary_reverse = dict()

for datapoint, label in trn:
    for pair in datapoint:
        # first check if token has EDIT tag
        token = pair[0]
        if not re.search("EDIT", token):
            if token not in dictionary:
                dictionary_reverse[len(dictionary)] = token
                dictionary[token] = len(dictionary)
dictionary_size = len(dictionary.keys())
                
def get_features(preprocessed_snippet):
    feature_counts = np.zeros(dictionary_size)
    for (token, pos) in preprocessed_snippet:
        if not re.search("EDIT", token):
            if token in dictionary:
                feature_index = dictionary[token]
                feature_counts[feature_index] = feature_counts[feature_index] + 1
    return feature_counts

X_train = np.empty((len(trn), dictionary_size))
Y_train = np.empty(len(trn)).astype(int)
for i, (datapoint, label) in enumerate(trn):
    Y_train[i] = label
    features = get_features(datapoint)
    X_train[i] = features

In [5]:
# Normalize
def normalize(X):
    matrix = copy.deepcopy(X)
    feature_indexes = np.arange(X.shape[1])
    maxes = np.max(matrix, axis=0)
    mins = np.min(matrix, axis=0)
    for index in feature_indexes:
        column = matrix[:, index]
        if maxes[index] - mins[index] == 0:
            matrix[:, index] = np.zeros(X.shape[0])
        else:
            matrix[:, index] = (column - mins[index])/(maxes[index] - mins[index])
    return matrix

X_train_normalized = normalize(X_train)
classifier_Gaussian = GaussianNB()
model = classifier_Gaussian.fit(X_train_normalized, Y_train)

In [6]:
def evaluate_predictions(Y_pred, Y_true):
    tp, tn, fp, fn = np.zeros(4).astype(int)
    for index in range(len(Y_pred)):
        pred = Y_pred[index]
        true = Y_true[index]
        if(pred == 1):
            if(true == 1):
                tp = tp + 1
            else:
                fp = fp + 1
        else:
            if(true == 0):
                tn = tn + 1
            else:
                fn = fn + 1

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    fmeasure = (2 * precision * recall)/(precision + recall)
    return (precision, recall, fmeasure)

In [7]:
tst = preprocess("test.txt")

# for datapoint, label in tst:
#     for pair in datapoint:
#         # first check if token has EDIT tag
#         token = pair[0]
#         if not re.search("EDIT", token):
#             if token not in dictionary:
#                 dictionary[token] = len(dictionary)

# dictionary_size = len(dictionary.keys())

X_test = np.empty((len(tst), dictionary_size))
Y_true = np.empty(len(tst)).astype(int)
for i, (datapoint, label) in enumerate(tst):
    Y_true[i] = label
    features = get_features(datapoint)
    X_test[i] = features
    
X_test_normalized = normalize(X_test)

Y_pred = model.predict(normalize(X_test_normalized))
precision, recall, fmeasure = evaluate_predictions(Y_pred, Y_true)

In [8]:
print("GaussianNB Scores:")
print("precision: {}".format(precision))
print("recall: {}".format(recall))
print("fmeasure: {}".format(fmeasure))

GaussianNB Scores:
precision: 0.6123348017621145
recall: 0.8398791540785498
fmeasure: 0.7082802547770701


In [9]:
classifier_LR = LogisticRegression()
model = classifier_LR.fit(X_train_normalized, Y_train)
Y_pred = model.predict(X_test)
precision, recall, fmeasure = evaluate_predictions(Y_pred, Y_true)
print("LogisticRegression Scores (pre DAL and WebNet):")
print("precision: {}".format(precision))
print("recall: {}".format(recall))
print("fmeasure: {}".format(fmeasure))



LogisticRegression Scores (pre DAL and WebNet):
precision: 0.6822660098522167
recall: 0.8368580060422961
fmeasure: 0.751696065128901


In [10]:
def by_coef(element):
    return element[1]

def top_features(logreg_model, k):
    coefs = np.abs(logreg_model.coef_.flatten())
    top_indexes = np.argsort(coefs)[-k::]
    top_features = list()
    for index in top_indexes:
        if index - dictionary_size + 1 == 1:
            word = "<activeness>"
        elif index - dictionary_size + 1 == 2:
            word = "<evaluation>"
        elif index - dictionary_size + 1 == 3:
            word = "<imagery>"
        else:
            word = dictionary_reverse[index]
            
        top_features.append(tuple([word, logreg_model.coef_.flatten()[index]]))
    top_features.reverse()
    return top_features
    
print(top_features(model, 10))

[('too', -3.7107639626498323), ('bad', -2.9583008069901195), ('dull', -2.5381501747452275), ('fails', -2.2214358327354966), ('funny', 2.1107629504287986), ('boring', -1.9831300308600917), ('engrossing', 1.9199729826132301), ('entertaining', 1.8785160899899584), ('worst', -1.8648488923424085), ('worth', 1.8586615698105307)]


In [11]:
def load_dal(dal_path):
    file = open(dal_path, encoding="ISO-8859-1", mode="r")
    lines = file.readlines()
    pattern = r"([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)"
    dictionary = dict()
    for line in lines[1::]:
        match = re.search(pattern, line)
        word = match[1]
        activeness = float(match[2])
        evaluation = float(match[3])
        imagery = float(match[4].strip())
        
        dictionary[word] = tuple([activeness, evaluation, imagery])
    
    return dictionary

dal = load_dal('dict_of_affect.txt')

def score_snippet(preprocessed_snippet, dal):
    scores = np.empty((len(preprocessed_snippet),3))
    index = 0
    for token, pos in preprocessed_snippet:
        if not re.search("EDIT_", token):
            match = re.search(r"(NOT_)?(.+)", token)
            if match[2] in dal:
                activeness, evaluation, imagery = dal[match[2]]
                if match[1]:
                    not_tag = -1
                else:
                    not_tag = 1

                scores[index] = not_tag * np.asarray([activeness, evaluation, imagery])
                index += 1
                
    if index == 0:
        return np.zeros(3)
    return np.average(scores[0:index], axis=0)
    

# The new get_features!
def get_features(preprocessed_snippet):
    feature_counts = np.zeros(dictionary_size + 3)
    for (token, pos) in preprocessed_snippet:
        if not re.search("EDIT", token):
            if token in dictionary:
                feature_index = dictionary[token]
                feature_counts[feature_index] = feature_counts[feature_index] + 1
    scores = score_snippet(preprocessed_snippet, dal)
    feature_counts[-3::] = scores
    return feature_counts

In [12]:
X_train = np.empty((len(trn), dictionary_size + 3))
Y_train = np.empty(len(trn)).astype(int)
for i, (datapoint, label) in enumerate(trn):
    Y_train[i] = label
    features = get_features(datapoint)
    X_train[i] = features

    
X_test = np.empty((len(tst), dictionary_size + 3))
Y_true = np.empty(len(tst)).astype(int)
for i, (datapoint, label) in enumerate(tst):
    Y_true[i] = label
    features = get_features(datapoint)
    X_test[i] = features

classifier_LR = LogisticRegression()
model_new = classifier_LR.fit(normalize(X_train), Y_train)
Y_pred = model_new.predict(normalize(X_test))
precision, recall, fmeasure = evaluate_predictions(Y_pred, Y_true)
print("Logistic Regression score (with DAL only):")
print("precision: {}".format(precision))
print("recall: {}".format(recall))
print("fmeasure: {}".format(fmeasure))
print(top_features(model_new, 10))



Logistic Regression score (with DAL only):
precision: 0.7243107769423559
recall: 0.8731117824773413
fmeasure: 0.7917808219178082
[('<evaluation>', 4.644123390530019), ('too', -3.762356773945614), ('bad', -2.8696322890617796), ('dull', -2.4411542290778008), ('fails', -2.149108086754406), ('boring', -1.970887148822075), ('engrossing', 1.9673533610089007), ('funny', 1.955179795668551), ('entertaining', 1.9073668232404466), ('worth', 1.7638530239227905)]


In [13]:
# get the wn pos tag fom nltk pos tag
def getWNTag(nltk_tag):
    wntag = ''
    if re.search(r"^NN", nltk_tag):
        return wn.NOUN
    if re.search(r"^JJ", nltk_tag):
        return wn.ADJ
    if re.search(r"^VB", nltk_tag):
        return wn.VERB
    if re.search(r"^RB", nltk_tag):
        return wn.ADV
    return ""

# Need the lemmatizer to get roots of words to be used in synonym/antonym search
wnl = WordNetLemmatizer()
def score_snippet(preprocessed_snippet, dal):
    scores = np.empty((len(preprocessed_snippet), 3))
    index = 0
    for token, pos in preprocessed_snippet:
        # if edit tag is present, ignore
        if not re.search("EDIT_", token):
            match = re.search(r"(NOT_)?(.+)", token)
            final = ""
            isAntonym = False
            if match[2] in dal:
                final = match[2]
            else:
                wntag = getWNTag(pos)
                # we enter this case if the token word is not in dal, so go through the synonyms/antonyms
                if wntag is not "":
                    # Basically, everytime something goes wrong, it throws errors
                    # If it can't lemmateize - error
                    # If it can't find a proper synonym, antonym within wn routings, error
                    # Since more often than not if it errors out in one plase, it errors out everywhere else
                    # I've opted to have a try-except statement the whole section here
                    try:
                        lemma = wnl.lemmatize(match[2], pos=wntag)
                        word = lemma + "." + wntag + ".01" # always use 1st definition since I don't know how to find correct one
                        word = wn.synset(word)
                        found = False
                        # First we go through hypernyms
                        for hypernym in word.hypernyms():
                            synonym = hypernym.name()
                            if synonym in dal:
                                final = synonym
                                break
                        # Then we go through hyponyms
                        if not found:
                            for hyponym in word.hyponyms():
                                synonym = hyponym.name()
                                if synonym in dal:
                                    final = synonym
                                    break
                        # Then the possible holonyms
                        if not found:
                            for holonym in word.member_holonyms():
                                synonym = holonym.name()
                                if synonym in dal:
                                    final = synonym
                                    break
                        # Then finally the antonyms
                        if not found:
                            for antonym in word.lemmas()[0].antonyms():
                                if antonym.name() in dal:
                                    final = antonym.name()
                                    isAntonym = True
                    except:
                        final = ""
                else:
                    final = ""
            
            if final is not "":
                activeness, evaluation, imagery = dal[final]
                if match[1]:
                    not_tag = -1
                else:
                    not_tag = 1
                    
                if isAntonym:
                    not_tag *= -1

                scores[index] = not_tag * np.asarray([activeness, evaluation, imagery])
                index += 1
            # of course if everything fails, then we skip the word by not appending anything
    # If there are no words, we just return [0,0,0]
    if index == 0:
        return np.zeros(3)
    return np.average(scores[0: index], axis=0)

In [14]:
X_train = np.empty((len(trn), dictionary_size + 3))
Y_train = np.empty(len(trn)).astype(int)
for i, (datapoint, label) in enumerate(trn):
    Y_train[i] = label
    features = get_features(datapoint)
    X_train[i] = features

    
X_test = np.empty((len(tst), dictionary_size + 3))
Y_true = np.empty(len(tst)).astype(int)
for i, (datapoint, label) in enumerate(tst):
    Y_true[i] = label
    features = get_features(datapoint)
    X_test[i] = features

classifier_LR = LogisticRegression()
model_new = classifier_LR.fit(normalize(X_train), Y_train)
Y_pred = model_new.predict(normalize(X_test))
precision, recall, fmeasure = evaluate_predictions(Y_pred, Y_true)
print("Logistic Regression Score (with DAL and WebNet):")
print("precision: {}".format(precision))
print("recall: {}".format(recall))
print("fmeasure: {}".format(fmeasure))
print(top_features(model_new, 10))



Logistic Regression Score (with DAL and WebNet):
precision: 0.7425474254742548
recall: 0.8277945619335347
fmeasure: 0.7828571428571428
[('<evaluation>', 4.643010494798919), ('too', -3.760428448374615), ('bad', -2.8750986550254027), ('dull', -2.4488438484317268), ('fails', -2.1588851186760625), ('funny', 1.9690625524063698), ('engrossing', 1.9600344824502434), ('boring', -1.9585515007958705), ('entertaining', 1.9131901405153682), ('worth', 1.7676686456282218)]
