# Sentiment Analysis using a Deep Neural Network

#### Feature Extraction

*(sfrees) Reference: The following code was adapted from MiniAssignment10KerasPOS.ipynb*

In [1]:
import re

positive_words = []
negative_words = []
positive_emotes = []
negative_emotes = []

def load_feature_lists():
    # Get list of positive words
    with open('data/feature_list/positiveWords.txt', 'r') as file:
        for line in file:
            positive_words.append(line.strip())

    # Get list of negative words
    with open('data/feature_list/negativeWords.txt', 'r') as file:
        for line in file:
            negative_words.append(line.strip())

    # Get list of positive emoticons
    with open('data/feature_list/positiveEmotes.txt', 'r') as file:
        for line in file:
            for e in line.split():
                positive_emotes.append(e.strip())

    # Get list of negative emoticons
    with open('data/feature_list/negativeEmotes.txt', 'r') as file:
        for line in file:
            for e in line.split():
                negative_emotes.append(e.strip())

        
def add_basic_features(text):
    """ Compute some text features.

        :param text: text on which we are selecting features 
        :type text: string
        :return: dict containing features
        :rtype: dict
    """
    
    # question marks
    nb_question_mark = text.count('?')
    question_mark = nb_question_mark > 0
    
    # exclaimation points
    nb_exclaimation_points = text.count('!')
    exclaimation_point = nb_exclaimation_points > 0
    
    # multiple periods
    elipses = re.findall(r'\.\.+', text)
    has_elipsis = len(elipses) > 0
    nb_elipsis_chars = 0
    for e in elipses:
        nb_elipsis_chars += len(e)
    
    split = text.split()
    
    # positive/negative words & emoticons
    nb_positive_words = 0
    nb_negative_words = 0
    nb_positive_emotes = 0
    nb_negative_emotes = 0
    for w in split:
        if w in positive_words:
            nb_positive_words += 1
        if w in negative_words:
            nb_negative_words += 1
        if w in positive_emotes:
            nb_positive_emotes += 1
        if w in negative_emotes:
            nb_negative_emotes += 1
            
    positive_word = nb_positive_words > 0
    negative_word = nb_negative_words > 0
    
    positive_emote = nb_positive_emotes > 0
    negative_emote = nb_negative_emotes > 0
    
    
    # determine if any word (len > 1) is all caps
    has_word_all_caps = False
    # count all such words
    nb_word_all_caps = 0
    for w in split:
        if len(w) > 1 and w.upper() == w:
            has_word_all_caps = True
            nb_word_all_caps += 1
    
    return {
        #'nb_terms': len(text.split()),
        #'nb_chars': len(text),
        
        'question_mark': question_mark,
        'nb_question_mark': nb_question_mark,
        'exclaimation_point': exclaimation_point,
        'nb_exclaimation_points': nb_exclaimation_points,
        'has_elipsis': has_elipsis,
        'nb_elipsis_chars': nb_elipsis_chars,
        
        'positive_word': positive_word,
        'nb_positive_words': nb_positive_words,
        'negative_word': negative_word,
        'nb_negative_words': nb_negative_words,
        'positive_emote': positive_emote,
        'nb_positive_emotes': nb_positive_emotes,
        'negative_emote': negative_emote,
        'nb_negative_emotes': nb_negative_emotes,
        
        'has_word_all_caps': has_word_all_caps,
        'nb_word_all_caps': nb_word_all_caps,
        'is_all_caps': text.upper() == text,
        'is_all_lower': text.lower() == text
#        'start_word_1': split[0],
#        'start_word_2': split[1],
#        'start_word_3': split[2],
#        'end_word_1': split[-1],
#        'end_word_2': split[-2],
#        'end_word_3': split[-3],
    }

def transform_to_dataset(labeled_texts):
    """
    Split labeled texts to X and y datasets and append some basic features.

    :param labeled_texts: a list of sentiment-labled texts
    :param labeled_texts: list of list of tuples (text_i, label_i)
    :return: 
    """
    X, y = [], []

    for text_sentiment in labeled_texts:
        # Add basic NLP features for each text
        X.append(add_basic_features(text_sentiment[0]))
        y.append(text_sentiment[1])
    return X, y

def identify_labels(raw_texts):
    """
    Parse each text to identify its label and text portion.
    
    :param raw_texts: a list of raw lines from the input file
    :return: a list of tuples (text, label)
    """
    out = []
    for t in raw_texts:
        split = t.split('\t')
        if len(split) < 3:
            print(split)
            continue
        out.append((split[2], split[1]))
        
    return out

#### Vectorizing Features

*(sfrees) Reference: The following is based on code taken from NLP Assignment 2*

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
#from keras.utils import np_utils

dict_vectorizer = DictVectorizer(sparse=False)
label_encoder = LabelEncoder()

def processData(texts, verbose=False):
    
    load_feature_lists()
    
    if verbose:
        print("Transform to dataset")

    X_data, y_data = transform_to_dataset(texts)

    if verbose:
        print('Vectorize features')

    dict_vectorizer.fit(X_data)
    X_data = dict_vectorizer.transform(X_data)

    if verbose:
        print('Encode classes')
        
    label_encoder.fit(y_data)
    y_data = label_encoder.transform(y_data)
    
    return X_data, y_data


def prepForPrediction(texts):
    X = []
    for t in texts:
        X.append(add_basic_features(t))
    return dict_vectorizer.transform(X)


### Loading and Processing Data

In [3]:
import codecs
import numpy as np

train_data_filenames = ["./data/english/twitter-2013test-A.txt",
                        "./data/english/twitter-2013train-A.txt",
                        "./data/english/twitter-2015test-A.txt",
                        "./data/english/twitter-2015train-A.txt",
                        "./data/english/twitter-2016train-A.txt",
                        "./data/english/twitter-2016test-A.txt"
                       ]
test_data_filename = "./data/english/SemEval2017-task4-test.subtask-A.english.txt"

data_raw = []

num_train = 0
num_test = 0

for filename in train_data_filenames:
    with codecs.open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if len(line) > 3:
                data_raw.append(line)
                num_train += 1
        
with codecs.open(test_data_filename, 'r', encoding='utf-8') as file:
    for line in file:
        if len(line) > 3:
            data_raw.append(line)
            num_test += 1
        
data_labeled = identify_labels(data_raw)

assert len(data_labeled) == (num_train+num_test) , "Data lost during labeling process: %d / %d" % (len(data_labeled), num_train+num_test)

data_X, data_y = processData(data_labeled, True)

assert len(data_X) == (num_train+num_test) , "Data lost data processing. %d / %d" % (len(data_X)+len(data_y), num_train+num_test)

data = {
  'X_train': data_X[:num_train],
  'y_train': data_y[:num_train],
  'X_val': data_X[num_train+1:],
  'y_val': data_y[num_train+1:]
}

input_dims = len(data_X[0])
num_classes = np.max(data_y) + 1

print('input_dims: %d' % (input_dims))
print('num_classes: %d' % (num_classes))

Transform to dataset
Vectorize features
Encode classes
input_dims: 18
num_classes: 3


### Defining the Network

*(sfrees) Reference: Much of the code in network.py and layers.py was implemented by Stephen in a Deep Learning assigment*

In [4]:
from src.network import *

hidden_dims = [150, 150, 100, 100, 50]
weight_scale = 2e-2
reg = 2e-3
norm = 'batchnorm'

model = FullyConnectedNet(hidden_dims, 
                          input_dim=input_dims, 
                          num_classes=num_classes, 
                          reg=reg, 
                          weight_scale=weight_scale, 
                          normalization=norm)


### Training the Network

*(sfrees) Reference: solver.py and optim.py were taken from a Deep Learning assigment, and only a few lines of code were written by Stephen.*

In [5]:
from src.solver import *
from src.network import *

solver = Solver(model, data,
                num_epochs=8, batch_size=3000,
                update_rule='adam',
                optim_config={
                  'learning_rate': 3e-5,
                },
                verbose=True,print_every=10)
solver.train()

(Iteration 1 / 112) loss: 1.134302
(Epoch 0 / 8) train acc: 0.349000; val_acc: 0.197590
(Iteration 11 / 112) loss: 1.114164
(Epoch 1 / 8) train acc: 0.357000; val_acc: 0.281853
(Iteration 21 / 112) loss: 1.098855
(Epoch 2 / 8) train acc: 0.512000; val_acc: 0.386469
(Iteration 31 / 112) loss: 1.088102
(Iteration 41 / 112) loss: 1.076002
(Epoch 3 / 8) train acc: 0.511000; val_acc: 0.502972
(Iteration 51 / 112) loss: 1.072505
(Epoch 4 / 8) train acc: 0.519000; val_acc: 0.504193
(Iteration 61 / 112) loss: 1.064843
(Epoch 5 / 8) train acc: 0.519000; val_acc: 0.506879
(Iteration 71 / 112) loss: 1.056345
(Iteration 81 / 112) loss: 1.057532
(Epoch 6 / 8) train acc: 0.521000; val_acc: 0.509973
(Iteration 91 / 112) loss: 1.055009
(Epoch 7 / 8) train acc: 0.541000; val_acc: 0.510055
(Iteration 101 / 112) loss: 1.055041
(Iteration 111 / 112) loss: 1.044893
(Epoch 8 / 8) train acc: 0.548000; val_acc: 0.509810


### Analysis

In [51]:
offset = 70
size = 10
ind = np.random.randint(num_train+1, len(data_labeled), size)
print (ind)
X = data_X[ind]
Xraw = []
for i in ind:
    Xraw.append(data_labeled[i])

scores = solver.predict(X)

for i in range(len(X)):
    print("Raw text: \"%s\"" % (Xraw[i][0].strip()))
    print("Vectorized: %s" % (X[i]))
    print("True Label: %s" % (Xraw[i][1]))
    print("Predicted Label: %d (%s)" % (np.argmax(scores[i]), label_encoder.inverse_transform(np.argmax(scores[i]))))
    print()

[50147 51669 49572 53005 48023 52734 48515 52805 43591 47261]
Raw text: "Homeopathy can be safely used to treat animals as well as humans. It can be used to treat both acute and chronic... https://t.co/CDUkO9yMK6"
Vectorized: [0. 1. 0. 0. 0. 3. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.]
True Label: positive
Predicted Label: 2 (positive)

Raw text: "Retweeted The Linux Foundation (@linuxfoundation):Microsoft has joined The Linux Foundation after a display of... https://t.co/jGsqW8kPDf"
Vectorized: [0. 1. 0. 0. 0. 3. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
True Label: neutral
Predicted Label: 1 (neutral)

Raw text: "I liked a @YouTube video https://t.co/iCNLpUrduO Nougat 7.1 for Samsung Galaxy Note 4 (SM-N910V) CM 14.1 trltevzw"
Vectorized: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 0. 0. 0.]
True Label: positive
Predicted Label: 1 (neutral)

Raw text: "#Saakashvilli, like #Tymoshenko, is looking out for #1, and not the best interests of #Ukraine or Ukrainians.https://t.co/lzFhWxzLCf"
Vectoriz

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
