# Sentiment Analysis using a Deep Neural Network

#### Feature Extraction

*(sfrees) Reference: The following code was adapted from MiniAssignment10KerasPOS.ipynb*

In [12]:
import re

# Get list of positive words
positive_words = []
with open('data/feature_list/positiveWords.txt', 'r') as file:
    for line in file:
        positive_words.append(line.split())
        
# Get list of negative words
negative_words = []
with open('data/feature_list/negativeWords.txt', 'r') as file:
    for line in file:
        negative_words.append(line.split())
        
# Get list of positive emoticons
positive_emotes = []
with open('data/feature_list/positiveEmotes.txt', 'r') as file:
    for line in file:
        positive_emotes.append(line.split())

# Get list of negative emoticons
negative_emotes = []
with open('data/feature_list/negativeEmotes.txt', 'r') as file:
    for line in file:
        negative_emotes.append(line.split())

        
def add_basic_features(text):
    """ Compute some text features.

        :param text: text on which we are selecting features 
        :type text: string
        :return: dict containing features
        :rtype: dict
    """
    
    # question marks
    nb_question_mark = text.count('?')
    question_mark = nb_question_mark > 0
    
    # exclaimation points
    nb_exclaimation_points = text.count('!')
    exclaimation_point = nb_exclaimation_points > 0
    
    # multiple periods
    elipses = re.findall(r'\.\.+')
    has_elipsis = len(elipses) > 0
    nb_elipsis_chars = 0
    for e in elipses:
        nb_elipsis_chars += len(e)
    
    split = text.split()
    
    # positive/negative words & emoticons
    nb_positive_words = 0
    nb_negative_words = 0
    for w in split:
        if w in positive_words:
            nb_positive_words += 1
        if w in negative_words:
            nb_negative_words += 1
        if w in positive_emotes:
            nb_positive_emotes += 1
        if w in negative_emotes:
            nb_negative_emotes += 1
            
    positive_word = nb_positive_words > 0
    negative_word = nb_negative_words > 0
    
    positive_emote = nb_positive_emotes > 0
    negative_emote = nb_negative_emotes > 0
    
    
    # determine if any word (len > 1) is all caps
    has_word_all_caps = False
    # count all such words
    nb_word_all_caps = 0
    for w in split:
        if len(w) > 1 and w.upper() == w:
            has_word_all_caps = True
            nb_word_all_caps += 1
    
    return {
        'nb_terms': len(text.split()),
        'nb_chars': len(text),
        
        'question_mark': question_mark,
        'nb_question_mark': nb_question_mark,
        'exclaimation_point': exclaimation_point,
        'nb_exclaimation_points': nb_exclaimation_points,
        'has_elipsis': has_elipsis,
        'nb_elipsis_chars': nb_elipsis_chars,
        
        'positive_word': positive_word,
        'nb_positive_word': nb_positive_word,
        'negative_word': negative_word,
        'nb_negatiev_word'
        
        'has_word_all_caps': has_word_all_caps,
        'nb_word_all_caps': nb_word_all_caps,
        'is_all_caps': text.upper() == text,
        'is_all_lower': text.lower() == text,
        'start_word-1': split[0],
        'start_word-2': split[1],
        'start_word-3': split[2],
        'end_word-1': split[-1],
        'end_word-2': split[-2],
        'end_word-3': split[-3],
    }

def transform_to_dataset(labeled_texts):
    """
    Split labeled texts to X and y datasets and append some basic features.

    :param labeled_texts: a list of sentiment-labled texts
    :param labeled_texts: list of list of tuples (text_i, label_i)
    :return: 
    """
    X, y = [], []

    for text_sentiment in labeled_texts:
        # Add basic NLP features for each text
        X.append(add_basic_features(text_sentiment[0]))
        y.append(text_sentiment[1])
    return X, y

def identify_labels(raw_texts):
    """
    Parse each text to identify its label and text portion.
    
    :param raw_texts: a list of raw lines from the input file
    :return: a list of tuples (text, label)
    """
    out = []
    for t in raw_texts:
        split = t.split('\t')
        if len(split) < 3:
            print(split)
            continue
        out.append((split[2], split[1]))
        
    return out

#### Vectorizing Features

*(sfrees) Reference: The following is based on code taken from NLP Assignment 2*

In [20]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
#from keras.utils import np_utils

dict_vectorizer = DictVectorizer(sparse=False)
label_encoder = LabelEncoder()

def processData(texts, verbose=False):
    
    if verbose:
        print("Transform to dataset")

    X_data, y_data = transform_to_dataset(texts)

    if verbose:
        print('Vectorize features')

    dict_vectorizer.fit(X_data)
    X_data = dict_vectorizer.transform(X_data)

    if verbose:
        print('Encode classes')
        
    label_encoder.fit(y_data)
    y_data = label_encoder.transform(y_data)
    
    return X_data, y_data


def prepForPrediction(texts):
    X = []
    for t in texts:
        X.append(add_basic_features(t))
    return dict_vectorizer.transform(X)


### Loading and Processing Data

In [21]:
train_data_filename = "./data/english/twitter-2016train-A.txt"
test_data_filename = "./data/english/twitter-2016test-A.txt"

data_raw = []

num_train = 0
num_test = 0

with open(train_data_filename, 'r') as file:
    for line in file:
        if len(line) > 3:
            data_raw.append(line)
            num_train += 1
        
with open(test_data_filename, 'r') as file:
    for line in file:
        if len(line) > 3:
            data_raw.append(line)
            num_test += 1
        
data_labeled = identify_labels(data_raw)

assert len(data_labeled) == (num_train+num_test) , "Data lost during labeling process: %d / %d" % (len(data_labeled), num_train+num_test)

data_X, data_y = processData(data_labeled, True)

assert len(data_X)+len(data_y) == (num_train+num_test) , "Data lost data processing. %d / %d" % (len(data_X)+len(data_y), num_train+num_test)

data = {
  'X_train': data_X[:num_train],
  'y_train': data_y[:num_train],
  'X_val': data_X[num_train+1:],
  'y_val': data[num_train+1:]
}



Transform to dataset


NameError: name 'sentence_terms' is not defined

### Defining the Network

*(sfrees) Reference: Much of the code in network.py and layers.py was implemented by Stephen in a Deep Learning assigment*

In [None]:
from src.network import *
from src.layers import *

hidden_dims = [100, 100, 100, 100, 100]
weight_scale = 2e-2

model = FullyConnectedNet(hidden_dims, weight_scale=weight_scale, normalization=None)

### Training the Network

*(sfrees) Reference: solver.py and optim.py were taken from a Deep Learning assigment, and only a few lines of code were written by Stephen.*

In [None]:
solver = Solver(model, data,
                num_epochs=10, batch_size=50,
                update_rule='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True,print_every=20)
bn_solver.train()

### Testing the Network