In [37]:
import gensim
import numpy as np

In [36]:
wv_model_path = "/home/v2john/Documents/GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True)

In [14]:
affect_intensity_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/" + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

In [15]:
from nltk.corpus import stopwords

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in stopwords.words('english')]
    
    return " ".join(split_string)

In [16]:
import re

def clean_str(string):
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

In [17]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [18]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [174]:
emotion = "anger"

In [186]:
training_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.train.txt"
test_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.dev.target.txt"
predictions_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/predictions/" + \
    emotion + "-pred.txt"
gold_set_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/gold-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"

In [176]:
word_list = list()
word_intensities = dict()

with open(affect_intensity_file_path) as affect_intensity_file:
    for line in affect_intensity_file:
        word_int_array = line.replace("\n", "").split("\t")
        
        if (word_int_array[2] == emotion):
            word_list.append(word_int_array[0])
            word_intensities[word_int_array[0]] = word_int_array[1]
            
print(len(word_list))

1483


In [177]:
training_tweets = read_training_data(training_data_file_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
    
print(len(score_train))

857


In [178]:
x_train = list()

for tweet in tweet_train:
    x_vector = list()
    for word in word_list:
        if word in tweet:
            x_vector.append(word_intensities[word])
        else:
            x_vector.append(0.0)
#     print(len(x_vector))
    
    vector_list = list()
    for word in tweet.split():
        try:
            vector_list.append(wv_model[word])
        except Exception as e:
            pass
    
    if len(vector_list) == 0:
        vec_rep = np.zeros(300).tolist()
    else:
        vec_rep = sum(vector_list) / float(len(vector_list))
#     print(len(vec_rep))
    
    x_vector.extend(vec_rep)
    x_train.append(x_vector)

print(len(x_train[0]))

1783


In [90]:
from sklearn import model_selection, ensemble, svm

scores = \
    model_selection.cross_val_score(
        ensemble.GradientBoostingRegressor(), x_train, 
        score_train, cv=10, scoring='neg_mean_squared_error'
    )

mean_score = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, scores.std() * 2))

Accuracy: -0.03 (+/- 0.06)


In [179]:
test_tweets = read_test_data(test_data_file_path)
tweet_test = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)

print(len(tweet_test))

x_test = list()

for tweet in tweet_test:
    x_vector = list()
    for word in word_list:
        if word in tweet:
            x_vector.append(word_intensities[word])
        else:
            x_vector.append(0.0)
    
    vector_list = list()
    for word in tweet.split():
        try:
            vector_list.append(wv_model[word])
        except Exception as e:
            pass
    
    if len(vector_list) == 0:
        vec_rep = np.zeros(300).tolist()
    else:
        vec_rep = sum(vector_list) / float(len(vector_list))
        
    x_vector.extend(vec_rep)
    x_test.append(x_vector)

print(len(x_test))

84
84


In [81]:
ml_model = ensemble.GradientBoostingRegressor()
ml_model.fit(x_train, score_train)

y_test = ml_model.predict(X=x_test)

In [82]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

In [83]:
for i in range(10):
    print(tweet_test[i], y_test[i])

know going one nights takes act god fall asleep 0.542220526513
horrible lewis dunk begun networking neo geo holiday home mexico 0.450949337565
speaking ex cobblers , saw ricky holmes charlton last week tracking back amp defending dread seeing gorre ball 0.43966876062
ball watching amp rojo 'd header equally dreadful ! ! 0.378733039653
really jumanji 2 w rock , jack black , kevin hart kidding ! wtf ! thisisaterribleidea 0.363636023433
really jumanji 2 w rock , jack black , kevin hart kidding ! wtf ! thisisaterribleidea horrible 0.460589537486
losing villa ' things people worry things wo n't even matter months ' 0.514139692532
worrying worried \? n1peter 5 7 nthrow worry , cares faith leadership worry mindfulness success 0.53578353298
concerns amp anxiety n't matter shall return favor eyematter 0.507448003751
goes butterflies stomach nervous anxietyproblems 0.775157754385


# Simple Neural Network Implementation in Keras

In [196]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [197]:
# define base model
_, dim_size = (np.array(x_train).shape)
print(dim_size)
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(10000, input_dim=dim_size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

1783


In [198]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)

In [126]:
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, x_train[:20], score_train[:20], cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 0.02 (0.01) MSE


In [199]:
estimator.fit(x_train, score_train)

<keras.callbacks.History at 0x7f833ea99860>

In [200]:
y_test = estimator.predict(x_test)
print(len(y_test))

84


In [201]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

In [202]:
from sklearn import metrics

y_gold = list()
with open(gold_set_path) as gold_set_file:    
    for line in gold_set_file:
        line = line.replace("\n", "")
        split_line = line.split("\t")
        y_gold.append(float(split_line[3]))
# 0.0185 -> previous
print(metrics.mean_squared_error(y_pred=y_test, y_true=y_gold))

0.0168215740726
