In [2]:
import gensim
import numpy as np

In [3]:
wv_model_path = "/home/v2john/Documents/GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True)

In [4]:
affect_intensity_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/" + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

In [5]:
from nltk.corpus import stopwords

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in stopwords.words('english')]
    
    return " ".join(split_string)

In [6]:
import re

def clean_str(string):
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

In [7]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [8]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [33]:
emotion = "fear"

In [34]:
training_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.train.txt"
test_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.dev.target.txt"
predictions_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/predictions/" + \
    emotion + "-pred.txt"
gold_set_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/gold-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"

In [35]:
word_list = list()
word_intensities = dict()

with open(affect_intensity_file_path) as affect_intensity_file:
    for line in affect_intensity_file:
        word_int_array = line.replace("\n", "").split("\t")
        
        if (word_int_array[2] == emotion):
            word_list.append(word_int_array[0])
            word_intensities[word_int_array[0]] = word_int_array[1]
            
print(len(word_list))

1765


In [36]:
training_tweets = read_training_data(training_data_file_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
    
print(len(score_train))

1147


In [54]:
def vectorize_tweets(tweet_list):
    vectors = list()

    for tweet in tweet_list:
        x_vector = list()
        score = 0.0
        for word in word_list:
            if word in tweet:
                score += tweet.count(word) * float(word_intensities[word])
        x_vector.append(score)
        x_vector.append(score**2)
        x_vector.append(score**3)
        x_vector.append(score**4)

        vector_list = list()
        for word in tweet.split():
            try:
                vector_list.append(wv_model[word])
            except Exception as e:
                pass

        if len(vector_list) == 0:
            vec_rep = np.zeros(300).tolist()
        else:
            vec_rep = sum(vector_list) / float(len(vector_list))

        x_vector.extend(vec_rep)
        vectors.append(x_vector)
    
    return vectors

In [55]:
x_train = vectorize_tweets(tweet_train)
print(len(x_train))

1147


In [39]:
test_tweets = read_test_data(test_data_file_path)
tweet_test = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)

x_test = vectorize_tweets(tweet_test)
print(len(x_test))

110


# Simple Neural Network Implementation in Keras

In [24]:
import os    
os.environ['THEANO_FLAGS'] = "device=gpu,device=cuda0"    
import theano

In [25]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using Theano backend.


In [40]:
# define base model
_, dim_size = (np.array(x_train).shape)
print(dim_size)
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(10000, input_dim=dim_size, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

304


In [41]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)

In [42]:
estimator.fit(x_train, score_train)

<keras.callbacks.History at 0x7f9331dcae10>

In [43]:
y_test = estimator.predict(x_test)
print(len(y_test))

110


In [44]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

In [45]:
import numpy
import sys
import scipy.stats

def evaluate(pred,gold):
    
    f=open(pred, "rb")
    pred_lines=f.readlines()
    f.close()
    
    f=open(gold, "rb")
    gold_lines=f.readlines()
    f.close()
    

    if(len(pred_lines)==len(gold_lines)):       
        # align tweets ids with gold scores and predictions
        data_dic={}
        
        for line in gold_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:   
                data_dic[int(parts[0])]=[float(line.split('\t')[3])]
            else:
                raise ValueError('Format problem.')
        
        
        for line in pred_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:  
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(float(line.split('\t')[3]))
                    except ValueError:
                        # Invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append(0.5)
                else:
                    raise ValueError('Invalid tweet id.')
            else:
                raise ValueError('Format problem.')
            
            
        
        # lists storing gold and prediction scores
        gold_scores=[]  
        pred_scores=[]
         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for id in data_dic:
            if(len(data_dic[id])==2):
                gold_scores.append(data_dic[id][0])
                pred_scores.append(data_dic[id][1])
                if(data_dic[id][0]>=0.5):
                    gold_scores_range_05_1.append(data_dic[id][0])
                    pred_scores_range_05_1.append(data_dic[id][1])
            else:
                raise ValueError('Repeated id in test data.')
                
      
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        

        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        return (pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
                                           
                          
        
    else:
        raise ValueError('Predictions and gold data have different number of lines.')

In [53]:
pears_corr_sum = 0
spear_corr_sum = 0
pears_corr_range_05_1_sum = 0
spear_corr_range_05_1_sum = 0

for emotion in ['anger', 'fear', 'sadness', 'joy']:
    print("\n" + emotion)
    predictions_file_path = \
        "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/predictions/" + \
        emotion + "-pred.txt"
    gold_set_path = \
        "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/gold-set/" + \
        emotion + "-ratings-0to1.dev.gold.txt"
    print("pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1")
    pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1 = \
        evaluate(predictions_file_path, gold_set_path)
    print(pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
    pears_corr_sum += pears_corr
    spear_corr_sum += spear_corr
    pears_corr_range_05_1_sum += pears_corr_range_05_1
    spear_corr_range_05_1_sum += spear_corr_range_05_1
    
print("\n===============================\n")
print("Average Scores")
print("pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1")
print(pears_corr_sum/4,spear_corr_sum/4,pears_corr_range_05_1_sum/4,spear_corr_range_05_1_sum/4)


anger
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.635174983139 0.676250149915 0.416256743893 0.472286392863

fear
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.509426462749 0.475165293601 0.439536598752 0.431479714532

sadness
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.517216069726 0.502060341706 0.190507809585 0.101002709624

joy
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.68694204897 0.683766836489 0.636145850216 0.677440983208


Average Scores
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.587189891146 0.584310655428 0.420611750611 0.420552450057
