In [1]:
import gensim
import numpy as np

# Load pre-trained word vectors

In [2]:
wv_model_path = "/home/v2john/GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True)

In [250]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in stopwords.words('english')]
    
    return " ".join(split_string)

In [251]:
import re

def clean_str(string):
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

# Load affect intensity lexica

In [252]:
affect_intensity_file_path = \
    "/home/v2john/WASSA-Task/" + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [253]:
sentiment_emotion_lex_file_path = \
    "/home/v2john/WASSA-Task/" + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
                
    return word_list

In [254]:
hashtag_emotion_lex_file_path = \
    "/home/v2john/WASSA-Task/" + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hastag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[0] == emotion):
                hastag_intensities[word_array[1].replace("#", "")] = float(word_array[2])
                
    return hastag_intensities

In [255]:
print(clean_str("A night where depression is winning... #depression #fml #help"))

night depression winning depression fml help


In [256]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [257]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [258]:
emotion = "sadness"

In [259]:
training_data_file_path = \
    "/home/v2john/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.train.txt"
test_data_file_path = \
    "/home/v2john/WASSA-Task/dataset/" + \
    emotion + "-ratings-0to1.dev.target.txt"
predictions_file_path = \
    "/home/v2john/WASSA-Task/predictions/" + \
    emotion + "-pred.txt"
gold_set_path = \
    "/home/v2john/WASSA-Task/dataset/gold-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"

In [260]:
training_tweets = read_training_data(training_data_file_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
print(len(score_train))

786


In [261]:
from sklearn.preprocessing import PolynomialFeatures

poly_emo_int = PolynomialFeatures(10)
word_intensities = get_word_affect_intensity_dict(emotion)

def get_emo_int_vector(tweet):
    score = 0.0
    for word in word_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(word_intensities[word])
    
    return poly_emo_int.fit_transform(np.array([score]).reshape(1, -1))[0].tolist()

In [262]:
def get_word2vec_embedding(tweet):
    vector_list = list()
    for word in tweet.split():
        try:
            vector_list.append(wv_model[word])
        except Exception as e:
            pass

    if len(vector_list) == 0:
        vec_rep = np.zeros(300).tolist()
    else:
        vec_rep = sum(vector_list) / float(len(vector_list))

#     x_vector.extend(poly_2.fit_transform(np.array(vec_rep).reshape(1, -1))[0].tolist())
    return vec_rep

In [263]:
word_list = get_affect_presence_list(emotion)

def get_sentiment_emotion_feature(tweet):
    for word in word_list:
        if word in tweet.split():
            return [1.0]
    
    return [0.0]

In [264]:
poly_emo_int = PolynomialFeatures(10)
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

def get_hashtag_emotion_vector(tweet):
    score = 0.0
    for word in hashtag_emotion_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(hashtag_emotion_intensities[word])
    
    return poly_emo_int.fit_transform(np.array([score]).reshape(1, -1))[0].tolist()

In [265]:
def vectorize_tweets(tweet_list):
    vectors = list()

    for tweet in tweet_list:
        x_vector = list()
        x_vector.extend(get_emo_int_vector(tweet))
        x_vector.extend(get_word2vec_embedding(tweet))
        x_vector.extend(get_sentiment_emotion_feature(tweet))
        x_vector.extend(get_hashtag_emotion_vector(tweet))
        vectors.append(x_vector)
    
    return vectors

In [266]:
x_train = vectorize_tweets(tweet_train)
print(len(x_train))
print(len(x_train[0]))

786
323


In [267]:
test_tweets = read_test_data(test_data_file_path)
tweet_test = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)

x_test = vectorize_tweets(tweet_test)
print(len(x_test))

74


In [268]:
from sklearn import ensemble, svm

ml_model = ensemble.GradientBoostingRegressor(n_estimators=100)
ml_model.fit(x_train, score_train)

y_test = ml_model.predict(X=x_test)

In [269]:
import pandas as pd
y_gold = read_training_data(gold_set_path)

data_dict = dict()
for i in range(len(y_gold)):
#     pass
    if y_gold[i].intensity >= 0.5:
        print([tweet_test[i], str(y_test[i]), str(y_gold[i].intensity)])

["ball watching amp rojo 'd header equally dreadful ! !", '0.44409596986', '0.583']
["500 season i'm looking point depressing royals", '0.636485413893', '0.688']
['want watch netflix stuck class depressing', '0.630160579608', '0.667']
["buddha n't possess enough power deliver affliction !", '0.363420362236', '0.542']
['dreadful , even met catfish still thought !', '0.518253232775', '0.542']
['watching uni reveal depressing miss stingrays \\(', '0.77704229618', '0.667']
['wow watched seriously one depressing movies life', '0.606593446714', '0.667']
['really offer sacrifice daily keep safe \\? sad', '0.534663275518', '0.625']
["would frown bit , folding arms 'why every time i'm need assistance someone expects lil", '0.34722857535', '0.562']
["feel like appendix n't purpose depressed alone lonely broken cry hurt crying life", '0.805784050587', '0.833']
["angelina jolie ca n't keep man one today mourn love dead", '0.497463774149', '0.646']
["'re thief liberal mope , investigated financial 

In [270]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

In [247]:
for i in range(10):
    print(tweet_test[i], y_test[i])

ball watching &amp; rojo'd header equally dreadful!! 0.443461916625
pessimist someone who, opportunity knocks, complains noise #mikeshumor 0.395693925037
.500 season i'm looking point. #depressing #royals 0.543328477142
stars, shine,\nyou know feel.\nscent pine, \nyou know feel.\nfreedom mine,\ni know feel.\ni'm feelin' good. 0.53320158192
want watch netflix stuck class. #depressing 0.60698054845
buddha doesn't possess enough power deliver affliction! 0.440121384324
donating trump puts damper exciting season. really bad look, ricketts family. 0.462056878294
hello dear friends, back online friday night. miss you! #sober #sobriety #recovery 0.612037751261
dreadful, even met catfish still thought her! 0.484031998781
watching uni reveal depressing miss stingrays :( 0.748450264439


# Evaluation based on Pearson and Spearman co-efficients

In [248]:
import numpy
import sys
import scipy.stats

def evaluate(pred,gold):
    
    f=open(pred, "rb")
    pred_lines=f.readlines()
    f.close()
    
    f=open(gold, "rb")
    gold_lines=f.readlines()
    f.close()
    

    if(len(pred_lines)==len(gold_lines)):       
        # align tweets ids with gold scores and predictions
        data_dic={}
        
        for line in gold_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:   
                data_dic[int(parts[0])]=[float(line.split('\t')[3])]
            else:
                raise ValueError('Format problem.')
        
        
        for line in pred_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:  
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(float(line.split('\t')[3]))
                    except ValueError:
                        # Invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append(0.5)
                else:
                    raise ValueError('Invalid tweet id.')
            else:
                raise ValueError('Format problem.')
            
            
        
        # lists storing gold and prediction scores
        gold_scores=[]  
        pred_scores=[]
         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for id in data_dic:
            if(len(data_dic[id])==2):
                gold_scores.append(data_dic[id][0])
                pred_scores.append(data_dic[id][1])
                if(data_dic[id][0]>=0.5):
                    gold_scores_range_05_1.append(data_dic[id][0])
                    pred_scores_range_05_1.append(data_dic[id][1])
            else:
                raise ValueError('Repeated id in test data.')
                
      
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        

        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        return (pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
                                           
                          
        
    else:
        raise ValueError('Predictions and gold data have different number of lines.')

In [249]:
print(emotion)
print("pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1")
print(evaluate(predictions_file_path, gold_set_path))

sadness
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
(0.33801056257582413, 0.38882479200282999, 0.063053061220949877, 0.075499019418294469)


# Simple Neural Network Implementation in Keras

In [None]:
import theano

In [None]:
import numpy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# define base model
_, dim_size = (np.array(x_train).shape)
print(dim_size)
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(500, activation='relu', input_dim=dim_size))
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(33, activation='relu'))
    model.add(Dense(1))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=1000, batch_size=5, verbose=0)
estimator.fit(x_train, score_train)

In [None]:
y_test = estimator.predict(x_test)
print(len(y_test))

In [None]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

# Overall Score estimation

In [271]:
pears_corr_sum = 0
spear_corr_sum = 0
pears_corr_range_05_1_sum = 0
spear_corr_range_05_1_sum = 0

for emotion in ['anger', 'fear', 'sadness', 'joy']:
    print("\n" + emotion)
    predictions_file_path = \
        "/home/v2john/WASSA-Task/predictions/" + \
        emotion + "-pred.txt"
    gold_set_path = \
        "/home/v2john/WASSA-Task/dataset/gold-set/" + \
        emotion + "-ratings-0to1.dev.gold.txt"
    print("pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1")
    pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1 = \
        evaluate(predictions_file_path, gold_set_path)
    print(pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
    pears_corr_sum += pears_corr
    spear_corr_sum += spear_corr
    pears_corr_range_05_1_sum += pears_corr_range_05_1
    spear_corr_range_05_1_sum += spear_corr_range_05_1
    
print("\n===============================\n")
print("Average Scores")
print("pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1")
print(pears_corr_sum/4,spear_corr_sum/4,pears_corr_range_05_1_sum/4,spear_corr_range_05_1_sum/4)


anger
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.578436784908 0.558838108316 0.421587483664 0.449821862526

fear
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.54237141208 0.519331573158 0.531272324025 0.497823195335

sadness
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.53369511302 0.526159949535 0.210719221791 0.162130602022

joy
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.71664509184 0.706909264537 0.717400076535 0.714220982745


Average Scores
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1
0.592787100462 0.577809723886 0.470244776504 0.455999160657
