# Define evaluation logic

In [42]:
import numpy
import sys
import scipy.stats

def evaluate(pred,gold):
    
    f=open(pred, "rb")
    pred_lines=f.readlines()
    f.close()
    
    f=open(gold, "rb")
    gold_lines=f.readlines()
    f.close()
    

    if(len(pred_lines)==len(gold_lines)):       
        # align tweets ids with gold scores and predictions
        data_dic={}
        
        for line in gold_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:   
                data_dic[int(parts[0])]=[float(line.split('\t')[3])]
            else:
                raise ValueError('Format problem.')
        
        
        for line in pred_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:  
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(float(line.split('\t')[3]))
                    except ValueError:
                        # Invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append(0.5)
                else:
                    raise ValueError('Invalid tweet id.')
            else:
                raise ValueError('Format problem.')
            
            
        # lists storing gold and prediction scores
        gold_scores=[]  
        pred_scores=[]
         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for id in data_dic:
            if(len(data_dic[id])==2):
                gold_scores.append(data_dic[id][0])
                pred_scores.append(data_dic[id][1])
                if(data_dic[id][0]>=0.5):
                    gold_scores_range_05_1.append(data_dic[id][0])
                    pred_scores_range_05_1.append(data_dic[id][1])
            else:
                raise ValueError('Repeated id in test data.')
                
      
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        

        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        return (pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
    else:
        raise ValueError('Predictions and gold data have different number of lines.')
        
def evaluate_lists(pred, gold):
    if len(pred) == len(gold):
        gold_scores=gold
        pred_scores=pred         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for i in range(len(gold_scores)):
            if(gold_scores[i]>=0.5):
                gold_scores_range_05_1.append(gold_scores[i])
                pred_scores_range_05_1.append(pred_scores[i])
                
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        
        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        return np.array([pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1])
    else:
        raise ValueError('Predictions and gold data have different number of lines.')

# Load pre-trained word vectors

In [3]:
import gc
gc.collect()

123

In [23]:
import gensim
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [4]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"

In [None]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

In [None]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

In [None]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = np.array(embedding)
    print("Done.",len(model)," words loaded!")
    return model

wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

In [None]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
print(w2v_dimensions, w2v_dimensions_1, w2v_dimensions_2)

In [5]:
from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn

lmtzr = WordNetLemmatizer()

# bi_tokens = list(bigrams(word_tokenize("This is a sample sentence!!!")))
# for bi_token in bi_tokens:
#     print(" ".join(bi_token))

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in stopwords.words('english')]
    
    return " ".join(split_string)

In [6]:
# synsetlist = list(swn.senti_synsets('super'))
# print(synsetlist[0])

In [7]:
import re
import html

def clean_str(string):  
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    string = string.replace("_NEG", "")
    string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\*", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " ,", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

# Metadata and Class Definitions

In [8]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [9]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [10]:
emotion = "sadness"

In [11]:
training_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.train.txt"
test_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.dev.target.txt"
predictions_file_path = \
    wassa_home + "predictions/" + \
    emotion + "-pred.txt"
gold_set_path = \
    wassa_home + "dataset/gold-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"

# Feature Extraction Snippets

## Emotion Intensity Lexicon

In [None]:
affect_intensity_file_path = \
    wassa_home + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [None]:
word_intensities = get_word_affect_intensity_dict(emotion)

In [None]:
poly_emo_int = PolynomialFeatures(10)

def get_emo_int_vector(tweet):
    score = 0.0
    for word in word_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(word_intensities[word])
    
    return poly_emo_int.fit_transform(np.array([score]).reshape(1, -1))[0].tolist()

In [None]:
def get_word2vec_embedding(tweet, model, dimensions):
    vector_list = list()
    for word in tweet.split():
        try:
            vector_list.append(model[word])
        except Exception as e:
            pass

    if len(vector_list) == 0:
        vec_rep = np.zeros(dimensions).tolist()
    else:
        try:
            vec_rep = sum(vector_list) / float(len(vector_list))
        except Exception as e:
            print(vector_list)
            print(e)
            raise Exception

    return vec_rep

## SentiWordNet 

In [None]:
poly_sentiwordnet = PolynomialFeatures(5)

def get_sentiwordnetscore(tweet):
    
    tweet_score = np.zeros(2)
    
    for word in tweet.split():
        synsetlist = list(swn.senti_synsets(word))
        
        if synsetlist:
            tweet_score[0] += synsetlist[0].pos_score()
            tweet_score[1] += synsetlist[0].neg_score()
            
    sentiwordnetscore_list = poly_sentiwordnet.fit_transform(tweet_score.reshape(1, -1))[0].tolist()
    
    return sentiwordnetscore_list

## Sentiment Emotion Presence Lexicon

In [None]:
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
                
    return word_list

In [None]:
word_list = get_affect_presence_list(emotion)

In [None]:
def get_sentiment_emotion_feature(tweet):
    for word in word_list:
        if word in tweet.split():
            return [1.0]
    
    return [0.0]

## Hashtag Emotion Intensity

In [None]:
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hastag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[0] == emotion):
                hastag_intensities[clean_str(word_array[1])] = float(word_array[2])
                
    return hastag_intensities

In [None]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

In [None]:
poly_emo_int = PolynomialFeatures(10)

def get_hashtag_emotion_vector(tweet):
    score = 0.0
    for word in hashtag_emotion_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(hashtag_emotion_intensities[word])
    
    return poly_emo_int.fit_transform(np.array([score]).reshape(1, -1))[0].tolist()

## Emoticon Sentiment Lexicon

In [12]:
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
emoticon_lexicon_pairs_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-pairs.txt"
pair_split_string = "---"
    
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()
emoticon_lexicon_pairs = dict()

def get_emoticon_lexicon_unigram_dict():
    with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_unigrams

def get_emoticon_lexicon_bigram_dict():
    with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_bigrams

def get_emoticon_lexicon_pairs_dict():
    with open(emoticon_lexicon_pairs_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                token_1_dict = None
                if token_1 in emoticon_lexicon_pairs.keys():
                    token_1_dict = emoticon_lexicon_pairs[token_1]
                else:
                    token_1_dict = dict()
                    
                token_1_dict[token_2] = np.array([float(val) for val in word_array[1:]])
                emoticon_lexicon_pairs[token_1] = token_1_dict
    
    return emoticon_lexicon_pairs

In [32]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

In [33]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

In [15]:
emoticon_lexicon_pairs_dict = get_emoticon_lexicon_pairs_dict()

In [21]:
# print(len(emoticon_lexicon_pairs_dict))
# random_key = list(emoticon_lexicon_pairs_dict.keys())[0]
# print(random_key)
# print(emoticon_lexicon_pairs_dict[random_key])

In [34]:
poly_emoticon_lexicon = PolynomialFeatures(5)

def get_unigram_sentiment_emoticon_lexicon_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in emoticon_lexicon_unigram_dict.keys():
            vector_list += emoticon_lexicon_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_bigram_sentiment_emoticon_lexicon_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in emoticon_lexicon_bigram_dict.keys():
            vector_list += emoticon_lexicon_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_pair_sentiment_emoticon_lexicon_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for i in range(len(tokens)):
        word_1 = clean_str(tokens[i])
        if word_1 in emoticon_lexicon_pairs_dict.keys():
            token_1_dict = emoticon_lexicon_pairs_dict[word_1]
            for j in range(i, len(tokens)):
                word_2 = clean_str(tokens[j])
                if word_2 in token_1_dict.keys():
                    vector_list += token_1_dict[word_2]
                    counter += 1
                    
    if counter > 0:
        vector_list /= counter
    return poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_sentiment_emoticon_lexicon_vector(tweet):
    final_list = list()
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list.extend(get_unigram_sentiment_emoticon_lexicon_vector(tokens))
    
    # Adding bigram features
    final_list.extend(get_bigram_sentiment_emoticon_lexicon_vector(tokens))
    
    # Adding pair features
    final_list.extend(get_pair_sentiment_emoticon_lexicon_vector(tokens))

    return final_list

## Emoticon Sentiment Aff-Neg Lexicon

In [None]:
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
    
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()

def get_emoticon_afflex_unigram_dict():
    with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_unigrams

def get_emoticon_afflex_bigram_dict():
    with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_bigrams

In [None]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

In [None]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

In [None]:
poly_emoticon_lexicon = PolynomialFeatures(5)

def get_unigram_sentiment_emoticon_afflex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in emoticon_afflex_unigram_dict.keys():
            vector_list += emoticon_afflex_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].tolist()


def get_bigram_sentiment_emoticon_afflex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in emoticon_afflex_bigram_dict.keys():
            vector_list += emoticon_afflex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_sentiment_emoticon_afflex_vector(tweet):
    final_list = list()
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list.extend(get_unigram_sentiment_emoticon_afflex_vector(tokens))
    
    # Adding bigram featunigram_list =ures
    final_list.extend(get_bigram_sentiment_emoticon_afflex_vector(tokens))

    return final_list

## Hashtag Sentiment Aff-Neg Lexicon

In [None]:
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-bigrams.txt"
    
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()

def get_hashtag_affneglex_unigram_dict():
    with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_unigrams[clean_str(word_array[0])] = np.array([float(val) for val in word_array[1:]])
    
    return hashtag_affneglex_unigrams

def get_hashtag_affneglex_bigram_dict():
    with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_bigrams[clean_str(word_array[0])] = np.array([float(val) for val in word_array[1:]])

    return hashtag_affneglex_bigrams

In [None]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

In [None]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

In [None]:
poly_hashtag_sent_affneglex = PolynomialFeatures(5)

def get_unigram_sentiment_hashtag_affneglex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in hashtag_affneglex_unigram_dict.keys():
            vector_list += hashtag_affneglex_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_hashtag_sent_affneglex.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_bigram_sentiment_hashtag_affneglex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in hashtag_affneglex_bigram_dict.keys():
            vector_list += hashtag_affneglex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_hashtag_sent_affneglex.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_sentiment_hashtag_affneglex_vector(tweet):
    final_list = list()
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list.extend(get_unigram_sentiment_hashtag_affneglex_vector(tokens))
    # Adding bigram features
    final_list.extend(get_bigram_sentiment_hashtag_affneglex_vector(tokens))

    return final_list

## Hashtag Sentiment Lexicon

In [None]:
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"
hash_sent_lex_pairs_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-pairs.txt"
pair_split_string = "---"

hash_sent_lex_unigrams = dict()
hash_sent_lex_bigrams = dict()
hash_sent_lex_pairs = dict()

def get_hash_sent_lex_unigram_dict():
    with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.arr
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                token_1_dict = None
                if token_1 in hash_sent_lex_pairs.keys():
                    token_1_dict = hash_sent_lex_pairs[token_1]
                else:
                    token_1_dict = dict()
                    
                token_1_dict[token_2] = np.array([float(val) for val in word_array[1:]])
                hash_sent_lex_pairs[token_1] = token_1_dictay([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_unigrams

def get_hash_sent_lex_bigram_dict():
    with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_bigrams

def get_hash_sent_lex_pairs_dict():
    with open(hash_sent_lex_pairs_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                token_1_dict = None
                if token_1 in hash_sent_lex_pairs.keys():
                    token_1_dict = hash_sent_lex_pairs[token_1]
                else:
                    token_1_dict = dict()
                    
                token_1_dict[token_2] = np.array([float(val) for val in word_array[1:]])
                hash_sent_lex_pairs[token_1] = token_1_dict
    
    return hash_sent_lex_pairs

In [None]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

In [None]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

In [None]:
hash_sent_lex_pairs_dict = get_hash_sent_lex_pairs_dict()

In [None]:
poly_hash_sent_lex = PolynomialFeatures(5)

def get_unigram_sentiment_hash_sent_lex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for token in tokens:
        word = clean_str(token)
        if word in hash_sent_lex_unigram_dict.keys():
            vector_list += hash_sent_lex_unigram_dict[word]
            counter += 1

    if counter > 0:
        vector_list /= counter
    
    return poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].tolist()
    
def get_bigram_sentiment_hash_sent_lex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in hash_sent_lex_bigram_dict.keys():
            vector_list += hash_sent_lex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    
    return poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].tolist()

def get_pair_sentiment_hash_sent_lex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for i in range(len(tokens)):
        word_1 = clean_str(tokens[i])
        if word_1 in hash_sent_lex_pairs_dict.keys():
            token_1_dict = hash_sent_lex_pairs_dict[word_1]
            for j in range(i, len(tokens)):
                word_2 = clean_str(tokens[j])
                if word_2 in token_1_dict.keys():
                    vector_list += token_1_dict[word_2]
                    counter += 1
    if counter > 0:
        vector_list /= counter
    return poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].tolist()
    
def get_sentiment_hash_sent_lex_vector(tweet):
    final_list = list()
    
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list.extend(get_unigram_sentiment_hash_sent_lex_vector(tokens))
    # Adding bigram features
#     final_list.extend(get_bigram_sentiment_hash_sent_lex_vector(tokens))
    # Adding pair features
#     final_list.extend(get_pair_sentiment_hash_sent_lex_vector(tokens))

    return final_list

# Reading & Vectorizing Data

In [35]:
training_tweets = read_training_data(training_data_file_path)

with open(training_data_file_path + ".cleaned", 'w') as cleaned_input_file:
    for tweet in training_tweets:
        cleaned_input_file.write(tweet.id + "\t" + tweet.text + "\n")

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
print(len(score_train))

786


In [36]:
def vectorize_tweets(tweet_list):
    vectors = list()

    for i in range(len(tweet_list)):
        
        if i%100==0:
            print("Vectorizing tweet " + str(i))
        
        x_vector = list()
#         x_vector.extend(get_word2vec_embedding(tweet_list[i], wv_model, w2v_dimensions))
#         x_vector.extend(get_word2vec_embedding(tweet_list[i], wv_model_1, w2v_dimensions_1))
#         x_vector.extend(get_word2vec_embedding(tweet_list[i], wv_model_2, w2v_dimensions_2))
#         x_vector.extend(get_emo_int_vector(tweet_list[i]))
#         x_vector.extend(get_sentiwordnetscore(tweet_list[i]))
#         x_vector.extend(get_sentiment_emotion_feature(tweet_list[i]))
#         x_vector.extend(get_hashtag_emotion_vector(tweet_list[i]))
        x_vector.extend(get_sentiment_emoticon_lexicon_vector(tweet_list[i]))
#         x_vector.extend(get_sentiment_emoticon_afflex_vector(tweet_list[i]))
#         x_vector.extend(get_sentiment_hashtag_affneglex_vector(tweet_list[i]))
#         x_vector.extend(get_sentiment_hash_sent_lex_vector(tweet_list[i]))
        vectors.append(x_vector)
    
    return vectors

In [37]:
x_train = vectorize_tweets(tweet_train)
print(len(x_train))
dimension = len(x_train[0])
print(dimension)

# with open("/tmp/dump.txt", 'w') as dump_file:
#     for i in range(len(x_train)):
#         if dimension != len(x_train[i]):
#             print(len(x_train[i]), i)
#         dump_file.write(str(x_train[i]))
#         dump_file.write("\n")

Vectorizing tweet 0
Vectorizing tweet 100
Vectorizing tweet 200
Vectorizing tweet 300
Vectorizing tweet 400
Vectorizing tweet 500
Vectorizing tweet 600
Vectorizing tweet 700
786
168


In [38]:
print(x_train[0])

[1.0, -1.1012499999999998, 44579.75, 31772.75, 1.2127515624999996, -49093.44968749999, -34989.74093749999, 1987354110.0625, 1416421251.8125, 1009507642.5625, -1.3355426582031245, 54064.16146835936, 38532.45220742186, -2188573713.7063274, -1559833903.5585153, -1111720291.3719528, 88595749388058.73, 63143705300488.3, 45003598328525.61, 32074833950227.67, 1.4707663523461907, -59538.15781703074, -42433.862993423325, 2410166802.2190933, 1717767086.293815, 1224281970.873363, -97566069013599.66, -69537005462162.72, -49560212659288.82, -35322410887688.21, 3.9495763587823114e+18, 2.8149305963694433e+18, 2.0062491625860895e+18, 1.429888078792662e+18, 1.0191056803920963e+18, -1.6196814455212423, 65566.3962960051, 46730.29162150743, -2654196190.943776, -1891691003.7810636, -1348240520.424291, 107444633501226.62, 76577627265206.7, 54578184191041.805, 38898804990066.65, -4.349470965109019e+18, -3.0999423192518487e+18, -2.2093818902979305e+18, -1.574664246770419e+18, -1.1222901305317956e+18, 1.760711

In [41]:
test_tweets = read_test_data(test_data_file_path)
with open(test_data_file_path + ".cleaned", 'w') as cleaned_input_file:
    for tweet in test_tweets:
        cleaned_input_file.write(tweet.id + "\t" + tweet.text + "\n")

tweet_test = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)

x_test = vectorize_tweets(tweet_test)
print(len(x_test))

Vectorizing tweet 0
74


# Model Training and Testing

In [43]:
from sklearn import ensemble, svm, model_selection
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor

ml_model = ensemble.GradientBoostingRegressor(max_depth=3, n_estimators=100)
# ml_model = ensemble.AdaBoostRegressor()
# ml_model = XGBRegressor(max_depth=1, n_estimators=100)

x_train = np.array(x_train)
score_train = np.array(score_train)

kf = model_selection.KFold(n_splits=5, shuffle=True)

scores = np.zeros(4)
for train_index, test_index in kf.split(x_train):
    X_train, X_test = x_train[train_index], x_train[test_index]
    y_train, y_test = score_train[train_index], score_train[test_index]
    ml_model.fit(X_train, y_train)
    y_pred = ml_model.predict(X_test)
    scores += evaluate_lists(y_pred, y_test)

avg_scores = scores/5
print("### " + emotion)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(avg_scores[0]) + " | " + str(avg_scores[1]) + " | " + \
      str(avg_scores[2]) + " | " + str(avg_scores[3]) + " |")

ml_model.fit(x_train, score_train)

y_test = ml_model.predict(x_test)

### sadness
| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |
| --- | --- | --- | --- |
| 0.560408136441 | 0.55353645987 | 0.328826812316 | 0.307461319352 |


In [44]:
y_gold = read_training_data(gold_set_path)

data_dict = dict()
diff = 0
for i in range(len(y_gold)):
    if y_gold[i].intensity >= 0.5:
        diff += y_gold[i].intensity - y_test[i]
#         print([tweet_test[i], str(y_test[i]), str(y_gold[i].intensity)])
print(diff/len(y_gold))

0.0513240455264


In [45]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )

# Evaluation based on Pearson and Spearman co-efficients

In [46]:
print("### " + emotion)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1 = \
    evaluate(predictions_file_path, gold_set_path)
print("| " + str(pears_corr) + " | " + str(spear_corr) + " | " + \
      str(pears_corr_range_05_1) + " | " + str(spear_corr_range_05_1) + " |")

### sadness
| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |
| --- | --- | --- | --- |
| 0.532916549661 | 0.532911581855 | 0.401649412521 | 0.352193817126 |


# Overall Score Estimation

In [None]:
pears_corr_sum = 0
spear_corr_sum = 0
pears_corr_range_05_1_sum = 0
spear_corr_range_05_1_sum = 0

for emotion in ['anger', 'fear', 'sadness', 'joy']:
    print("\n### " + emotion)
    predictions_file_path = \
        wassa_home + "predictions/" + \
        emotion + "-pred.txt"
    gold_set_path = \
        wassa_home + "dataset/gold-set/" + \
        emotion + "-ratings-0to1.dev.gold.txt"
    print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
    print("| --- | --- | --- | --- |")
    pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1 = \
        evaluate(predictions_file_path, gold_set_path)
    print("| " + str(pears_corr) + " | " + str(spear_corr) + " | " + \
          str(pears_corr_range_05_1) + " | " + str(spear_corr_range_05_1) + " |")
    pears_corr_sum += pears_corr
    spear_corr_sum += spear_corr
    pears_corr_range_05_1_sum += pears_corr_range_05_1
    spear_corr_range_05_1_sum += spear_corr_range_05_1
    
print("\n### Average Scores")
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(pears_corr_sum/4) + " | " + str(spear_corr_sum/4) + " | " + \
      str(pears_corr_range_05_1_sum/4) + " | " + str(spear_corr_range_05_1_sum/4) + " |")

# Simple Neural Network Implementation in Keras

In [None]:
import theano

In [None]:
import numpy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# define base model
_, dim_size = (np.array(x_train).shape)
print(dim_size)
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(500, activation='relu', input_dim=dim_size))
    model.add(Dropout(0.25))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(33, activation='relu'))
    model.add(Dense(1))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=1000, batch_size=5, verbose=0)
estimator.fit(x_train, score_train)

In [None]:
y_test = estimator.predict(x_test)
print(len(y_test))

In [None]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(y_test)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )