# Build intial word2vec and Linear SVR models

In [1]:
import gensim

In [3]:
wv_model_path = "/home/v2john/Documents/GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True)

In [4]:
import re

def clean_str(string):
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().lower()

In [5]:
def read_input_data(training_data_file_path):

    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            yield Tweet(array[0], clean_str(array[1]),
                        array[2], float(array[3]))

In [6]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + self.id + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + self.intensity

In [57]:
training_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/sadness-ratings-0to1.train.txt"
test_data_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/dataset/sadness-ratings-0to1.dev.target.txt"
predictions_file_path = \
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task/predictions/sadness-pred.txt"

In [58]:
tweets = list(read_input_data(training_data_file_path))

In [59]:
print(len(tweets))

786


In [60]:
x_train = list()
y_train = list()
for tweet in tweets:
    split_text_list = tweet.text.split()

    vector_list = list()
    for word in split_text_list:
        try:
            vector_list.append(wv_model[word])
        except Exception as e:
            pass

    sentence_vector = sum(vector_list) / float(len(vector_list))

    x_train.append(sentence_vector)
    y_train.append(tweet.intensity)

In [61]:
print(len(x_train), len(y_train))

786 786


In [62]:
from sklearn import model_selection
from sklearn.svm import LinearSVR

scores = \
    model_selection.cross_val_score(
        LinearSVR(), x_train, y_train, cv=10, scoring='r2'
    )
mean_score = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, scores.std() * 2))

Accuracy: -66.59 (+/- 49.70)


# Predicting the scores for the test dataset

In [63]:
def read_test_data(training_data_file_path):

    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            yield Tweet(array[0], clean_str(array[1]), array[2], array[3])

In [64]:
test_tweets = list(read_test_data(test_data_file_path))

In [65]:
print(len(test_tweets))

74


In [66]:
ml_model = LinearSVR()
ml_model.fit(x_train, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [67]:
x_test = list()
for tweet in test_tweets:
    split_text_list = tweet.text.split()

    vector_list = list()
    for word in split_text_list:
        try:
            vector_list.append(wv_model[word])
        except Exception:
            pass

    sentence_vector = sum(vector_list) / float(len(vector_list))
    x_test.append(sentence_vector)

In [68]:
y_test = ml_model.predict(X=x_test)

In [69]:
print(len(y_test))

74


In [70]:
with open(predictions_file_path, 'w') as predictions_file:
    for i in range(len(test_tweets)):
        predictions_file.write(
            str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
            test_tweets[i].emotion +"\t" + str(y_test[i]) + "\n"
        )