In [2]:
from nltk.corpus import stopwords

_ENGLISH_STOPWORDS = None  # lazily-built cache of the NLTK English stop words


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() returns a list; building it for every word of
        # every tweet is wasteful, so it is materialised once and reused.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(string):
    """Drop English stop words from a whitespace-separated string.

    Args:
        string: text to filter; it is split on any whitespace.

    Returns:
        The remaining words joined by single spaces. Matching is
        case-sensitive, exactly as the words appear in the NLTK list.
    """
    stop_words = _english_stopwords()
    kept_words = [word for word in string.split() if word not in stop_words]

    return " ".join(kept_words)

In [3]:
import re

def clean_str(string):
    """Normalise a raw tweet into lower-cased, space-separated tokens.

    Steps, in order: remove @mentions, blank out every character outside
    ``A-Za-z0-9(),!?'`` and the backtick, split common English clitics
    ('s, 've, n't, 're, 'd, 'll) from their host word, pad the punctuation
    marks ``, ! ( ) ?`` with spaces, collapse whitespace runs, lower-case,
    and finally remove stop words via ``remove_stopwords``.

    Args:
        string: the raw tweet text.

    Returns:
        The cleaned text as a single string.
    """
    # Twitter handles consist only of letters, digits and underscores, so the
    # match stops there; punctuation following a mention (e.g. "@user!") is
    # kept for the tokenisation steps below instead of being swallowed.
    string = re.sub(r"@[A-Za-z0-9_]+", "", string)
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text must be the bare character: in a re.sub replacement
    # a backslash before punctuation is left alone, so " \( " would insert a
    # literal backslash into the cleaned tweet.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

In [4]:
class Tweet(object):
    """A single tweet record.

    Attributes:
        id: identifier of the tweet as read from the data file.
        text: the tweet text.
        emotion: the emotion label attached to the tweet.
        intensity: the emotion intensity score; a float for labelled data,
            or None when no score is available.
    """

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        # str.format converts every field to text, so a float or None
        # intensity no longer raises TypeError as "+" concatenation did.
        return "id: {}, text: {}, emotion: {}, intensity: {}".format(
            self.id, self.text, self.emotion, self.intensity
        )

In [5]:
def read_input_data(training_data_file_path):
    """Yield one labelled Tweet per line of a tab-separated training file.

    Each non-blank line is split on tabs into id, text, emotion and
    intensity; the text is passed through ``clean_str`` and the intensity
    is converted to float.

    Args:
        training_data_file_path: path of the tab-separated file to read.

    Yields:
        Tweet objects with a float ``intensity``.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        ValueError: if the intensity field is not a valid float.
    """
    # Decode explicitly as UTF-8 so reading does not depend on the
    # platform's default encoding.
    with open(training_data_file_path, encoding="utf-8") as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file)
                # carry no record and would otherwise raise IndexError.
                continue
            array = line.split('\t')
            yield Tweet(array[0], clean_str(array[1]),
                        array[2], float(array[3]))

def read_test_data(training_data_file_path):
    """Yield one unlabelled Tweet per line of a tab-separated test file.

    Each non-blank line is split on tabs; the first three fields are used
    as id, text and emotion, the text is passed through ``clean_str``, and
    the intensity is always set to None.

    Args:
        training_data_file_path: path of the tab-separated file to read
            (despite the parameter name, this is the test/dev file).

    Yields:
        Tweet objects whose ``intensity`` is None.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
    """
    # Decode explicitly as UTF-8 so reading does not depend on the
    # platform's default encoding.
    with open(training_data_file_path, encoding="utf-8") as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file)
                # carry no record and would otherwise raise IndexError.
                continue
            array = line.split('\t')
            yield Tweet(array[0], clean_str(array[1]),
                        array[2], None)

In [6]:
import os

# Root directory of the WASSA task files. Set the WASSA_TASK_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original author's location.
WASSA_TASK_DIR = os.environ.get(
    "WASSA_TASK_DIR",
    "/home/v2john/MEGA/Academic/Masters/UWaterloo/Research/WASSA-Task"
)

# Labelled training tweets.
training_data_file_path = \
    os.path.join(WASSA_TASK_DIR, "dataset", "anger-ratings-0to1.train.txt")
# Tweets for which intensities are to be predicted.
test_data_file_path = \
    os.path.join(WASSA_TASK_DIR, "dataset", "anger-ratings-0to1.dev.target.txt")
# Destination of the predictions written at the end of the notebook.
predictions_file_path = \
    os.path.join(WASSA_TASK_DIR, "predictions", "anger-pred.txt")

In [7]:
# Stream the labelled tweets and split them into two parallel lists:
# the cleaned texts and their intensity scores.
training_tweets = read_input_data(training_data_file_path)

labelled_pairs = [(item.text, item.intensity) for item in training_tweets]
tweet_train = [text for text, _ in labelled_pairs]
score_train = [intensity for _, intensity in labelled_pairs]

# Number of training examples read.
print(len(score_train))

857


In [21]:
import sklearn
# Import the submodules explicitly: a bare "import sklearn" does not
# guarantee that sklearn.ensemble / sklearn.feature_extraction.text are
# loaded, and the unqualified name "model_selection" was never defined.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# TF-IDF features over the cleaned training tweets.
vectorizer = \
    TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
x_train = vectorizer.fit_transform(tweet_train)
score_train = list(map(float, score_train))

# 10-fold cross-validation scored with R^2. The sparse TF-IDF matrix is
# converted to a dense array before being passed to the regressor.
# (sklearn.svm.LinearSVR() was also tried here as an alternative estimator.)
scores = \
    cross_val_score(
        GradientBoostingRegressor(), x_train.toarray(),
        score_train, cv=10, scoring='r2'
    )

# NOTE: the label says "Accuracy" but the reported value is the mean R^2
# across folds, followed by twice its standard deviation.
mean_score = scores.mean()
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, scores.std() * 2))

Accuracy: -11.56 (+/- 12.01)


In [49]:
# Load the unlabelled test tweets and collect their cleaned texts.
test_tweets = list(read_test_data(test_data_file_path))
tweet_test = [tweet.text for tweet in test_tweets]

print(len(tweet_test))
# Build the combined corpus: training texts followed by test texts.
# Slicing back to the number of training scores first keeps this cell
# idempotent - re-running it no longer appends the test tweets a second time.
tweet_train = tweet_train[:len(score_train)] + tweet_test

84


In [50]:
# LinearSVR was used without ever being imported.
from sklearn.svm import LinearSVR

# tweet_train holds the training texts followed by the test texts; the
# number of training scores marks the boundary between the two.
n_train = len(score_train)

# Fit the vectorizer once on the combined corpus and slice the result,
# instead of fitting it twice on identical input.
x_all = vectorizer.fit_transform(tweet_train)
x_train = x_all[:n_train]
x_test = x_all[n_train:]

ml_model = LinearSVR()
ml_model.fit(x_train, score_train)

# Predicted intensities, one per test tweet.
y_test = ml_model.predict(X=x_test)

In [51]:
# Write one tab-separated line per test tweet:
# id, cleaned text, emotion label, predicted intensity.
with open(predictions_file_path, 'w') as predictions_file:
    for row in range(x_test.shape[0]):
        current = test_tweets[row]
        fields = (
            str(current.id),
            current.text,
            current.emotion,
            str(y_test[row]),
        )
        predictions_file.write("\t".join(fields) + "\n")