In [2]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from gensim.models.keyedvectors import KeyedVectors
import re

In [3]:
# Bag-of-words and TF-IDF vectorizers, both with scikit-learn's defaults.
vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()
# TruncatedSVD's default solver is randomized; a fixed seed keeps the LSA
# features identical across "Restart & Run All".
svd = TruncatedSVD(n_components=400, random_state=42)
# Pre-trained word vectors stored in the textual (non-binary) word2vec format.
embeddings = KeyedVectors.load_word2vec_format('../model_swm_300-6-10-low.w2v', binary=False)
# Pattern of the text that is blanked out of every tweet:
#   1. '@' followed by letters/digits,
#   2. any single character other than a letter, digit, '#', space or tab,
#   3. URL-like tokens of the form <word>://<non-space characters>,
#   4. '#' followed by characters that are not letters/digits.
# Written as a raw string so that the backslash escapes reach the regex
# engine untouched; the compiled pattern matches exactly what it did before.
regex = r"([@][A-Za-z0-9]+)|([^0-9A-Za-z# \t])|(\w+:\/\/\S+)|(#[^A-Za-z0-9]+)"
tweets = []
labels = []

In [4]:
# Build both lists from scratch so that re-running this cell cannot append a
# second copy of the corpus to lists left over from an earlier run.
tweets = []
with open('../../../data/train_semeval2018task2/tweets_us.text', 'r') as tweet_file:
    for tweet in tweet_file:
        # Replace everything matched by `regex` with a space, collapse the
        # resulting whitespace runs to single spaces, then lower-case.
        reg_tweet = ' '.join(re.sub(regex, " ", tweet).split())
        tweets.append(reg_tweet.lower())

labels = []
with open('../../../data/train_semeval2018task2/tweets_us.labels', 'r') as label_file:
    for label in label_file:
        labels.append(int(label))

# Features and targets are later paired by position, so both files must
# contribute the same number of lines.
assert len(tweets) == len(labels), (
    "tweets/labels length mismatch: %d vs %d" % (len(tweets), len(labels)))

In [5]:
tweets_total = []

# `.wv` on a KeyedVectors object is only a deprecated alias for the object
# itself and is not available in every gensim release; fall back to the
# object when the alias is missing.
word_vectors = getattr(embeddings, 'wv', embeddings)
# del embeddings
# Take the vector width from the loaded model; 300 is the fallback when the
# model does not report one.
embedding_dim = getattr(word_vectors, 'vector_size', None) or 300
for tweet in tweets:
    tokens = tweet.split()
    total = np.zeros(embedding_dim)
    for word in tokens:
        # Tokens without an embedding contribute nothing to the sum.
        if word in word_vectors:
            total += word_vectors[word]
    # NOTE(review): the sum is divided by (number of tokens + 1), counting
    # tokens that had no embedding as well. The +1 keeps empty tweets from
    # dividing by zero, but the result is not a true mean of the found
    # vectors -- confirm this is intended before changing it.
    total /= float(len(tokens) + 1)
    tweets_total.append(total)
# One row per tweet, one column per embedding dimension.
X = np.asarray(tweets_total)

In [6]:
# Single-argument print(...) produces the same text under Python 2 and 3.
print(X.shape)

(488553, 300)


In [7]:
def get_map_from_file(path):
    """Read a tab-separated lexicon file into a word -> scores mapping.

    The first line of the file is skipped. On every following line the
    first tab-separated field becomes the key and the remaining fields are
    converted to floats and stored as the value.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded lexicon file.

    Returns
    -------
    collections.defaultdict
        Maps each key to its list of floats; a missing key yields ``[]``.

    Raises
    ------
    ValueError
        If a field after the first one cannot be parsed as a float.
    """
    # Read as bytes and split first, then decode line by line.
    with open(path, 'rb') as handle:
        raw_rows = handle.read().splitlines()

    lexicon = defaultdict(list)
    for raw_row in raw_rows[1:]:
        fields = raw_row.decode('utf-8').split('\t')
        key, scores = fields[0], fields[1:]
        lexicon[key] = [float(score) for score in scores]
    return lexicon

In [8]:
def get_values_from_map(input_map, tweets, dim=10):
    """Average the lexicon score vectors of the words in each tweet.

    Each tweet is split on whitespace. The score vectors of the words found
    in ``input_map`` are summed and the sum is divided by the number of
    words in the tweet (words without an entry count as zero vectors).
    A tweet without any words yields an all-zero vector.

    Parameters
    ----------
    input_map : mapping
        Word -> sequence of ``dim`` numeric scores. Words that are missing
        or map to an empty sequence are ignored. The mapping is not modified.
    tweets : iterable of str
        Tweets to score.
    dim : int, optional
        Length of the score vectors (default 10).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of tweets, dim)``.
    """
    avg_senti_per_tweet = []
    for tweet in tweets:
        words = tweet.split()
        avg_tweet_senti = np.zeros(dim)
        for word in words:
            # .get() instead of indexing: indexing a defaultdict would add an
            # empty entry for every unseen word and grow the lexicon.
            scores = input_map.get(word)
            if scores is not None and len(scores):
                avg_tweet_senti += np.asarray(scores)
        # Normalise by the word count (not the character count of the string).
        if words:
            avg_tweet_senti /= float(len(words))
        avg_senti_per_tweet.append(avg_tweet_senti)
    # reshape keeps the result two-dimensional even when `tweets` is empty.
    return np.asarray(avg_senti_per_tweet).reshape(-1, dim)

In [9]:
# Load the NRC word-level emotion lexicon file and compute one averaged
# lexicon score vector per tweet.
input_map = get_map_from_file("../../../data/NRC-emotion-lexicon-wordlevel-v0.92_new.txt")
X_emotion = get_values_from_map(input_map, tweets)

In [10]:
# Same procedure with the NRC hashtag emotion lexicon file.
# Note: `input_map` is rebound here, so from this cell on it refers to the
# hashtag lexicon.
input_map = get_map_from_file("../../../data/NRC-Hashtag-Emotion-Lexicon-v0.2_new_new.txt")
X_hash = get_values_from_map(input_map, tweets)

In [11]:
# Place the two lexicon feature blocks side by side (column-wise).
lexicon_blocks = (X_emotion, X_hash)
X_temp1 = np.concatenate(lexicon_blocks, axis=1)

In [12]:
# Disabled alternative: raw count features. Enabling the next line would
# overwrite the averaged-embedding matrix stored in X.
# X = vectorizer.fit_transform(tweets)
# Target vector built from the integer labels read from the labels file.
y = np.asarray(labels)

In [13]:
# Fit the TF-IDF vectorizer on the cleaned tweets and transform them.
X_tfidf = tfidf_vectorizer.fit_transform(tweets)

In [14]:
# Reduce the TF-IDF matrix with the truncated SVD configured above
# (n_components=400).
X_lsa = svd.fit_transform(X_tfidf)

In [15]:
# Final design matrix: SVD-reduced TF-IDF, averaged word embeddings and the
# lexicon features, joined column-wise in that order.
feature_blocks = (X_lsa, X, X_temp1)
X_final = np.concatenate(feature_blocks, axis=1)

In [16]:
# Disabled diagnostics: feature-matrix shapes and the cumulative explained
# variance of the SVD components. Nothing in this cell is executed.
# print X_tfidf.shape
# print X_lsa.shape
# print X_final.shape
# print svd.explained_variance_ratio_
# plt.plot(np.cumsum(svd.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance');
# plt.show()

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the same
# rows land in the test split on every run.
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=.20, random_state=42)

In [None]:
# Default hyper-parameters; only the seed is set so that the fitted tree is
# the same on every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out split (for scikit-learn
# classifiers `score` is the mean accuracy).
model.score(X_test, y_test)

In [None]:
def F_1(P, R):
    """Return the F1 score, i.e. the harmonic mean of precision and recall.

    Parameters
    ----------
    P : number
        Precision.
    R : number
        Recall.

    Returns
    -------
    float
        ``2 * P * R / (P + R)``, or ``0.0`` when ``P + R`` is zero (the
        harmonic mean is undefined there, so no division is attempted).
    """
    denominator = P + R
    if denominator == 0:
        return 0.0
    # The float literal keeps Python 2 from truncating when both inputs are ints.
    return 2.0 * P * R / denominator

In [None]:
# Evaluate the fitted model on growing prefixes of the held-out split.
sample_sizes = [50, 100, 150, 20000]

precisions = []
recalls = []
for size in sample_sizes:
    y_true = y_test[:size]
    y_pred = model.predict(X_test[:size])
    precisions.append(precision_score(y_true, y_pred, average='macro'))
    recalls.append(recall_score(y_true, y_pred, average='macro'))

# Single-argument print(...) calls emit the same text under Python 2 and 3;
# print('') stands in for the bare `print` that produced a blank line.
print('Precision')
for precision in precisions:
    print(precision)
print('')
print('Recall')
for recall in recalls:
    print(recall)
print('')
# NOTE(review): this F1 is derived from the macro-averaged precision and
# recall; it is not the mean of per-class F1 scores. Confirm which
# definition the evaluation is supposed to report.
print('F1')
for precision, recall in zip(precisions, recalls):
    print(F_1(precision, recall))