In [1]:
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from gensim.models.keyedvectors import KeyedVectors
import re

In [2]:
# Shared objects used by the cells below.

# Bag-of-words vectorizer; in the visible cells it is only referenced by a
# commented-out line.
vectorizer = CountVectorizer()
# TF-IDF vectorizer, fitted on the cleaned tweets further down.
tfidf_vectorizer = TfidfVectorizer()
# LSA projection of the TF-IDF matrix down to 400 components. The seed is
# fixed because the default solver is randomized, so without it the
# components (and everything derived from them) differ from run to run.
svd = TruncatedSVD(n_components=400, random_state=42)
# Pre-trained word vectors stored in word2vec text format (binary=False).
embeddings = KeyedVectors.load_word2vec_format('../model_swm_300-6-10-low.w2v', binary=False)
# Tweet clean-up pattern; every match is replaced by a space when loading.
# Alternatives, in order:
#   1. @mentions
#   2. any single character that is not an ASCII letter/digit, '#', space or tab
#   3. URLs of the form scheme://...
#   4. a '#' followed by non-alphanumeric characters
# NOTE: the literal relies on regex escapes inside a non-raw string; it is
# kept byte-for-byte here so the compiled pattern is unchanged.
regex = "([@][A-Za-z0-9]+)|([^0-9A-Za-z# \t])|(\w+:\/\/\S+)|(#[^A-Za-z0-9]+)"
# Cleaned tweet texts and their integer labels, populated by the loading cell.
tweets = []
labels = []

In [3]:
# Read the training tweets and their labels (one item per line in each file).
#
# Both lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell cannot add a second copy of the data.
with open('../../../data/train_semeval2018task2/tweets_us.text', 'r') as tweets_fh:
    # Replace every regex match with a space, collapse runs of whitespace,
    # then lowercase the result.
    tweets = [' '.join(re.sub(regex, " ", tweet).split()).lower() for tweet in tweets_fh]
with open('../../../data/train_semeval2018task2/tweets_us.labels', 'r') as labels_fh:
    labels = [int(label) for label in labels_fh]

# The feature matrices are built row-by-row from `tweets` and paired with
# `labels` by position, so a length mismatch must be caught here.
if len(tweets) != len(labels):
    raise ValueError('Read %d tweets but %d labels' % (len(tweets), len(labels)))

In [9]:
# Build one 300-dimensional feature vector per tweet by averaging the
# embeddings of its tokens.
tweets_total = []

# `embeddings` was loaded with KeyedVectors.load_word2vec_format, so it is
# already a KeyedVectors object supporting `word in ...` and `...[word]`.
# The `.wv` alias used before is deprecated and missing from newer gensim
# releases, so the object is used directly.
word_vectors = embeddings
for tweet in tweets:
    tokens = tweet.split()
    total = np.zeros(300)
    for word in tokens:
        # Out-of-vocabulary tokens contribute nothing to the sum.
        if word in word_vectors:
            total += word_vectors[word]
    if tokens:
        # As before, the sum is divided by the full token count, including
        # out-of-vocabulary tokens.
        total /= float(len(tokens))
    # A tweet that is empty after cleaning keeps an all-zero vector rather
    # than being skipped: dropping the row would shift every later row out of
    # step with `labels` and with the TF-IDF matrix built from `tweets`.
    tweets_total.append(total)
X = np.asarray(tweets_total)

In [5]:
# Show (rows, columns) of the averaged-embedding matrix. The single-argument
# parenthesised form prints the same text under Python 2 and Python 3.
print(X.shape)

(488553, 300)


In [11]:
# Disabled alternative: bag-of-words features instead of the embedding matrix.
# X = vectorizer.fit_transform(tweets)
# Label vector for the classifier, built from the list read from the labels file.
y = np.asarray(labels)

In [15]:
# Fit the TF-IDF vectorizer on the cleaned tweets and transform them into a
# document-term matrix in one step.
X_tfidf = tfidf_vectorizer.fit_transform(tweets)

In [20]:
# Reduce the TF-IDF matrix with the TruncatedSVD configured earlier
# (n_components=400).
X_lsa = svd.fit_transform(X_tfidf)

In [33]:
# Place the LSA features and the averaged-embedding features side by side
# (column-wise); both inputs must therefore have the same number of rows.
# NOTE(review): the train/test split cell visible in this notebook passes X,
# not X_final — confirm which feature set is meant to be evaluated.
X_final = np.concatenate([X_lsa, X], axis=1)

In [35]:
# Disabled diagnostics: matrix shapes and the cumulative explained-variance
# curve of the SVD. Everything in this cell is commented out, so any output
# stored under it comes from an earlier version of the cell (presumably from
# printing X_lsa.shape — TODO confirm, or clear the stale output).
# print X_tfidf.shape
# print X_lsa.shape
# print X_final.shape
# print svd.explained_variance_ratio_
# plt.plot(np.cumsum(svd.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance');
# plt.show()

(488553, 400)


In [12]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every score computed from it, is the same on each run.
# NOTE(review): this splits the averaged-embedding matrix X, not X_final
# (LSA + embeddings) — confirm which feature set is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42)

In [None]:
# Train a logistic-regression classifier on the training split.
# NOTE(review): the stored execution count for this cell is empty, so it may
# not have finished running when the notebook was saved — re-run to confirm.
model = LogisticRegression() # use default parameters
model.fit(X_train, y_train)

In [23]:
# Mean accuracy of the fitted classifier on the held-out split.
model.score(X_test, y_test)

0.2958520535047231

In [13]:
def F_1(P, R):
    """Return the F1 score, i.e. the harmonic mean of precision and recall.

    Parameters
    ----------
    P : number
        Precision.
    R : number
        Recall.

    Returns
    -------
    float
        ``2 * P * R / (P + R)``, or ``0.0`` when ``P + R`` is zero (the
        formula is undefined there and would otherwise divide by zero).
    """
    denominator = P + R
    if denominator == 0:
        return 0.0
    # The float literal forces true division even for integer inputs under
    # Python 2.
    return 2.0 * P * R / denominator

In [19]:
# Evaluate the fitted model on growing prefixes of the held-out split and
# report macro-averaged precision, recall and the F1 derived from them.
sample_sizes = [50, 100, 150, 20000]

precisions = []
recalls = []
for size in sample_sizes:
    # Slicing past the end of the test split simply yields every available row.
    X_eval = X_test[:size]
    y_eval = y_test[:size]
    predictions = model.predict(X_eval)
    precisions.append(precision_score(y_eval, predictions, average='macro'))
    recalls.append(recall_score(y_eval, predictions, average='macro'))

# Output layout: one value per sample size, in the order of `sample_sizes`.
# print(...) with a single argument behaves the same under Python 2 and 3;
# print('') emits the blank separator line in both.
print('Precision')
for precision in precisions:
    print(precision)
print('')
print('Recall')
for recall in recalls:
    print(recall)
print('')
# NOTE(review): this is the harmonic mean of the macro-averaged precision and
# recall, which is not the same quantity as the macro average of per-class F1
# scores — confirm which one the task's metric calls for.
print('F1')
for precision, recall in zip(precisions, recalls):
    print(F_1(precision, recall))

Precision
0.198739495798
0.23348882239
0.256892230576
0.345279083774

Recall
0.23431372549
0.237030075188
0.205917981228
0.216231807227

F1
0.21506544403
0.235246122575
0.228597935673
0.265926526017
