In [5]:
import pandas as pd
import fasttext
from functools import reduce
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import score as s
from sklearn.preprocessing import normalize

In [6]:
# Training split: one row per (headline, body) pair once the stance rows are
# joined to their article text through the shared 'Body ID' column.
train_stances = pd.read_csv('data/train_stances.csv')
train_bodies = pd.read_csv('data/train_bodies.csv')
train = pd.merge(train_stances, train_bodies, on='Body ID')

# Test split, assembled exactly the same way.
test_stances = pd.read_csv('data/competition_test_stances.csv')
test_bodies = pd.read_csv('data/competition_test_bodies.csv')
test = pd.merge(test_stances, test_bodies, on='Body ID')

It is strange, but when I add `articleBody` this way (taken from the whole merged train dataset, where each article body is included more than once), fastText trains more accurate representations and the final score is higher.

In [16]:
# Gather every text column of both splits into one flat array; this is the
# corpus the embedding model is trained on.
train_bd, train_headline = train['articleBody'], train['Headline']
test_bd, test_headline = test['articleBody'], test['Headline']
all_texts = np.concatenate([train_bd, train_headline, test_bd, test_headline])

fastText skip-gram requires a text file as input.

In [None]:
# Dump the corpus to disk, one text per line, then train skip-gram embeddings
# on that file.
with open('data/texts.txt', 'w', encoding='utf-8') as f:
    # writelines() does not add separators itself: without the explicit '\n'
    # the last word of each text is fused with the first word of the next one.
    f.writelines(text + '\n' for text in all_texts)

word2vec = fasttext.skipgram('data/texts.txt', 'model/fasttext')

In [None]:
def doc2vec(tokens, word2vec, dim=100):
    """Embed a tokenized document as the mean of its L2-normalized word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the document.
    word2vec : mapping-like
        Object indexable by token (``word2vec[token]``) that returns the
        token's vector.
    dim : int, default 100
        Expected embedding size; 100 is the value the check used to hard-code.

    Returns
    -------
    numpy.ndarray of shape (dim,)
        Mean of the unit-length word vectors, or an all-zero vector when the
        document has no tokens.
    """
    tokens = list(tokens)
    if not tokens:
        # With no tokens there is no 2-D matrix to normalize (normalize() would
        # raise), so fall back to a neutral zero embedding.
        return np.zeros(dim)
    word_vecs = np.array([word2vec[token] for token in tokens])
    # Normalize each word vector first so every token contributes equally to
    # the mean, regardless of its vector's magnitude.
    vec = normalize(word_vecs).mean(axis=0)
    assert len(vec) == dim, "expected %d-dimensional vectors, got %d" % (dim, len(vec))
    return vec

def get_tokens(text, stopwords, tokenizer):
    """Tokenize ``text`` with ``tokenizer`` and drop tokens found in ``stopwords``.

    Returns the surviving tokens as a list, in their original order.
    """
    return [token for token in tokenizer(text) if token not in stopwords]

def get_vectors(df, word2vec):
    """Embed the 'Headline' and 'articleBody' columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Headline' and 'articleBody' text columns.
    word2vec : mapping-like
        Token -> vector lookup passed through to ``doc2vec``.

    Returns
    -------
    (headline_vecs, body_vecs) : tuple of numpy.ndarray
        One document vector per row of ``df``, in row order.
    """
    # A set gives constant-time membership tests in get_tokens; the list
    # returned by NLTK would be scanned linearly for every single token.
    sw = set(stopwords.words('english'))

    def embed_column(texts):
        # The same text appears in many rows (each article body is paired with
        # several headlines), so tokenize and embed every distinct text once.
        cache = {}
        for text in texts:
            if text not in cache:
                cache[text] = doc2vec(get_tokens(text, sw, word_tokenize), word2vec)
        return np.array([cache[text] for text in texts])

    return embed_column(df['Headline']), embed_column(df['articleBody'])

# Document embeddings for the headline and the body of every row in each split.
train_headline_vectors, train_body_vectors = get_vectors(df=train, word2vec=word2vec)
test_headline_vectors, test_body_vectors = get_vectors(df=test, word2vec=word2vec)

In [None]:
def rowwise_cosine(left, right):
    """Cosine similarity between matching rows of two equally shaped 2-D arrays.

    Returns a 1-D array whose i-th entry is the cosine similarity of
    ``left[i]`` and ``right[i]``. A row of all zeros yields a similarity of 0.
    """
    left = np.asarray(left, dtype=float)
    right = np.asarray(right, dtype=float)
    if left.size == 0:
        # No rows to compare.
        return np.zeros(0)

    def unit_rows(matrix):
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Avoid dividing by zero: an all-zero row stays all-zero.
        norms[norms == 0.0] = 1.0
        return matrix / norms

    # Row-wise dot product of the unit vectors, computed in one vectorized
    # pass instead of one library call per (headline, body) pair.
    return np.einsum('ij,ij->i', unit_rows(left), unit_rows(right))


train_cosine = rowwise_cosine(train_headline_vectors, train_body_vectors)
test_cosine = rowwise_cosine(test_headline_vectors, test_body_vectors)

In [None]:
# Single source of truth for the stance <-> class-id encoding, so the two
# directions cannot drift apart.
_STANCE_TO_ID = {"agree": 0.0, "discuss": 1.0, "disagree": 2.0, "unrelated": 3.0}
_ID_TO_STANCE = {class_id: stance for stance, class_id in _STANCE_TO_ID.items()}


def label(n):
    """Map a stance name to its numeric class id.

    Raises
    ------
    ValueError
        If ``n`` is not one of the four known stance names (previously this
        silently returned ``None``).
    """
    try:
        return _STANCE_TO_ID[n]
    except KeyError:
        raise ValueError("unknown stance: %r" % (n,)) from None


def unlabel(n):
    """Map a numeric class id back to its stance name (inverse of ``label``).

    Raises
    ------
    ValueError
        If ``n`` is not one of the four known class ids (previously this
        silently returned ``None``).
    """
    try:
        return _ID_TO_STANCE[n]
    except KeyError:
        raise ValueError("unknown stance id: %r" % (n,)) from None

In [None]:
# Feature matrix: headline embedding | body embedding | their cosine similarity
# (column_stack turns the 1-D cosine array into a single trailing column).
x_train = np.column_stack([train_headline_vectors, train_body_vectors, train_cosine])
# Numeric class id for the stance of every training row.
y_train = train.apply(lambda record: label(record['Stance']), axis=1)
model = LogisticRegression(class_weight='balanced')
model.fit(x_train, y_train)

In [None]:
# Build the test features with the same column layout used for training,
# predict, and show plain accuracy as the cell output.
x_test = np.column_stack([test_headline_vectors, test_body_vectors, test_cosine])
y_test = test.apply(lambda record: label(record['Stance']), axis=1)
y_prediction = model.predict(x_test)
accuracy_score(y_test, y_prediction)

In [None]:
# Convert the predicted class ids back to stance names and pass gold and
# predicted labels to the scoring module's report.
actual = test['Stance'].tolist()
predicted = [unlabel(class_id) for class_id in y_prediction]
s.report_score(actual, predicted)