In [11]:
import pandas as pd
import fasttext
from functools import reduce
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import score as s
from sklearn.preprocessing import normalize

Using TensorFlow backend.


In [2]:
train_bodies = pd.read_csv('fnc/train_bodies.csv')
train_stances = pd.read_csv('fnc/train_stances.csv')
train = train_stances.merge(train_bodies, on='Body ID')

test_bodies = pd.read_csv('fnc/competition_test_bodies.csv')
test_stances = pd.read_csv('fnc/competition_test_stances.csv')
test = test_stances.merge(test_bodies, on='Body ID')

In [3]:
# fasttext skipgram requires text file as input 
train_bd = train['articleBody']
train_headline = train['Headline']
test_bd = test['articleBody']
test_headline = test['Headline']
all_texts = np.concatenate((train_bd, train_headline, test_bd, test_headline))

In [4]:
with open('fnc/texts.txt', 'w') as f:
    f.writelines(all_texts)
    
word2vec = fasttext.skipgram('fnc/texts.txt', 'model')

In [5]:
def doc2vec(tokens, word2vec):
    word_vecs = np.array([word2vec[token] for token in tokens])
    vec = normalize(word_vecs).mean(axis=0)
    assert len(vec) == 100
    return vec

def get_tokens(text, stopwords, tokenizer):
    return list(filter(lambda x: x not in stopwords, tokenizer(text)))

def get_vectors(df, word2vec):
    sw = stopwords.words('english')
    headline_vecs = np.array(list(map(lambda text: doc2vec(get_tokens(text, sw, word_tokenize), word2vec), df['Headline'])))
    body_vecs = np.array(list(map(lambda text: doc2vec(get_tokens(text, sw, word_tokenize), word2vec), df['articleBody'])))
    return headline_vecs, body_vecs

train_headline_vectors, train_body_vectors = get_vectors(train, word2vec)
test_headline_vectors, test_body_vectors = get_vectors(test, word2vec)

In [6]:
train_cosine = np.array(list(map(lambda a: cosine_similarity(a[0].reshape(1, -1),a[1].reshape(1, -1))[0,0], zip(train_headline_vectors, train_body_vectors))))
test_cosine = np.array(list(map(lambda a: cosine_similarity(a[0].reshape(1, -1),a[1].reshape(1, -1))[0,0], zip(test_headline_vectors, test_body_vectors))))

In [7]:
def label(n):
    if n == "agree": return 0.0
    if n == "discuss": return 1.0
    if n == "disagree": return 2.0
    if n == "unrelated": return 3.0
    
def unlabel(n):
    if n == 0.0: return "agree"
    if n == 1.0: return "discuss"
    if n == 2.0: return "disagree"
    if n == 3.0: return "unrelated"

In [8]:
model = LogisticRegression(class_weight='balanced')
x_train = np.hstack((train_headline_vectors, train_body_vectors, np.expand_dims(train_cosine, axis=1)))
y_train = train.apply(lambda row: label(row['Stance']), axis=1)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [9]:
x_test = np.hstack([test_headline_vectors, test_body_vectors, np.expand_dims(test_cosine, axis=1)])
y_test = test.apply(lambda row: label(row['Stance']), axis=1)
y_prediction = model.predict(x_test)
accuracy_score(y_test, y_prediction)

0.85365757683075594

In [10]:
actual = list(test['Stance'])
predicted = list(map(lambda x: unlabel(x), y_prediction))
s.report_score(actual, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    867    |    93     |    431    |    512    |
-------------------------------------------------------------
| disagree  |    197    |    69     |    117    |    314    |
-------------------------------------------------------------
|  discuss  |    501    |    342    |   2828    |    793    |
-------------------------------------------------------------
| unrelated |    84     |    43     |    292    |   17930   |
-------------------------------------------------------------
Score: 8666.75 out of 11651.25	(74.3847226692415%)


74.3847226692415