In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import lil_matrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from functools import reduce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import scipy.sparse as sps
from scipy.spatial.distance import cosine
import score as s

In [21]:
# Read the training article bodies and the headline/stance rows.
train_bodies = pd.read_csv('data/train_bodies.csv')
train_stances = pd.read_csv('data/train_stances.csv')

# Attach each body to its headline rows via 'Body ID', then discard the key column.
train = train_stances.merge(train_bodies, on='Body ID').drop(columns='Body ID')
train.head()

Unnamed: 0,Headline,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Seth Rogen to Play Apple’s Steve Wozniak,discuss,Danny Boyle is directing the untitled film\n\n...
2,Mexico police find mass grave near site 43 stu...,unrelated,Danny Boyle is directing the untitled film\n\n...
3,Mexico Says Missing Students Not Found In Firs...,unrelated,Danny Boyle is directing the untitled film\n\n...
4,New iOS 8 bug can delete all of your iCloud do...,unrelated,Danny Boyle is directing the untitled film\n\n...


In [22]:
# Read the competition test article bodies and the headline/stance rows.
test_bodies = pd.read_csv('data/competition_test_bodies.csv')
test_stances = pd.read_csv('data/competition_test_stances.csv')

# Attach each body to its headline rows via 'Body ID', then discard the key column.
test = test_stances.merge(test_bodies, on='Body ID').drop(columns='Body ID')
test.head()

Unnamed: 0,Headline,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,unrelated,A RESPECTED senior French police officer inves...
1,Apple Stores to install safes to secure gold A...,unrelated,A RESPECTED senior French police officer inves...
2,Pregnant woman loses eye after police shoot be...,unrelated,A RESPECTED senior French police officer inves...
3,We just found out the #Ferguson Protester who ...,unrelated,A RESPECTED senior French police officer inves...
4,Police Chief In Charge of Paris Attacks Commit...,discuss,A RESPECTED senior French police officer inves...


Collect all texts and train vectorizer

In [23]:
def train_vectorizer(train, test):
    """Fit a TF-IDF vectorizer on every body and headline of both frames.

    The corpus is the 'articleBody' and 'Headline' columns of ``train`` followed
    by the same two columns of ``test``, so the vocabulary and IDF weights also
    cover the test texts.

    Parameters
    ----------
    train, test : DataFrame-like
        Objects exposing 'articleBody' and 'Headline' columns of text.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer (NLTK ``word_tokenize`` tokens, English stop words).
    """
    corpus = np.concatenate([
        frame[column]
        for frame in (train, test)
        for column in ('articleBody', 'Headline')
    ])
    tfidf = TfidfVectorizer(
        tokenizer=word_tokenize,
        stop_words=stopwords.words('english'),
        use_idf=True,
    )
    # fit() returns the estimator itself.
    return tfidf.fit(corpus)

In [None]:
# Fit one shared TF-IDF vectorizer on the body and headline texts of both frames.
vectorizer = train_vectorizer(train, test)

Prepare tf-idf vectors for texts

In [None]:
def extract_tfidf_vectors(df, vectorizer):
    """Transform the body and headline columns of ``df`` with ``vectorizer``.

    Parameters
    ----------
    df : DataFrame-like
        Object exposing 'articleBody' and 'Headline' columns.
    vectorizer : object
        Fitted transformer providing ``transform(texts)``.

    Returns
    -------
    tuple
        ``(body_vectors, headline_vectors)``, in that order.
    """
    return tuple(
        vectorizer.transform(df[column])
        for column in ('articleBody', 'Headline')
    )

In [None]:
# Body and headline tf-idf matrices for each split, produced by the shared vectorizer.
train_bd_vector, train_headline_vector = extract_tfidf_vectors(train, vectorizer)
test_bd_vector, test_headline_vector = extract_tfidf_vectors(test, vectorizer)

In [None]:
def get_cosine(a, b):
    """Return the row-wise cosine distance between two sparse matrices.

    Element ``i`` of the result is ``1 - cos(a[i], b[i])``, i.e. the quantity
    ``scipy.spatial.distance.cosine`` defines for a pair of vectors. It is
    computed for all rows at once with sparse operations, so no row is ever
    converted to a dense array.

    Parameters
    ----------
    a, b : scipy.sparse matrix
        Matrices of identical shape ``(n_rows, n_features)``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``n_rows``. When either row of a pair is all
        zeros the cosine is undefined; such pairs get distance 1.0 (no
        similarity) so the output never contains NaN.
    """
    dot = np.asarray(a.multiply(b).sum(axis=1), dtype=float).ravel()
    norm_a = np.sqrt(np.asarray(a.multiply(a).sum(axis=1), dtype=float).ravel())
    norm_b = np.sqrt(np.asarray(b.multiply(b).sum(axis=1), dtype=float).ravel())
    denom = norm_a * norm_b

    # Start from the zero-vector fallback and overwrite only well-defined rows.
    distance = np.ones_like(denom)
    defined = denom > 0
    distance[defined] = 1.0 - dot[defined] / denom[defined]

    # Guard against tiny floating-point excursions outside the valid range.
    return np.clip(distance, 0.0, 2.0)

# One value per row: cosine distance between the body and headline tf-idf vectors.
train_cosine = get_cosine(train_bd_vector, train_headline_vector)
test_cosine = get_cosine(test_bd_vector, test_headline_vector)

First, train a binary related-vs-unrelated model

In [None]:
# Binary target: 1 for any stance other than 'unrelated', 0 for 'unrelated'.
y_train = train['Stance'].ne('unrelated').astype(int)

# Features: body tf-idf, headline tf-idf and the cosine value as one extra column.
x_train = sps.hstack([train_bd_vector, train_headline_vector, train_cosine[:, None]])

model_related = LogisticRegression(class_weight='balanced')
model_related.fit(x_train, y_train)

In [None]:
# Same binary target and feature layout as used for training, built from the test split.
y_test = test['Stance'].ne('unrelated').astype(int)
x_test = sps.hstack([test_bd_vector, test_headline_vector, test_cosine[:, None]])

y_prediction = model_related.predict(x_test)

In [None]:
# Recall of the related/unrelated predictions on the test split.
recall_score(y_test, y_prediction)

In [None]:
# Precision of the related/unrelated predictions on the test split.
precision_score(y_test, y_prediction)

In [None]:
# F1 score of the related/unrelated predictions on the test split.
f1_score(y_test, y_prediction)

In [None]:
# Accuracy of the related/unrelated predictions on the test split.
accuracy_score(y_test, y_prediction)

In [None]:
def label(n):
    """Map a stance name to its numeric class code.

    "agree" -> 0.0, "discuss" -> 1.0, "disagree" -> 2.0, "unrelated" -> 3.0.
    Any other value yields ``None``.
    """
    stance_codes = (
        ("agree", 0.0),
        ("discuss", 1.0),
        ("disagree", 2.0),
        ("unrelated", 3.0),
    )
    for stance, code in stance_codes:
        if n == stance:
            return code
    return None
    
def unlabel(n):
    """Map a numeric class code back to its stance name.

    0.0 -> "agree", 1.0 -> "discuss", 2.0 -> "disagree", 3.0 -> "unrelated".
    Any other value yields ``None``.
    """
    code_stances = (
        (0.0, "agree"),
        (1.0, "discuss"),
        (2.0, "disagree"),
        (3.0, "unrelated"),
    )
    for code, stance in code_stances:
        if n == code:
            return stance
    return None

Multiclass logistic regression

In [None]:
model = LogisticRegression(class_weight='balanced')
x_train = sps.hstack([train_bd_vector, train_headline_vector, np.expand_dims(train_cosine, axis = 1)])
y_train = train.apply(lambda row: label(row['Stance']), axis=1) 

model_related.fit(x_train, y_train)

In [None]:
y_test = test.apply(lambda row: label(row['Stance']), axis=1) 
x_test = sps.hstack([test_bd_vector, test_headline_vector, np.expand_dims(test_cosine, axis = 1)])

y_prediction = model_related.predict(x_test)

In [None]:
# Accuracy of the four-class stance predictions on the test split.
accuracy_score(y_test, y_prediction)

Evaluate the multiclass model with the competition metric

In [None]:
# Gold stance names next to the predicted class codes converted back to names.
actual = test['Stance'].tolist()
predicted = [unlabel(code) for code in y_prediction]
s.report_score(actual, predicted)