In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import lil_matrix
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from functools import reduce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import scipy.sparse as sps
from scipy.spatial.distance import cosine
import score as s

In [27]:
train_bodies = pd.read_csv('fnc/train_bodies.csv')
train_stances = pd.read_csv('fnc/train_stances.csv')
train = train_stances.merge(train_bodies, on='Body ID')
del train['Body ID']
train.head()

Unnamed: 0,Headline,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Seth Rogen to Play Apple’s Steve Wozniak,discuss,Danny Boyle is directing the untitled film\n\n...
2,Mexico police find mass grave near site 43 stu...,unrelated,Danny Boyle is directing the untitled film\n\n...
3,Mexico Says Missing Students Not Found In Firs...,unrelated,Danny Boyle is directing the untitled film\n\n...
4,New iOS 8 bug can delete all of your iCloud do...,unrelated,Danny Boyle is directing the untitled film\n\n...


In [28]:
test_bodies = pd.read_csv('fnc/competition_test_bodies.csv')
test_stances = pd.read_csv('fnc/competition_test_stances.csv')
test = test_stances.merge(test_bodies, on='Body ID')
del test['Body ID']
test.head()

Unnamed: 0,Headline,Stance,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,unrelated,A RESPECTED senior French police officer inves...
1,Apple Stores to install safes to secure gold A...,unrelated,A RESPECTED senior French police officer inves...
2,Pregnant woman loses eye after police shoot be...,unrelated,A RESPECTED senior French police officer inves...
3,We just found out the #Ferguson Protester who ...,unrelated,A RESPECTED senior French police officer inves...
4,Police Chief In Charge of Paris Attacks Commit...,discuss,A RESPECTED senior French police officer inves...


In [29]:
# collect all texts and train vectorizer
def train_vectorizer(train, test):
    train_bd = train['articleBody']
    train_headline = train['Headline']
    test_bd = test['articleBody']
    test_headline = test['Headline']
    all_texts = np.concatenate((train_bd, train_headline, test_bd, test_headline))
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, stop_words=stopwords.words('english'), use_idf=True)
    vectorizer.fit(all_texts)
    return vectorizer

In [30]:
vectorizer = train_vectorizer(train,test)

In [31]:
# prepare tf-idf vectors for texts
def extract_tfidf_vectors(df, vectorizer):
    bd = df['articleBody']
    headline = df['Headline']
    bd_vector = vectorizer.transform(bd)
    headline_vector = vectorizer.transform(headline)
    return bd_vector, headline_vector

In [32]:
train_bd_vector, train_headline_vector = extract_tfidf_vectors(train, vectorizer)
test_bd_vector, test_headline_vector = extract_tfidf_vectors(test, vectorizer)

In [33]:
def get_cosine(a, b):
    return np.array([cosine(a[i].toarray(), b[i].toarray()) for i in range(a.shape[0])])

train_cosine = get_cosine(train_bd_vector, train_headline_vector)
test_cosine = get_cosine(test_bd_vector, test_headline_vector)

In [34]:
# train first related-unrelated model
model_related = LogisticRegression(class_weight='balanced')
x_train = sps.hstack([train_bd_vector, train_headline_vector, np.expand_dims(train_cosine, axis = 1)])
y_train = (train['Stance'] != 'unrelated').astype(int)

model_related.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [35]:
x_test = sps.hstack([test_bd_vector, test_headline_vector, np.expand_dims(test_cosine, axis = 1)])
y_test = (test['Stance'] != 'unrelated').astype(int)

y_prediction = model_related.predict(x_test)

In [36]:
recall_score(y_test, y_prediction)

0.88788221970554926

In [37]:
precision_score(y_test, y_prediction)

0.96344086021505382

In [38]:
f1_score(y_test, y_prediction)

0.9241196404891705

In [39]:
accuracy_score(y_test, y_prediction)

0.95946956282217766

In [40]:
# now prepare discuss-agree-disagree classification
train_d = train[train['Stance'] != 'unrelated']
test_d = test[test['Stance'] != 'unrelated']

vectorizer_d = train_vectorizer(train_d, test_d)

train_bd_d_vector, train_headline_d_vector = extract_tfidf_vectors(train_d, vectorizer_d)
test_bd_d_vector, test_headline_d_vector = extract_tfidf_vectors(test_d, vectorizer_d)

In [41]:
def label(n):
    if n == "agree": return 0.0
    if n == "discuss": return 1.0
    if n == "disagree": return 2.0
    if n == "unrelated": return 3.0
    
def unlabel(n):
    if n == 0.0: return "agree"
    if n == 1.0: return "discuss"
    if n == 2.0: return "disagree"
    if n == 3.0: return "unrelated"

In [42]:
train_d_cosine = get_cosine(train_bd_d_vector, train_headline_d_vector)
test_d_cosine = get_cosine(test_bd_d_vector, test_headline_d_vector)

model_discuss = LogisticRegression(class_weight='balanced', dual=True)
x_train = sps.hstack([train_bd_d_vector, train_headline_d_vector, np.expand_dims(train_d_cosine, axis = 1)])
y_train = train_d.apply(lambda row: label(row['Stance']), axis=1)
model_discuss.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=True,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [43]:
x_test = sps.hstack([test_bd_d_vector, test_headline_d_vector, np.expand_dims(test_d_cosine, axis = 1)])
y_test = test_d.apply(lambda row: label(row['Stance']), axis=1)
y_prediction = model_discuss.predict(x_test)
y_prediction

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [44]:
accuracy_score(y_test, y_prediction)

0.67483012457531144

In [45]:
# full model. it should first predict 'related-unrelated' label, then for the related - predict 'agree-discuss-disagree'
def predict(df, model, model_d, vectorizer, vectorizer_d):
    bd_vector, headline_vector = extract_tfidf_vectors(df, vectorizer)
    cosine = get_cosine(bd_vector, headline_vector)
    x_rel = sps.hstack([bd_vector, headline_vector, np.expand_dims(cosine, axis = 1)])
    y_rel = model.predict(x_rel)
    df.ix[y_rel==0, 'y'] = 3
    
    df_d = df[y_rel == 1]
    bd_d_vector, headline_d_vector = extract_tfidf_vectors(df_d, vectorizer_d)
    cosine_d = get_cosine(bd_d_vector, headline_d_vector)
    x_d = sps.hstack([bd_d_vector, headline_d_vector, np.expand_dims(cosine_d, axis = 1)])
    y_d_pred = model.predict(x_d)
    
    df.ix[y_rel == 1, 'y'] = y_d_pred
    #df.ix[y_d_pred==1, 'y'] = 1
    #df.ix[y_d_pred==2, 'y'] = 2
    return df['y']

y_prediction = predict(test, model_related, model_discuss, vectorizer, vectorizer_d)
y_test = test.apply(lambda row: label(row['Stance']), axis=1)
accuracy_score(y_test, y_prediction)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  import sys


0.87022390115295323

In [46]:
# it's a really complex model, why don't try usual multiclass logistic regression?
model = LogisticRegression(class_weight='balanced')
x_train = sps.hstack([train_bd_vector, train_headline_vector, np.expand_dims(train_cosine, axis = 1)])
y_train = train.apply(lambda row: label(row['Stance']), axis=1) 

model_related.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [47]:
y_test = test.apply(lambda row: label(row['Stance']), axis=1) 
x_test = sps.hstack([test_bd_vector, test_headline_vector, np.expand_dims(test_cosine, axis = 1)])

y_prediction = model_related.predict(x_test)

In [48]:
accuracy_score(y_test, y_prediction)

0.87640184157714551

In [49]:
# looks like accuracy is better here. let's evaluate it with competition metric
actual = list(test['Stance'])
predicted = list(map(lambda x: unlabel(x), y_prediction))
s.report_score(actual, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1032    |     1     |    511    |    359    |
-------------------------------------------------------------
| disagree  |    296    |     0     |    155    |    246    |
-------------------------------------------------------------
|  discuss  |    657    |     0     |   3042    |    765    |
-------------------------------------------------------------
| unrelated |    18     |     0     |    133    |   18198   |
-------------------------------------------------------------
Score: 9028.5 out of 11651.25	(77.48953974895397%)


77.48953974895397