In [1]:
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import gensim

Using TensorFlow backend.


In [2]:
train = pd.read_csv('train_data.csv')
valid = pd.read_csv('validation_data.csv')
test = pd.read_csv('test_data.csv')
body = pd.read_csv('article_body_texts.csv')
train['class'] = train['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})
valid['class'] = valid['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})

body['articleBody'] = body['articleBody'].str.lower()
test['Headline'] = test['Headline'].str.lower()
valid['Headline'] = valid['Headline'].str.lower()
train['Headline'] = train['Headline'].str.lower()


body['articleBody'] = body['articleBody'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
train['Headline'] = train['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
valid['Headline'] = valid['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
test['Headline'] = test['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))

body["articleBody"] = body['articleBody'].str.replace('[^\w\s]',' ')
test["Headline"] = test['Headline'].str.replace('[^\w\s]',' ')
valid["Headline"] = valid['Headline'].str.replace('[^\w\s]',' ')
train["Headline"] = train['Headline'].str.replace('[^\w\s]',' ')

In [3]:
full_train = pd.merge(train, body, on='Body ID')
full_valid = pd.merge(valid, body, on = 'Body ID')
full_test = pd.merge(test, body, on='Body ID')

In [4]:
full_data = full_train.append([full_valid, full_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
%%time
from nltk.tokenize import word_tokenize
question_list = list(full_data['Headline'])# + list(full_data['articleBody'])
question_list = [' '.join(word_tokenize(q)[:17]) for q in question_list]

CPU times: user 11.7 s, sys: 56.1 ms, total: 11.8 s
Wall time: 12.5 s


In [6]:
%%time
body_list = list(full_data["articleBody"])
body_list = [' '.join(word_tokenize(q)[:200]) for q in body_list]

CPU times: user 2min 6s, sys: 373 ms, total: 2min 7s
Wall time: 2min 7s


In [7]:
whole_list = question_list + body_list

In [8]:
tokenizer = Tokenizer(num_words=30000, filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n') 
tokenizer.fit_on_texts(whole_list)

print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 19942


In [9]:
word_index = {k: v for k, v in tokenizer.word_index.items() if v < 30000}
idx_to_word = dict((v,k) for k,v in word_index.items())

In [10]:
X = tokenizer.texts_to_sequences(whole_list)
X = pad_sequences(X, padding='post', truncating='post')

In [11]:
X_headline = X[:len(X)//2]
X_article = X[len(X)//2:]

X_train_headline = X_headline[:len(full_train)]
X_valid_headline = X_headline[len(full_train):len(full_train) + len(full_valid)]
X_test_headline = X_headline[len(full_train) + len(full_valid):]

X_train_article = X_article[:len(full_train)]
X_valid_article = X_article[len(full_train):len(full_train) + len(full_valid)]
X_test_article = X_article[len(full_train) + len(full_valid):]

In [12]:
import numpy as np
embeddings = {}
EMB_DIR = 'glove.6B.300d.txt.word2vec'
f = open(EMB_DIR)
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    embeddings[word] = vector
f.close()

In [13]:
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(word_index)+1, 300))

for word, i in word_index.items():
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector
        
del embeddings

In [14]:
valid_data = pd.read_csv('validation_data_2.csv')
extra_features = pd.read_csv('full_stances.csv')
validation_extra = valid_data[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()
training_extra = extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [15]:
test_extra_features = pd.read_csv("test_validation.csv")
test_extra = test_extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [22]:
import pandas as pd
Y = pd.get_dummies(full_data['class']).values

Y_train = Y[:len(full_train)]
Y_valid = Y[len(full_train):len(full_train) + len(full_valid)]
Y_test = Y[len(full_train) + len(full_valid):]

Building GRU model

In [62]:
LSTM_DIM = 64
EMBEDDING_DIM = 300
from keras.layers import *
from keras.layers import Dropout
import tensorflow as tf
from keras.models import Model

otherInp = Input(shape = (4, ), name='extra_features')

article_input = Input(shape=(207, ), name='article_input')
embed = Embedding(input_dim=19943,
                          output_dim=EMBEDDING_DIM,
                          trainable=False, name='word_embedding_layer1', 
                          mask_zero=True)
article_input_1 = embed(article_input)
first_lstm = Bidirectional(GRU(LSTM_DIM, return_state = True, name='lstm_layer11', dropout=0.3, recurrent_dropout=0.3))
article_outputs, forward_h, backward_h = first_lstm(article_input_1)

In [63]:
headline_input = Input(shape=(207, ), name='headline_input')
embed = Embedding(input_dim=19943,
                          output_dim=EMBEDDING_DIM,
                          trainable=False, name='word_embedding_layer2', 
                          mask_zero=True)
headline_input_2 = embed(headline_input)
second_lstm = Bidirectional(GRU(LSTM_DIM, return_state = False, name='lstm_layer12', dropout=0.3, recurrent_dropout=0.3))
state_h = [forward_h, backward_h]
dec_outputs = second_lstm(headline_input_2, initial_state = state_h)

In [64]:
mergedOut = Concatenate()([article_outputs, dec_outputs, otherInp])
mergedOut = BatchNormalization()(mergedOut)

mergedOut = Dense(64, activation='relu', kernel_regularizer = regularizers.l2(0.0015))(mergedOut)
mergedOut = Dropout(0.25)(mergedOut)
mergedOut = BatchNormalization()(mergedOut)

mergedOut = Dense(4, activation='softmax', kernel_regularizer = regularizers.l2(0.0015))(mergedOut)
gru = Model([article_input, headline_input, otherInp], mergedOut)

In [65]:
gru.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

gru.fit(x = [X_train_article, X_train_headline, training_extra], 
          y = Y_train, 
          shuffle = True,
          batch_size=64, 
          epochs=3,
          validation_data=([X_valid_article, X_valid_headline, validation_extra], Y_valid))

Train on 66677 samples, validate on 2438 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb63773b978>

In [76]:
gru.fit(x = [X_train_article, X_train_headline, training_extra], 
          y = Y_train, 
          shuffle = True,
          batch_size=64, 
          epochs=1,
          validation_data=([X_valid_article, X_valid_headline, validation_extra], Y_valid))

Train on 66677 samples, validate on 2438 samples
Epoch 1/1


<keras.callbacks.History at 0x7fb6343ff518>

In [77]:
result_valid = gru.predict([X_valid_headline, X_valid_article, validation_extra])

In [78]:
from collections import Counter
Y_valid_predict = []
for i in range(len(result_valid)):
    p = max(result_valid[i])
    Y_valid_predict.append(list(result_valid[i]).index(p))

In [68]:
Y = full_data['class'].values
Y_valid_real = Y[len(full_train):len(full_train) + len(full_valid)]

In [80]:
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y_valid_real, Y_valid_predict)

0.8773584905660378

In [81]:
import collections
print(collections.Counter(Y_valid_predict))

Counter({3: 1745, 2: 564, 0: 118, 1: 11})


In [82]:
print(collections.Counter(Y_valid_real))

Counter({3.0: 1746, 2.0: 476, 0.0: 142, 1.0: 74})


In [83]:
get_accuracy(Y_valid_predict, Y_valid_real)

0.8143553389455028

In [84]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_valid_real, Y_valid_predict)

array([[  57,    3,   70,   12],
       [  14,    0,   47,   13],
       [  44,    6,  394,   32],
       [   3,    2,   53, 1688]])

In [85]:
from sklearn.metrics import f1_score
f1_score(Y_valid_real, Y_valid_predict, average=None)

array([0.43846154, 0.        , 0.75769231, 0.96705815])

In [86]:
f1_score(Y_valid_real, Y_valid_predict, average = "macro")

0.5408029989203006

Bi-directional LSTM

In [88]:
LSTM_DIM = 64
EMBEDDING_DIM = 300
from keras.layers import *
from keras.layers import Dropout
from keras import regularizers
import keras.regularizers
import tensorflow as tf
from keras.models import Sequential
m1 = Sequential(layers=[
    Embedding(input_dim=19943,  
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=False, name='word_embedding_layer1', 
                          mask_zero=True),
    Bidirectional(LSTM(LSTM_DIM, return_sequences=False, return_state = False, name='lstm_layer11')),
    Dropout(rate=0.2, name='Headline_dropout1')
])

In [89]:
m2 = Sequential(layers=[
    Embedding(input_dim=19943,
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=False, name='word_embedding_layer2', 
                          mask_zero=True),
    Bidirectional(LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer21')),
    Dropout(rate=0.2, name='Body_dropout1')
])

In [90]:
from keras.layers import * 
from keras.models import Model
from keras.layers import Concatenate
mergedOut = Concatenate()([m1.output, m2.output])

mergedOut = Dense(32, activation='relu', kernel_regularizer = regularizers.l2(0.001))(mergedOut)
mergedOut = Dense(4, activation='softmax', kernel_regularizer = regularizers.l2(0.001))(mergedOut)
lstm = Model([m1.input, m2.input], mergedOut)

In [91]:
lstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_embedding_layer1_input (In (None, None)         0                                            
__________________________________________________________________________________________________
word_embedding_layer2_input (In (None, None)         0                                            
__________________________________________________________________________________________________
word_embedding_layer1 (Embeddin (None, None, 300)    5982900     word_embedding_layer1_input[0][0]
__________________________________________________________________________________________________
word_embedding_layer2 (Embeddin (None, None, 300)    5982900     word_embedding_layer2_input[0][0]
__________________________________________________________________________________________________
bidirectio