In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'2.2.0'

In [2]:
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import gensim

In [3]:
# Stance label -> integer class id used by the models below.
STANCE_TO_CLASS = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

# Everything that is not an ASCII letter, digit or whitespace.
# (The previous pattern used the range `A-z`, which also lets the characters
# [ \ ] ^ _ ` through because they sit between 'Z' and 'a' in ASCII.)
NON_ALNUM_PATTERN = r'[^a-zA-Z0-9\s]'


def normalize_text(series):
    """Lower-case a text column and replace every non-alphanumeric char with a space.

    Parameters
    ----------
    series : pandas.Series of str

    Returns
    -------
    pandas.Series
        New series; missing values are passed through unchanged.
    """
    # regex=True is passed explicitly: the default of `str.replace` differs
    # between pandas versions, and a literal match would silently do nothing.
    return series.str.lower().str.replace(NON_ALNUM_PATTERN, ' ', regex=True)


train = pd.read_csv('train_data.csv')
valid = pd.read_csv('validation_data.csv')
test = pd.read_csv('test_data.csv')
body = pd.read_csv('article_body_texts.csv')

# Only train/valid get a numeric class column here; test is left without one.
train['class'] = train['Stance'].map(STANCE_TO_CLASS)
valid['class'] = valid['Stance'].map(STANCE_TO_CLASS)

# One cleaning pass is enough: after it only [a-zA-Z0-9\s] remains, so the
# former second pass with '[^\w\s]' had nothing left to replace.
body['articleBody'] = normalize_text(body['articleBody'])
for frame in (train, valid, test):
    frame['Headline'] = normalize_text(frame['Headline'])

# Attach the article text to every (headline, body id) pair.
full_train = pd.merge(train, body, on='Body ID')
full_valid = pd.merge(valid, body, on='Body ID')
full_test = pd.merge(test, body, on='Body ID')

In [4]:
# Stack train / valid / test in that order; later cells slice the combined
# arrays back apart using len(full_train) and len(full_valid).
# pd.concat replaces the deprecated DataFrame.append, and sort=False opts in
# to the future column-order behaviour the FutureWarning asks about.
full_data = pd.concat([full_train, full_valid, full_test], sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
%%time
from nltk.tokenize import word_tokenize
# Re-join at most the first 17 NLTK tokens of every headline into one string.
question_list = [
    ' '.join(word_tokenize(headline)[:17])
    for headline in full_data['Headline']
]

CPU times: user 10.7 s, sys: 39 ms, total: 10.7 s
Wall time: 11 s


In [6]:
%%time
# Same truncation as for the headlines, but keeping up to 685 tokens per article.
body_list = [
    ' '.join(word_tokenize(article_text)[:685])
    for article_text in full_data["articleBody"]
]

CPU times: user 2min 4s, sys: 231 ms, total: 2min 4s
Wall time: 2min 8s


In [7]:
# All headlines followed by all article bodies (two equal-length halves), so a
# single tokenizer/padding pass can be split back in the middle afterwards.
whole_list = question_list + body_list

In [8]:
# Characters stripped by the Tokenizer; '?' has been removed from this list.
tokenizer = Tokenizer(num_words=30000, filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n') 
# Vocabulary is learned over headlines and article bodies together.
tokenizer.fit_on_texts(whole_list)

# Reports the size of the full word_index built by fit_on_texts.
print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 27451


In [9]:
# Keep only the words whose index is below the 30000 cut-off, then build the
# reverse (index -> word) lookup from that filtered mapping.
word_index = {
    word: idx
    for word, idx in tokenizer.word_index.items()
    if idx < 30000
}
idx_to_word = {idx: word for word, idx in word_index.items()}

In [10]:
# Texts -> integer id sequences -> one rectangular array; both padding and
# truncation are applied at the end of each sequence.
X = pad_sequences(
    tokenizer.texts_to_sequences(whole_list),
    padding='post',
    truncating='post',
)

In [11]:
# whole_list was headlines + bodies, so the first half of X holds the
# headlines and the second half the article bodies.
half = len(X) // 2
X_headline, X_article = X[:half], X[half:]

In [12]:
# Rows are ordered train, valid, test; cut the headline matrix at those boundaries.
n_train = len(full_train)
n_valid = len(full_valid)
X_train_headline = X_headline[:n_train]
X_valid_headline = X_headline[n_train:n_train + n_valid]
X_test_headline = X_headline[n_train + n_valid:]

In [13]:
# Same train / valid / test boundaries, applied to the article matrix.
n_train = len(full_train)
n_valid = len(full_valid)
X_train_article = X_article[:n_train]
X_valid_article = X_article[n_train:n_train + n_valid]
X_test_article = X_article[n_train + n_valid:]

In [14]:
# Sanity check: (number of training rows, padded sequence length).
X_train_article.shape

(66677, 695)

In [16]:
import numpy as np
# Load the pre-trained word vectors into a {word: float32 vector} dict.
EMB_DIR = 'glove.6B.300d.txt.word2vec'
EMB_VECTOR_SIZE = 300  # dimensionality of the vectors in EMB_DIR

embeddings = {}
# `with` guarantees the handle is closed even if parsing fails; the encoding
# is explicit so the result does not depend on the platform default.
with open(EMB_DIR, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip anything that is not "<word> <300 floats>". In particular a
        # word2vec-style text file (as the file name suggests - TODO confirm)
        # starts with a "<vocab_size> <dim>" header, which would otherwise be
        # stored as a one-element "vector" and later broadcast across a whole
        # row of the embedding matrix.
        if len(values) != EMB_VECTOR_SIZE + 1:
            continue
        embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

In [18]:
import numpy as np
# One row per vocabulary index plus row 0 (the padding index). Every row starts
# as uniform noise in [-0.25, 0.25); rows whose word has a pre-trained vector
# are then overwritten, so out-of-vocabulary words keep their random row.
vocab_rows = len(word_index) + 1
embeddings_matrix = np.random.uniform(-0.25, 0.25, size=(vocab_rows, 300))

for token, row in word_index.items():
    pretrained = embeddings.get(token)
    if pretrained is not None:
        embeddings_matrix[row] = pretrained

In [19]:
# Drop the raw word -> vector dict; the cells shown below only use embeddings_matrix.
del embeddings

In [20]:
# Precomputed per-row features for the training set.
# NOTE(review): rows are presumably in the same order as full_train - confirm.
extra_features = pd.read_csv("full_stances.csv")

In [21]:
# Hand-crafted similarity features fed to the model next to the token inputs.
EXTRA_FEATURE_COLS = ['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']

valid_data = pd.read_csv('validation_data_2.csv')
validation_extra = valid_data[EXTRA_FEATURE_COLS].copy()
training_extra = extra_features[EXTRA_FEATURE_COLS].copy()

In [22]:
# Same four feature columns as for train/valid, taken from the test-set file.
EXTRA_FEATURE_COLS = ['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']

test_extra_features = pd.read_csv("test_data_preprocessing.csv")
test_extra = test_extra_features[EXTRA_FEATURE_COLS].copy()

In [23]:
import pandas as pd
# One-hot encode the class column of the stacked frame, then cut it back into
# the train / valid / test row ranges.
# NOTE(review): no 'class' column is assigned to the test frame in this
# notebook, so Y_test presumably contains only all-zero rows - confirm.
n_train = len(full_train)
n_valid = len(full_valid)

Y = pd.get_dummies(full_data['class']).values

Y_train = Y[:n_train]
Y_valid = Y[n_train:n_train + n_valid]
Y_test = Y[n_train + n_valid:]

In [24]:
from __future__ import print_function
from random import random
from numpy import array
from numpy import cumsum
from keras.optimizers import SGD, Adam
from keras.models import Sequential
from keras.layers import LSTM, Embedding
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional, Concatenate, concatenate, Dropout, Input, TimeDistributed, Flatten
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model
import numpy as np
import pickle

In [27]:
EMBEDDING_DIM = 300

# Frozen lookup table initialised from the pre-trained matrix.
# input_dim is read from the matrix itself (len(word_index) + 1 rows) instead
# of the hard-coded 27452, so it cannot drift out of sync when the vocabulary
# changes.
emb_look_up = Embedding(input_dim=embeddings_matrix.shape[0],
                        output_dim=EMBEDDING_DIM,
                        weights=[embeddings_matrix],
                        trainable=False,
                        mask_zero=False,
                        name='q_embedding_lookup')

CNN with simple attention mechanism
(credit to: https://github.com/GauravBh1010tt/DeepLearn)

In [29]:
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical
import pandas as pd
from keras.layers.convolutional import Convolution1D
import warnings
from nltk.tokenize import regexp_tokenize
import numpy as np
import gensim as gen
import keras.backend as K
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Layer,Lambda, Dropout, Activation, Input, Concatenate, Multiply
from keras.layers import Embedding, BatchNormalization
from keras.layers import Conv1D, GlobalMaxPooling1D
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
class Abs(Layer):
    """Element-wise absolute difference of two tensors.

    Called on a list ``[a, b]`` of two tensors and returns ``|a - b|``.
    """

    def __init__(self, **kwargs):
        super(Abs, self).__init__(**kwargs)

    def call(self, x, mask=None):
        # x is the list of the two input tensors.
        return K.abs(x[0] - x[1])

    def compute_output_shape(self, input_shape):
        """Return the shape of the single output tensor.

        ``input_shape`` is a list with one shape per input; the output has the
        shape of the first input, not the whole list. Keras 2 looks this
        method up by the name ``compute_output_shape``.
        """
        return input_shape[0]

    # Keras 1 name of the same hook, kept so older call sites keep working.
    get_output_shape_for = compute_output_shape

In [36]:
# ---- Hyper-parameters ------------------------------------------------------
embedding_dim = 300
LSTM_neurons = 64
dense_neuron = 32
# Sequence lengths are read from the padded arrays rather than hard-coded.
dimx = X_train_headline.shape[1]
dimy = X_train_article.shape[1]
# Width of the hand-crafted feature vector passed as the third model input.
n_extra = training_extra.shape[1]
lamda = 0.0
nb_filter = 100
filter_length = 4
vocab_size = 10000  # not read in this cell
batch_size = 64
epochs = 3
ntn_out = 16
ntn_in = nb_filter
state = False

# ---- Inputs ----------------------------------------------------------------
inpx = Input(shape=(dimx, ), dtype='int32', name='input_title')
inpy = Input(shape=(dimy, ), dtype='int32', name='input_body')
# Third input expected by the fit/predict calls ([headline, article, extra]).
# It was referenced in Model(...) below without being defined in this cell.
otherInp = Input(shape=(n_extra, ), dtype='float32', name='input_extra')

# ---- Shared frozen embedding -----------------------------------------------
EMBEDDING_DIM = 300
emb_look_up = Embedding(input_dim=embeddings_matrix.shape[0],  # len(word_index) + 1
                        output_dim=EMBEDDING_DIM,
                        weights=[embeddings_matrix],
                        trainable=False,
                        mask_zero=False,
                        name='q_embedding_lookup')
x = emb_look_up(inpx)
y = emb_look_up(inpy)

# ---- One convolutional encoder per input -----------------------------------
# Keras 2 argument names (filters / kernel_size / padding / strides) replace
# the deprecated Keras 1 spellings; the configuration itself is unchanged.
ques = Conv1D(filters=nb_filter, kernel_size=filter_length,
              padding='valid', activation='relu',
              strides=1)(x)
ques = BatchNormalization()(ques)

ans = Conv1D(filters=nb_filter, kernel_size=filter_length,
             padding='valid', activation='relu',
             strides=1)(y)
ans = BatchNormalization()(ans)

# Max over time -> one nb_filter-sized vector per text.
hx = GlobalMaxPooling1D()(ques)
hy = GlobalMaxPooling1D()(ans)

# ---- Matching features -----------------------------------------------------
h1 = Multiply()([hx, hy])   # element-wise product
h2 = Abs()([hx, hy])        # element-wise |hx - hy|
# NOTE(review): the extra features are joined to the matching vector here so
# that the declared third input is actually connected to the output; confirm
# this is where the original (lost) definition of otherInp was wired in.
h = Concatenate()([h1, h2, otherInp])

h = Dense(dense_neuron, activation='relu', name='wrap')(h)
h = BatchNormalization()(h)

# ---- 4-way stance classifier -----------------------------------------------
score = Dense(4, activation='softmax', name='score')(h)
model = Model([inpx, inpy, otherInp], score)



In [37]:
# Adam optimiser, categorical cross-entropy on the one-hot targets, and
# categorical accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [38]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_title (InputLayer)        (None, 695)          0                                            
__________________________________________________________________________________________________
input_body (InputLayer)         (None, 695)          0                                            
__________________________________________________________________________________________________
q_embedding_lookup (Embedding)  (None, 695, 300)     8235600     input_title[0][0]                
                                                                 input_body[0][0]                 
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 692, 100)     120100      q_embedding_lookup[0][0]         
__________

In [64]:
# Three inputs per sample: headline ids, article ids, hand-crafted features.
train_inputs = [X_train_headline, X_train_article, training_extra]
valid_inputs = [X_valid_headline, X_valid_article, validation_extra]

model.fit(
    x=train_inputs,
    y=Y_train,
    batch_size=64,
    epochs=2,
    shuffle=True,
    validation_data=(valid_inputs, Y_valid),
)

Train on 66677 samples, validate on 2438 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f0e82a95198>

In [34]:
from collections import Counter
def get_accuracy(predict_result, y_test):
    """Weighted score of predicted class ids against the true class ids.

    Class id 3 is treated specially. Per sample the credit is:
      * 0.25 - true class is 3 and the prediction is also 3
      * 1    - true class is not 3 and the prediction matches it
      * 0.25 - neither value is 3, but they differ
      * 0    - otherwise (exactly one of the two is 3)

    The sum is divided by the best achievable total, i.e. 0.25 for every
    true 3 plus 1 for every true 0, 1 or 2.

    Parameters
    ----------
    predict_result : indexable sequence of predicted class ids
    y_test : indexable sequence of true class ids, at least as long

    Returns
    -------
    float
        earned credit / maximum credit.
    """
    earned = 0
    for pos, predicted in enumerate(predict_result):
        actual = y_test[pos]
        if predicted == actual:
            # Exact hit: a quarter point for class 3, a full point otherwise.
            earned += 0.25 if actual == 3 else 1
        elif actual != 3 and predicted != 3:
            # Wrong class, but both sides agree it is not class 3.
            earned += 0.25

    true_counts = Counter(y_test)
    best_possible = true_counts[3] * 0.25 + (true_counts[0] + true_counts[1] + true_counts[2]) * 1
    return earned / best_possible

In [66]:
# Model outputs for the validation split; the final layer is a 4-unit softmax.
result_valid = model.predict([X_valid_headline, X_valid_article, validation_extra])

In [67]:
from collections import Counter
# Predicted class id = position of the largest value in each output row
# (the first such position if the maximum is tied).
Y_valid_predict = [
    list(row_probs).index(max(row_probs))
    for row_probs in result_valid
]

In [68]:
# NOTE: this rebinds Y from the one-hot matrix to the raw class-id column.
Y = full_data['class'].values
# True class ids for the validation rows (same row range as Y_valid).
Y_valid_real = Y[len(full_train):len(full_train) + len(full_valid)]

In [69]:
import numpy as np
from sklearn.metrics import accuracy_score
# Plain (unweighted) accuracy on the validation split.
accuracy_score(Y_valid_real, Y_valid_predict)

0.8679245283018868

In [70]:
import collections
# How many validation samples were assigned to each predicted class id.
print(collections.Counter(Y_valid_predict))

Counter({3: 1806, 2: 446, 0: 118, 1: 68})


In [71]:
# True class distribution of the validation split, for comparison.
print(collections.Counter(Y_valid_real))

Counter({3.0: 1746, 2.0: 476, 0.0: 142, 1.0: 74})


In [72]:
# Weighted score from get_accuracy (class 3 earns 0.25, other classes up to 1).
get_accuracy(Y_valid_predict, Y_valid_real)

0.7899867080194949

In [73]:
from sklearn.metrics import confusion_matrix
# Confusion matrix, called with the true labels first and the predictions second.
confusion_matrix(Y_valid_real, Y_valid_predict)

array([[  68,   16,   26,   32],
       [  13,   27,   11,   23],
       [  18,   22,  353,   83],
       [  19,    3,   56, 1668]])

In [74]:
from sklearn.metrics import f1_score
# Per-class F1 (average=None returns one score per class).
f1_score(Y_valid_real, Y_valid_predict, average=None)

array([0.52307692, 0.38028169, 0.76572668, 0.93918919])

In [75]:
# Macro-averaged F1 over the classes.
f1_score(Y_valid_real, Y_valid_predict, average = "macro")

0.652068620883735

Bi-directional LSTM with simple attention mechanism

In [107]:
# ---- Hyper-parameters ------------------------------------------------------
embedding_dim = 300
LSTM_neurons = 50
dense_neuron = 16
lamda = 0.0
nb_filter = 100
filter_length = 4
batch_size = 50
epochs = 5
ntn_out = 16
ntn_in = nb_filter
state = False

# NOTE: this cell reuses `inpx` / `inpy` (the two Input layers) and `x` / `y`
# (their embedded sequences) created by the CNN model cell; run that cell first.

# One bidirectional LSTM shared by both texts; the forward and backward
# outputs are summed, so each time step yields LSTM_neurons features.
shared_lstm = Bidirectional(LSTM(LSTM_neurons, return_sequences=True), merge_mode='sum')
hx = shared_lstm(x)
hy = shared_lstm(y)

# Flatten the per-time-step outputs into one vector per text.
h1 = Flatten()(hx)
h2 = Flatten()(hy)

# Matching features: element-wise product and element-wise |h1 - h2|.
hx1 = Multiply()([h1, h2])
hx2 = Abs()([h1, h2])

h = Concatenate()([hx1, hx2])

wrap = Dense(dense_neuron, activation='relu', name='wrap')(h)
score = Dense(4, activation='softmax', name='score')(wrap)

# Bound to `lstm_model`, the name the following compile/fit cells use.
# Previously this was assigned to `model`, which left `lstm_model` undefined
# on a fresh kernel and overwrote the CNN model.
lstm_model = Model([inpx, inpy], score)

In [108]:
# Adadelta optimiser with categorical cross-entropy; accuracy is reported.
lstm_model.compile(
    optimizer="adadelta",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [122]:
# Two inputs per sample here: headline ids and article ids.
lstm_train_inputs = [X_train_headline, X_train_article]
lstm_valid_inputs = [X_valid_headline, X_valid_article]

lstm_model.fit(
    x=lstm_train_inputs,
    y=Y_train,
    batch_size=batch_size,
    epochs=1,
    shuffle=True,
    validation_data=(lstm_valid_inputs, Y_valid),
)

Train on 66677 samples, validate on 2438 samples
Epoch 1/1


<keras.callbacks.History at 0x7fabbb882400>