In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'2.2.0'

In [2]:
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import gensim

In [3]:
train = pd.read_csv('train_data.csv')
valid = pd.read_csv('validation_data.csv')
test = pd.read_csv('test_data.csv')
body = pd.read_csv('article_body_texts.csv')
train['class'] = train['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})
valid['class'] = valid['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})

body['articleBody'] = body['articleBody'].str.lower()
test['Headline'] = test['Headline'].str.lower()
valid['Headline'] = valid['Headline'].str.lower()
train['Headline'] = train['Headline'].str.lower()


body['articleBody'] = body['articleBody'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
train['Headline'] = train['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
valid['Headline'] = valid['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
test['Headline'] = test['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))

body["articleBody"] = body['articleBody'].str.replace('[^\w\s]',' ')
test["Headline"] = test['Headline'].str.replace('[^\w\s]',' ')
valid["Headline"] = valid['Headline'].str.replace('[^\w\s]',' ')
train["Headline"] = train['Headline'].str.replace('[^\w\s]',' ')


full_train = pd.merge(train, body, on='Body ID')
full_valid = pd.merge(valid, body, on = 'Body ID')
full_test = pd.merge(test, body, on='Body ID')

In [4]:
full_data = full_train.append([full_valid, full_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
%%time
from nltk.tokenize import word_tokenize
question_list = list(full_data['Headline'])# + list(full_data['articleBody'])
question_list = [' '.join(word_tokenize(q)[:17]) for q in question_list]

CPU times: user 10.7 s, sys: 39 ms, total: 10.7 s
Wall time: 11 s


In [6]:
%%time
body_list = list(full_data["articleBody"])
body_list = [' '.join(word_tokenize(q)[:685]) for q in body_list]

CPU times: user 2min 4s, sys: 231 ms, total: 2min 4s
Wall time: 2min 8s


In [7]:
whole_list = question_list + body_list

In [8]:
# Filters - removed '?' 
tokenizer = Tokenizer(num_words=30000, filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n') 
tokenizer.fit_on_texts(whole_list)

print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 27451


In [9]:
word_index = {k: v for k, v in tokenizer.word_index.items() if v < 30000}
idx_to_word = dict((v,k) for k,v in word_index.items())

In [10]:
X = tokenizer.texts_to_sequences(whole_list)
X = pad_sequences(X, padding='post', truncating='post')

In [11]:
X_headline = X[:len(X)//2]
X_article = X[len(X)//2:]

In [12]:
X_train_headline = X_headline[:len(full_train)]
X_valid_headline = X_headline[len(full_train):len(full_train) + len(full_valid)]
X_test_headline = X_headline[len(full_train) + len(full_valid):]

In [13]:
X_train_article = X_article[:len(full_train)]
X_valid_article = X_article[len(full_train):len(full_train) + len(full_valid)]
X_test_article = X_article[len(full_train) + len(full_valid):]

In [14]:
X_train_article.shape

(66677, 695)

In [16]:
import numpy as np
embeddings = {}
EMB_DIR = 'glove.6B.300d.txt.word2vec'
f = open(EMB_DIR)
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    embeddings[word] = vector
f.close()

In [18]:
import numpy as np
embeddings_matrix = np.random.uniform(-0.25, 0.25, size=(len(word_index)+1, 300))

for word, i in word_index.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector

In [19]:
del embeddings

In [20]:
extra_features = pd.read_csv("full_stances.csv")

In [21]:
valid_data = pd.read_csv('validation_data_2.csv')
validation_extra = valid_data[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()
training_extra = extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [22]:
test_extra_features = pd.read_csv("test_data_preprocessing.csv")
test_extra = test_extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [23]:
import pandas as pd
Y = pd.get_dummies(full_data['class']).values

Y_train = Y[:len(full_train)]
Y_valid = Y[len(full_train):len(full_train) + len(full_valid)]
Y_test = Y[len(full_train) + len(full_valid):]

In [24]:
from __future__ import print_function
from random import random
from numpy import array
from numpy import cumsum
from keras.optimizers import SGD, Adam
from keras.models import Sequential
from keras.layers import LSTM, Embedding
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional, Concatenate, concatenate, Dropout, Input, TimeDistributed, Flatten
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model
import numpy as np
import pickle

In [27]:
EMBEDDING_DIM = 300
emb_look_up = Embedding(input_dim=(27452),
                        output_dim=EMBEDDING_DIM,
                        weights = [embeddings_matrix], 
                        trainable=False, 
                        mask_zero=False,
                        name='q_embedding_lookup')

In [134]:
from keras.layers import Lambda
from keras import backend as K
from keras.layers.normalization import BatchNormalization
n_classes = 4
MAX_SENT_LEN_HEADLINE = 695
MAX_SENT_LEN_BODY = 695
emb_size = EMBEDDING_DIM
n_hidden = 128 
width = 3

#ConvH
convh_0  = Convolution1D(filters = n_hidden, kernel_size = width, activation='relu', padding='same')
convh_1  = Convolution1D(filters = n_hidden, kernel_size = width, activation='relu', padding='same')
convh_2  = Convolution1D(filters = 2*n_hidden, kernel_size = width, activation='relu', padding='same')
convh_3  = Convolution1D(filters = 2*n_hidden, kernel_size = width, activation='relu', padding='same')
convh_4  = Convolution1D(filters = 3*n_hidden, kernel_size = width, activation='relu', padding='same')

#ConvB
convb_0  = Convolution1D(filters = n_hidden, kernel_size = width, activation='relu', padding='same')
convb_1  = Convolution1D(filters = n_hidden, kernel_size = width, activation='relu', padding='same')
convb_2  = Convolution1D(filters = 2*n_hidden, kernel_size = width, activation='relu', padding='same')
convb_3  = Convolution1D(filters = 2*n_hidden, kernel_size = width, activation='relu', padding='same')
convb_4  = Convolution1D(filters = 3*n_hidden, kernel_size = width, activation='relu', padding='same')

#Dense
dense_0 = Dense(n_hidden*4, activation='relu')
dense_1 = Dense(n_hidden*4, activation='relu')
dense_2 = Dense(n_hidden*2, activation='relu')
dense_f = Dense(n_classes, activation='softmax', name='out')

#Dropout
dperc = 0.5

#DropH
droph_0 = Dropout(dperc)
droph_1 = Dropout(dperc)
droph_2 = Dropout(dperc)
droph_3 = Dropout(dperc)
droph_4 = Dropout(dperc)

#DropB
dropb_0 = Dropout(dperc)
dropb_1 = Dropout(dperc)
dropb_2 = Dropout(dperc)
dropb_3 = Dropout(dperc)
dropb_4 = Dropout(dperc)

#PoolingH
poolh_0 = MaxPooling1D(pool_size=3, padding='same')
poolh_1 = MaxPooling1D(pool_size=3, padding='same')
poolh_2 = MaxPooling1D(pool_size=3, padding='same')

#PoolingB
poolb_0 = MaxPooling1D(pool_size=3, padding='same')
poolb_1 = MaxPooling1D(pool_size=3, padding='same')
poolb_2 = MaxPooling1D(pool_size=3, padding='same')

#Input formats
input_title   = Input(shape=(MAX_SENT_LEN_HEADLINE, ), dtype='int32', name='input_title')
input_body   = Input(shape=(MAX_SENT_LEN_BODY, ), dtype='int32', name='input_body')

otherInp = Input(shape = (4,), dtype='float32', name='input_extra')

#Create Layers - Title
x = input_title
x = emb_look_up(x)
x = convh_0(x)
x = BatchNormalization()(x)
x = droph_0(x)
x = poolh_0(x)

x = convh_3(x)
x = BatchNormalization()(x)
x = droph_3(x)

x = convh_4(x)
x = BatchNormalization()(x)
x = droph_4(x)


y = input_body
y = emb_look_up(y)
y = convb_0(y)
y = BatchNormalization()(y)
y = dropb_0(y)
y = poolb_0(y)

y = convb_3(y)
y = BatchNormalization()(y)
y = dropb_3(y)

y = convb_4(y)
x = BatchNormalization()(x)
y = dropb_4(y)

x = GlobalMaxPooling1D()(x)
y = GlobalMaxPooling1D()(y)


z = concatenate([x, y])
z = dense_0(z)

z = dense_2(z)
z = BatchNormalization()(z)

z = concatenate([z, otherInp])
out = dense_f(z)

model = Model(inputs=[input_title, input_body, otherInp], outputs=[out])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_title (InputLayer)        (None, 695)          0                                            
__________________________________________________________________________________________________
q_embedding_lookup (Embedding)  (None, 695, 300)     8235600     input_title[0][0]                
                                                                 input_body[0][0]                 
__________________________________________________________________________________________________
conv1d_31 (Conv1D)              (None, 695, 128)     115328      q_embedding_lookup[0][0]         
__________________________________________________________________________________________________
input_body (InputLayer)         (None, 695)          0                                            
__________

after training around 20 epochs

In [26]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

history = model.fit(x = [X_train_headline, X_train_article, training_extra], 
          y = Y_train, 
          shuffle = True,
          batch_size=64,
          epochs=1, 
          validation_data=([X_valid_headline, X_valid_article, validation_extra], Y_valid))

Train on 66677 samples, validate on 2438 samples
Epoch 1/1


In [52]:
result_valid = model.predict([X_valid_headline, X_valid_article, validation_extra])

In [41]:
from collections import Counter
def get_accuracy(predict_result, y_test):
    count = 0
    for i in range(0, len(predict_result)):
        if predict_result[i] == y_test[i] and y_test[i] == 3:
            count = count + 0.25
        elif y_test[i] != 3 and predict_result[i] != 3 and y_test[i] == predict_result[i]:
            count = count + 1
        elif y_test[i] != 3 and predict_result[i] != 3 and y_test[i] != predict_result[i]:
            count += 0.25
    a1 = Counter(y_test)
    total_score = a1[3] * 0.25 + (a1[0] + a1[1] + a1[2]) * 1
    accuracy = count / total_score
    return accuracy

In [53]:
from collections import Counter
Y_valid_predict = []
for i in range(len(result_valid)):
    p = max(result_valid[i])
    Y_valid_predict.append(list(result_valid[i]).index(p))

In [54]:
Y = full_data['class'].values
Y_valid_real = Y[len(full_train):len(full_train) + len(full_valid)]

In [55]:
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y_valid_real, Y_valid_predict)

0.911402789171452

In [56]:
import collections
print(collections.Counter(Y_valid_predict))

Counter({3: 1660, 2: 586, 0: 121, 1: 71})


In [57]:
print(collections.Counter(Y_valid_real))

Counter({3.0: 1746, 2.0: 476, 0.0: 142, 1.0: 74})


In [58]:
get_accuracy(Y_valid_predict, Y_valid_real)

0.8965440850686752

In [59]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_valid_real, Y_valid_predict)

array([[  79,   18,   44,    1],
       [  28,   36,    6,    4],
       [  11,    5,  456,    4],
       [   3,   12,   80, 1651]])

In [60]:
from sklearn.metrics import f1_score
f1_score(Y_valid_real, Y_valid_predict, average=None)

array([0.60076046, 0.49655172, 0.85875706, 0.96946565])

In [61]:
f1_score(Y_valid_real, Y_valid_predict, average = "macro")

0.7313837228533875

In [62]:
#save the model
model_json = model.to_json()
with open("1D_CNN_8965.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("1D_CNN_8965.h5")