In [1]:
from keras import backend as K

Using TensorFlow backend.


In [2]:
import pandas as pd
import re
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
import gensim

In [3]:
train = pd.read_csv('train_data.csv')
valid = pd.read_csv('validation_data.csv')
test = pd.read_csv('test_data.csv')
body = pd.read_csv('article_body_texts.csv')
train['class'] = train['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})
valid['class'] = valid['Stance'].map({'agree':0, 'disagree':1, 'discuss':2, 'unrelated':3})

body['articleBody'] = body['articleBody'].str.lower()
test['Headline'] = test['Headline'].str.lower()
valid['Headline'] = valid['Headline'].str.lower()
train['Headline'] = train['Headline'].str.lower()


body['articleBody'] = body['articleBody'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
train['Headline'] = train['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
valid['Headline'] = valid['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))
test['Headline'] = test['Headline'].apply((lambda x: re.sub('[^a-zA-z0-9\\s]',' ',x)))

body["articleBody"] = body['articleBody'].str.replace('[^\w\s]',' ')
test["Headline"] = test['Headline'].str.replace('[^\w\s]',' ')
valid["Headline"] = valid['Headline'].str.replace('[^\w\s]',' ')
train["Headline"] = train['Headline'].str.replace('[^\w\s]',' ')


full_train = pd.merge(train, body, on='Body ID')
full_valid = pd.merge(valid, body, on = 'Body ID')
full_test = pd.merge(test, body, on='Body ID')

In [4]:
full_data = full_train.append([full_valid, full_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [5]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import numpy as np
word_seq = [text_to_word_sequence(sent) for sent in full_data['Headline']]
print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq], 90))

90th Percentile Sentence Length: 17.0


In [6]:
word_seq2 = [text_to_word_sequence(sent) for sent in full_data['articleBody']]
print('90th Percentile Sentence Length:', np.percentile([len(seq) for seq in word_seq2], 90))

90th Percentile Sentence Length: 685.0


In [7]:
%%time
from nltk.tokenize import word_tokenize
question_list = list(full_data['Headline'])
question_list = [' '.join(word_tokenize(q)[:17]) for q in question_list]

CPU times: user 7.24 s, sys: 220 ms, total: 7.46 s
Wall time: 9.42 s


In [8]:
%%time
body_list = list(full_data["articleBody"])
body_list = [' '.join(word_tokenize(q)[:685]) for q in body_list]

CPU times: user 1min 10s, sys: 55.6 ms, total: 1min 10s
Wall time: 1min 10s


In [9]:
whole_list = question_list + body_list

In [10]:
# Filters - removed '?' 
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n') 
tokenizer.fit_on_texts(whole_list)

print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 27451


In [11]:
word_index = {k: v for k, v in tokenizer.word_index.items() if v < 30000}
idx_to_word = dict((v, k) for k,v in word_index.items())

In [12]:
X = tokenizer.texts_to_sequences(whole_list)
X = pad_sequences(X, padding='post', truncating='post')

In [13]:
X_headline = X[:len(X)//2]
X_article = X[len(X)//2:]

In [14]:
X_train_headline = X_headline[:len(full_train)]
X_valid_headline = X_headline[len(full_train):len(full_train) + len(full_valid)]
X_test_headline = X_headline[len(full_train) + len(full_valid):]

In [15]:
X_train_article = X_article[:len(full_train)]
X_valid_article = X_article[len(full_train):len(full_train) + len(full_valid)]
X_test_article = X_article[len(full_train) + len(full_valid):]

In [16]:
embeddings = {}
EMB_DIR = 'glove.6B.300d.txt.word2vec'
f = open(EMB_DIR)
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    embeddings[word] = vector
f.close()

In [17]:
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(word_index)+1, 300))

for word, i in word_index.items():
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector

In [18]:
del embeddings

In [19]:
extra_features = pd.read_csv("full_stances.csv")

In [20]:
valid_data = pd.read_csv('validation_data_2.csv')
validation_extra = valid_data[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()
training_extra = extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [21]:
test_extra_features = pd.read_csv("test_data_preprocessing.csv")
test_extra = test_extra_features[['bigram_ratio', 'unigram_ratio', 'trigram_ratio', 'tfidf_similarity']].copy()

In [22]:
import pandas as pd
Y = pd.get_dummies(full_data['class']).values

Y_train = Y[:len(full_train)]
Y_valid = Y[len(full_train):len(full_train) + len(full_valid)]
Y_test = Y[len(full_train) + len(full_valid):]

In [30]:
from keras.models import Model
from keras.layers import Layer, Input, Dense, Concatenate, Conv2D, Reshape, MaxPooling1D, Flatten, BatchNormalization, Activation, Dropout, Embedding

bi_filter_size = 2
tri_filter_size = 3

num_filters = 40

MAX_VOCAB_SIZE = 30000
MAX_SENT_LEN_HEADLINE = 695
MAX_SENT_LEN_BODY = 695
EMBEDDING_DIM = 300
BATCH_SIZE = 32
N_EPOCHS = 1

credit to: kathihareesh

In [33]:
input_1 = Input(shape=(MAX_SENT_LEN_HEADLINE, ), name='q1_input')
otherInp = Input(shape = (4, ), name='extra_features')

emb_look_up = Embedding(input_dim=(27452),
                        output_dim=EMBEDDING_DIM,
                        weights = [embeddings_matrix], 
                        trainable=False, 
                        mask_zero=False,
                        name='q_embedding_lookup')

emb_1 = emb_look_up(input_1)

emb_1 = Reshape(target_shape=(1, MAX_SENT_LEN_HEADLINE, EMBEDDING_DIM), 
                name='q1_embedding_reshape')(emb_1) 

conv_1_bi =  Conv2D(filters=num_filters, 
                    kernel_size=(bi_filter_size, EMBEDDING_DIM), 
                    padding='valid', 
                    activation='relu', 
                    data_format='channels_first', 
                    name='q1_bigram_conv')(emb_1)

conv_1_tri =  Conv2D(filters=num_filters, 
                     kernel_size=(tri_filter_size, EMBEDDING_DIM), 
                     padding='valid', 
                     activation='relu', 
                     data_format='channels_first', 
                     name='q1_trigram_conv')(emb_1)

bi_out_timesteps = MAX_SENT_LEN_HEADLINE - bi_filter_size + 1 
tri_out_timesteps = MAX_SENT_LEN_HEADLINE - tri_filter_size + 1

conv_1_bi = Reshape(target_shape=(bi_out_timesteps, num_filters), 
                    name='q1_bigram_conv_reshape')(conv_1_bi) 
conv_1_tri = Reshape(target_shape=(tri_out_timesteps, num_filters),
                     name='q1_trigram_conv_reshape')(conv_1_tri)

max_pool_1_bi = MaxPooling1D(pool_size = bi_out_timesteps,
                             name='q1_bigram_maxpool')(conv_1_bi)
max_pool_1_tri = MaxPooling1D(pool_size = tri_out_timesteps,
                              name='q1_trigram_maxpool')(conv_1_tri)

merged_1 = Concatenate(name='q1_maxpool_concat')([max_pool_1_bi, max_pool_1_tri])

dropout_1 = Dropout(rate=0.2, 
                    name='q1_dropout')(merged_1)
flatten_1 = Flatten(name='q1_flatten')(dropout_1)

In [34]:
input_2 = Input(shape=(MAX_SENT_LEN_BODY, ), name='q2_input')

emb_2 = emb_look_up(input_2)
emb_2 = Reshape((1, MAX_SENT_LEN_BODY, EMBEDDING_DIM), 
                name='q2_embedding_reshape')(emb_2)

conv_2_bi =  Conv2D(filters=num_filters, 
                    kernel_size=(bi_filter_size, EMBEDDING_DIM), 
                    padding='valid', 
                    activation='relu', 
                    data_format='channels_first', 
                    name='q2_bigram_conv')(emb_2)

conv_2_tri =  Conv2D(filters=num_filters, 
                     kernel_size=(tri_filter_size, EMBEDDING_DIM), 
                     padding='valid', 
                     activation='relu', 
                     data_format='channels_first', 
                     name='q2_trigram_conv')(emb_2)

bi_out_timesteps_2 = MAX_SENT_LEN_BODY - bi_filter_size + 1
tri_out_timesteps_2 = MAX_SENT_LEN_BODY - tri_filter_size + 1 

conv_2_bi = Reshape((bi_out_timesteps_2, num_filters), 
                    name='q2_bigram_conv_reshape')(conv_2_bi) 
conv_2_tri = Reshape((tri_out_timesteps_2, num_filters), 
                     name='q2_trigram_conv_reshape')(conv_2_tri)

max_pool_2_bi = MaxPooling1D(pool_size = bi_out_timesteps_2, 
                             name='q2_bigram_maxpool')(conv_2_bi)
max_pool_2_tri = MaxPooling1D(pool_size = tri_out_timesteps_2, 
                              name='q2_trigram_maxpool')(conv_2_tri)

merged_2 = Concatenate(name='q2_maxpool_flatten')([max_pool_2_bi, max_pool_2_tri])
dropout_2 = Dropout(rate=0.2, 
                    name='q2_dropout')(merged_2)
flatten_2 = Flatten(name='q2_flatten')(dropout_2)

In [37]:
merged = Concatenate(name='q1_q2_concat')([flatten_1, flatten_2, otherInp])
dense_1 = Dense(units=10, 
                name='q1_q2_dense')(merged)
bn_1 = BatchNormalization(name='batchnorm')(dense_1)
relu_1 = Activation(activation='relu', 
                    name='relu_activation')(bn_1)
dense_1_dropout = Dropout(0.2, 
                          name='dense_dropout')(relu_1)

output_prob = Dense(units=4, 
                    activation='softmax', 
                    name='output_layer')(dense_1_dropout)
model = Model(inputs=[input_1, input_2, otherInp], outputs=output_prob, name='text_pair_cnn')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
q1_input (InputLayer)           (None, 695)          0                                            
__________________________________________________________________________________________________
q2_input (InputLayer)           (None, 695)          0                                            
__________________________________________________________________________________________________
q_embedding_lookup (Embedding)  (None, 695, 300)     8235600     q1_input[0][0]                   
                                                                 q2_input[0][0]                   
__________________________________________________________________________________________________
q1_embedding_reshape (Reshape)  (None, 1, 695, 300)  0           q_embedding_lookup[0][0]         
__________

In [38]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

model.fit(x = [X_train_headline, X_train_article, training_extra], 
          y = Y_train, 
          shuffle = True,
          batch_size=32, 
          epochs=5, 
          validation_data=([X_valid_headline, X_valid_article, validation_extra], Y_valid))

Train on 66677 samples, validate on 2438 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2a60b9e3c8>

In [39]:
from collections import Counter
def get_accuracy_2(predict_result, y_test):
    count = 0
    for i in range(0, len(predict_result)):
        if predict_result[i] == y_test[i] and y_test[i] == 3:
            count += 0.25
        elif y_test[i] != 3:
            if predict_result[i] != 3:
                count += 0.25
                if predict_result[i] == y_test[i]:
                    count += 0.75
    a1 = Counter(y_test)
    total_score = a1[3] * 0.25 + (a1[0] + a1[1] + a1[2]) * 1
    accuracy = count / total_score
    return accuracy

In [76]:
result_valid = model.predict([X_valid_headline, X_valid_article, validation_extra])

In [77]:
Y_valid_predict = []
for i in range(len(result_valid)):
    p = max(result_valid[i])
    Y_valid_predict.append(list(result_valid[i]).index(p))

In [78]:
Y = full_data['class'].values
Y_valid_real = Y[len(full_train):len(full_train) + len(full_valid)]

In [63]:
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(Y_valid_real, Y_valid_predict)

0.9031993437243643

In [64]:
import collections
print(collections.Counter(Y_valid_predict))

Counter({3: 1753, 2: 489, 0: 194, 1: 2})


In [65]:
print(collections.Counter(valid['class']))

Counter({3: 1746, 2: 476, 0: 142, 1: 74})


In [66]:
#weigted score after training around 5 - 10 epochs
get_accuracy_2(Y_valid_predict, Y_valid_real)

0.8486929552503323

In [67]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_valid_real, Y_valid_predict)

array([[  88,    0,   47,    7],
       [  51,    1,   14,    8],
       [  53,    0,  399,   24],
       [   2,    1,   29, 1714]])

In [68]:
from sklearn.metrics import f1_score
f1_score(Y_valid_real, Y_valid_predict, average=None) 

array([0.52380952, 0.02631579, 0.82694301, 0.97970849])

In [69]:
f1_score(Y_valid_real, Y_valid_predict, average='macro') 

0.5891942016510059