# Stance Detection

## Data Loading

1. Configure Data Path
2. Load Embeddings
3. Load Train and Test Dataset

### Configure Data Path

In [1]:
import os

# The data directory is a sibling of the notebook's working directory:
# <parent of cwd>/data
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, "data")

### Load Embeddings

In [2]:
from zipfile import ZipFile

# Unpack the GloVe archive into the current working directory, which is where
# the embedding-matrix cell later opens 'glove.6B.100d.txt'.
# Skip the extraction when that file is already present so that re-running the
# notebook does not unpack the whole archive again.
if not os.path.exists('glove.6B.100d.txt'):
    with ZipFile(f'{data_path}/external/glove.6B.zip', 'r') as z:
        z.extractall()

In [3]:
# Download NLTK's 'punkt' package (a no-op when it is already up to date).
# Presumably required by the `tokenize.sent_tokenize` calls used further down
# to split article bodies and headlines into sentences.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/apollo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Dataset

In [4]:
import pandas as pd

# Read the two training CSVs: the article bodies and the headline/stance rows.
stance_train_dir = f'{data_path}/stance/train'
train_bodies = pd.read_csv(f'{stance_train_dir}/train_bodies.csv')
train_stances = pd.read_csv(f'{stance_train_dir}/train_stances.csv')

In [5]:
# Attach every headline/stance row to the text of the article body it refers to,
# joining the two frames on their shared 'Body ID' column.
bodies = train_bodies[['Body ID', 'articleBody']]
stances = train_stances[['Body ID', 'Headline', 'Stance']]
dataset = bodies.merge(stances, on='Body ID', how='inner')

In [6]:
# Preview the first rows of the merged body/headline/stance frame.
dataset.head()

Unnamed: 0,Body ID,articleBody,Headline,Stance
0,0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,0,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,0,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,0,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated
4,0,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated


## Data Preprocessing

1. Setting Hyperparameters
2. Tokenization
3. Encoding
4. Convert Labels to One-Hot Vectors

### Setting Hyperparameters

- `MAX_SENTS` = Maximum number of sentences to consider in an article.
- `MAX_SENT_LENGTH` = Maximum number of words to consider in a sentence.
- `MAX_NB_WORDS` = Maximum number of words in the total vocabulary.
- `MAX_SENTS_HEADING` = Maximum number of sentences to consider in the heading of an article.

In [7]:
# Preprocessing hyperparameters shared by the tokenizer, the encoders and the split.
MAX_NB_WORDS = 20000      # passed to the tokenizer as num_words; word indices at or above it are skipped when encoding
MAX_SENTS = 20            # sentences kept per article body
MAX_SENTS_HEADING = 1     # sentences kept per headline
MAX_SENT_LENGTH = 20      # word slots per encoded sentence
VALIDATION_SPLIT = 0.25   # fraction of the samples held out for validation

### Tokenization

In [8]:
from keras.preprocessing.text import Tokenizer 

In [9]:
# Tokenizer limited to MAX_NB_WORDS with a custom set of characters to strip.
# The backslash is written as '\\' because '\]' is not a valid escape sequence
# in a normal string literal (Python warns about it); the resulting filter
# string is unchanged: it still contains a literal backslash followed by ']'.
t = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}\n"~')

In [10]:
# Build the vocabulary from the article bodies first, then from the headlines.
for column in ('articleBody', 'Headline'):
    t.fit_on_texts(dataset[column])

In [11]:
# One more than the number of distinct words seen by the tokenizer
# (presumably so that index 0 stays free for padding — confirm).
vocab_size = 1 + len(t.word_index)
print(vocab_size)

27879


In [12]:
from nltk import tokenize

# Collect the raw body text of every row, then split each body into sentences
# with NLTK. Both lists are in the same order as the rows of `dataset`.
body_column = dataset['articleBody']
texts = [body_column[idx] for idx in range(body_column.shape[0])]
articles = [tokenize.sent_tokenize(text) for text in texts]

In [13]:
# Spot-check: raw text of the first body next to its sentence split.
texts[0], articles[0]

('A small meteorite crashed into a wooded area in Nicaragua\'s capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city\'s airport, the Associated Press reports. \n\nGovernment spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. \nMurillo said Nicaragua will ask international experts to help local scientists in understanding what happened.\n\nThe crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated or was buried.\n\nHumb

### Feature Encoding

In [14]:
from keras.preprocessing.text import text_to_word_sequence
import numpy as np

# Encode every article body as a (MAX_SENTS, MAX_SENT_LENGTH) grid of word indices.
# Row j holds the first MAX_SENT_LENGTH usable words of sentence j; unused slots
# stay 0. Sentences beyond MAX_SENTS are dropped.
#
# The word loop now lives inside the sentence limit. Previously it was dedented
# out of the `if j < MAX_SENTS` guard, so for later sentences it re-ran on stale
# tokens and only avoided writing out of bounds because a bare `except` swallowed
# the resulting IndexError. The produced array is the same.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
for i, sentences in enumerate(articles):
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        k = 0  # next free word slot in this sentence row
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break  # row is full; remaining words are ignored
            # Words unknown to the tokenizer are skipped (explicit lookup
            # instead of catching KeyError with a bare except).
            word_id = t.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1

In [15]:
# Inspect the encoded grid of sample 1: one row per sentence, unused slots left at 0.
data[1, :, :]

array([[    3,   481,   427,  7211,    81,     3,  3733,   331,     5,
         3891,   350,     4,  1431,  2958,     1,    89,    12,   464,
            0,     0],
       [  758,    95,  1047,     3,  2679,  1752,     7,   189,     3,
         1217,  1075,  2030,   700,   159,     1,  3032,   448,     1,
          555,   235],
       [   89,  1067,  4115,  2349,    12,     3,  1092,  3306,    19,
            1,    89,     2,  1793,     1,   521,  2009,    15,     9,
            3,  3111],
       [  181,  3640,   972,   200,  2556,    44,  6775,  1722,  1252,
            5, 13319, 17939,     1,   778,    31,   740,  3990,    67,
           85,     0],
       [ 2349,    12,  1557,    38,  1094,   351,   775,     2,   367,
          260,  1770,     5,  4450,    70,   494,     0,     0,     0,
            0,     0],
       [    1,   700,   189,    19,     1,   427,    32,     3,  7417,
            4,  2159,  1252,     6,     3,  5270,     4,  1217,  1252,
           12,  3363],
       [  

### Header Encoding

In [16]:
# Collect the raw headline of every row and split it into sentences with NLTK.
headline_column = dataset['Headline']
text_heading = [headline_column[idx] for idx in range(headline_column.shape[0])]
articles_heading = [tokenize.sent_tokenize(text) for text in text_heading]

# Encode every headline as a (MAX_SENTS_HEADING, MAX_SENT_LENGTH) grid of word
# indices, zero-padded. Sized from the headlines themselves rather than from the
# body list built in another cell.
data_heading = np.zeros((len(text_heading), MAX_SENTS_HEADING, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(articles_heading):
    # Sentences beyond MAX_SENTS_HEADING are dropped.
    for j, sent in enumerate(sentences[:MAX_SENTS_HEADING]):
        k = 0  # next free word slot in this sentence row
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                break  # row is full; remaining words are ignored
            # Words unknown to the tokenizer are skipped (explicit lookup
            # instead of catching KeyError with a bare except).
            word_id = t.word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data_heading[i, j, k] = word_id
                k += 1

### Convert labels to one-hot vectors

In [17]:
# One-hot encode the stance labels: one column per distinct stance value.
# The dtype is pinned to float32 so the label array does not depend on the
# pandas version (get_dummies defaults to uint8 on older releases and bool on
# newer ones). `sparse=True` is dropped because the frame is converted to a
# dense array straight away.
one_hot = pd.get_dummies(dataset['Stance'], dtype='float32')
one_hot_labels = np.asarray(one_hot)

labels = one_hot_labels

In [18]:
# Report the shapes of the encoded bodies and of the one-hot labels.
print(f'Shape of data tensor: {data.shape}')
print(f'Shape of label tensor: {labels.shape}')

Shape of data tensor: (49972, 20, 20)
Shape of label tensor: (49972, 4)


### Shuffling the data

In [19]:
# Shuffle bodies, headlines and labels with one shared permutation so the three
# arrays stay aligned row by row.
# A fixed seed makes the permutation (and therefore the train/validation split
# and the reported metrics) reproducible across kernel restarts.
SHUFFLE_SEED = 42
assert len(data) == len(data_heading) == len(labels), "inputs and labels are misaligned"

indices = np.random.default_rng(SHUFFLE_SEED).permutation(data.shape[0])

data = data[indices]
data_heading = data_heading[indices]
labels = labels[indices]

### Split data into train and validation set (75:25)

In [20]:
# Hold out the last VALIDATION_SPLIT fraction of the (already shuffled) samples
# for validation; everything before it is used for training.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice with an explicit split index rather than negative offsets: with zero
# validation samples, `[:-0]` would be empty and `[-0:]` would be everything.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
x_heading_train = data_heading[:nb_train_samples]
y_train = labels[:nb_train_samples]

x_val = data[nb_train_samples:]
x_heading_val = data_heading[nb_train_samples:]
y_val = labels[nb_train_samples:]

In [21]:
# Show the shapes of the training and validation arrays, inputs before labels.
for split in (x_train, y_train, x_val, y_val):
    print(split.shape)

(37479, 20, 20)
(37479, 4)
(12493, 20, 20)
(12493, 4)


### Create embedding matrix with the glove embeddings

In [22]:
# Load the whole embedding file into memory as {word: vector}.
# Each line is a word followed by its coefficients, separated by whitespace.
# The file is opened in a `with` block so the handle is closed even if a line
# fails to parse.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# Create a weight matrix for the words in the training docs: row i holds the
# vector of the word with tokenizer index i. Words without a pretrained vector
# keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


## ML Model

1. Imports
2. Model Design
3. Model Fitting on Data

### Import layers from Keras to build the model

In [23]:
from keras.models import Sequential
from keras.layers import Dense,LSTM, TimeDistributed, Activation
from keras.layers import Flatten, Permute, merge, Input
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, Dense, multiply, concatenate, Dropout

import tensorflow as tf
# Turn off TensorFlow eager execution before any layer is built. With it off,
# the tensors printed in the model-design cell show up as symbolic graph
# tensors such as `Tensor("input_1:0", ...)`.
tf.compat.v1.disable_eager_execution()

### Model Design

In [24]:
# Two-input model: an article body (MAX_SENTS x MAX_SENT_LENGTH word indices)
# and a headline (MAX_SENTS_HEADING x MAX_SENT_LENGTH word indices) are encoded
# separately, concatenated, and classified into 4 classes with a softmax.

# NOTE(review): these repeat the values set in the hyperparameter cell; keeping
# two copies risks them drifting apart.
MAX_SENT_LENGTH = 20
MAX_SENTS = 20
hidden_size = 100

# NOTE(review): GRU is imported but not used anywhere in this cell.
from keras.layers import GRU, Bidirectional

# --- Sentence encoder: one sentence of word indices -> one flat vector -------
sentence_input = Input(shape=(MAX_SENT_LENGTH,),dtype='int32')
print(sentence_input)
# NOTE(review): unlike the headline embedding below, this Embedding is created
# without `weights=[embedding_matrix]`, so it does not start from the GloVe
# vectors — confirm that this is intentional.
embedded_sequences = Embedding(output_dim = hidden_size, input_dim = vocab_size, input_length = (MAX_SENT_LENGTH,),)(sentence_input)

l_LSTM = Bidirectional(LSTM(100,return_sequences = True))(embedded_sequences)
l_dense = TimeDistributed(Dense(100))(l_LSTM)
l_dense = Flatten()(l_dense)
sentEncoder = Model(sentence_input,l_dense)

# --- Body encoder: apply the sentence encoder to every sentence of the body --
body_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH,),dtype = 'int32')
print(body_input)
body_encoder = TimeDistributed(sentEncoder)(body_input)
# The recorded output shows this tensor with shape (None, 20, 2000).
print(body_encoder)
l_LSTM_sent = Bidirectional(LSTM(100,return_sequences=True))(body_encoder)
l_dense_sent = TimeDistributed(Dense(100))(l_LSTM_sent)
l_dense_sent = Flatten()(l_dense_sent)

# --- Headline branch: embedding initialised from embedding_matrix, then Dense -
heading_input = Input(shape = (MAX_SENTS_HEADING, MAX_SENT_LENGTH),dtype = 'int32')
heading_embedded_sequences = Embedding(output_dim=hidden_size, input_dim=vocab_size, \
                                       input_length = (MAX_SENTS_HEADING,MAX_SENT_LENGTH,), \
                                      weights = [embedding_matrix])(heading_input)
h_dense = Dense(100,activation='relu')(heading_embedded_sequences)
h_flatten = Flatten()(h_dense)
# Join the flattened body and headline representations into a single vector.
article_output = concatenate([l_dense_sent,h_flatten],name = 'concatenate_heading')

# --- Classifier head: hidden Dense layer followed by a 4-way softmax ---------
news_vestor = Dense(100,activation = 'relu')(article_output)
preds = Dense(4,activation = 'softmax')(news_vestor)
model = Model([body_input,heading_input],[preds])

Tensor("input_1:0", shape=(None, 20), dtype=int32)
Tensor("input_2:0", shape=(None, 20, 20), dtype=int32)
Tensor("time_distributed_1/Reshape_1:0", shape=(None, 20, 2000), dtype=float32)


### Fitting the model

In [25]:
# Compile with categorical cross-entropy and Adam, tracking accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print("model fitting = Hierarchical LSTM network")

# Both splits feed the model a [body, headline] pair of input arrays.
train_inputs = [x_train, x_heading_train]
val_inputs = [x_val, x_heading_val]
model.fit(
    train_inputs,
    [y_train],
    validation_data=(val_inputs, [y_val]),
    epochs=10,
    batch_size=62,
)

# Save the trained model under <parent of cwd>/src/models.
model_dir = os.path.join(os.path.dirname(os.getcwd()), "src/models")
model.save(model_dir)

model fitting = Hierarchical LSTM network
Train on 37479 samples, validate on 12493 samples
Epoch 1/10



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: /home/apollo/Desktop/DJSCE/Projects/checkmate_fact_checker/src/models/assets
