## Predict Label with extracted evidence texts
This notebook builds the MLP model for the RTM step, following the approach described in the FNC competition paper.  

### Data preprocess

#### Load data as pandas DF

In [None]:
import json
import numpy as np
import pandas as pd

file_path = "./JSONFiles/"

# Load the training set. The JSON file is a dictionary whose values are records
# read below through their "claim", "evidence_texts" and "label" fields.
with open(file=file_path + "train_with_text.json", mode='r', encoding="utf-8") as f:
    train = json.load(f)

train_list = []
for record in train.values():
    # Join the evidence sentences with a space: plain concatenation would glue
    # the last word of one sentence to the first word of the next one.
    # A missing/None "evidence_texts" entry is treated as "no evidence".
    # NOTE(review): assumes "evidence_texts" is a list of strings - confirm.
    evi_texts = record.get("evidence_texts") or []
    text = " ".join(evi_texts)

    # One-hot encode the label. Any label that is neither SUPPORTS nor REFUTES
    # falls into the NOINFO class.
    label = record.get("label")
    is_support = label == "SUPPORTS"
    is_refute = label == "REFUTES"
    train_list.append({
        "claim": record.get("claim"),
        "evi_text": text,
        "SUP": int(is_support),
        "NOINFO": int(not (is_support or is_refute)),
        "REF": int(is_refute),
    })

train_df = pd.DataFrame(train_list)

train_df[0: 10]

#### Tokenization and Lemmatization

In [None]:
import nltk
# Fetch the NLTK resources this notebook relies on.
# - 'stopwords' is only needed if stop-word removal is switched on in pre_process.
# - 'wordnet' backs WordNetLemmatizer; without it the first lemmatize() call
#   raises a LookupError on a fresh environment.
# - 'omw-1.4' is additionally required by some NLTK releases when WordNet is
#   loaded (NOTE(review): harmless if not needed - confirm for your version).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Tokens are maximal runs of word characters, so punctuation is dropped.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    """Return the lemma of ``word``, trying the verb reading before the noun one.

    The word is first lemmatized as a verb; if that changes it, the verb lemma
    is returned. Otherwise the result of lemmatizing it as a noun is returned.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def pre_process(comment) -> str:
    """Normalise a piece of text into a space-separated string of lemmas.

    Steps: lower-case the text, split it with the module-level ``tokenizer``,
    lemmatize every token with ``lemmatize`` and join the results with single
    spaces. Stop words are kept (no stop-word filtering is applied).

    :param comment: raw text to normalise
    :return: the processed text
    """
    tokens = tokenizer.tokenize(comment.lower())
    return " ".join(lemmatize(token) for token in tokens)

# Run the text normalisation over claims and evidence texts in one pass:
# the two columns are stacked (claims first), processed, and then split back
# at the boundary given by the number of training records.
n_records = len(train)
corpus = pd.concat([train_df['claim'], train_df['evi_text']])
processed_corpus = corpus.apply(pre_process)

# Each half keeps its original row index, so the assignment below lines up
# row by row with train_df.
train_df['claim'] = processed_corpus.iloc[:n_records]
train_df['evi_text'] = processed_corpus.iloc[n_records:]

train_df.head(10)

### Feature extraction

#### TF Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix, hstack

# Vocabulary cap applied to each of the two bag-of-words vectorizers.
max_features = 5000

# Claims and evidence texts get their own term-count vectorizer.
claim_tf_vectorizer = CountVectorizer(max_features=max_features)
evi_text_tf_vectorizer = CountVectorizer(max_features=max_features)

# Fit each vectorizer on its column and obtain the sparse term-count matrices.
claim_tf = claim_tf_vectorizer.fit_transform(train_df['claim'])
evi_text_tf = evi_text_tf_vectorizer.fit_transform(train_df['evi_text'])

# Side-by-side concatenation: claim counts first, evidence counts second.
tf_features = hstack((claim_tf, evi_text_tf))

tf_features

#### TF-IDF cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

max_features = 5000

# A single TF-IDF vectorizer is fitted on claims AND evidence texts so that both
# are projected into the same vocabulary space; a cosine between vectors coming
# from two independently fitted vocabularies would compare unrelated columns.
# (Previously the CountVectorizers of the TF cell were re-fitted here by
# mistake, so no TF-IDF weighting was applied at all.)
tfidf_vectorizer = TfidfVectorizer(max_features=max_features, norm='l2')
tfidf_vectorizer.fit(pd.concat([train_df['claim'], train_df['evi_text']]))

# Both historical names are kept and point at the shared vectorizer.
claim_tfidf_vectorizer = tfidf_vectorizer
evi_tfidf_vectorizer = tfidf_vectorizer

claim_tfidf = tfidf_vectorizer.transform(train_df['claim'])
evi_tfidf = tfidf_vectorizer.transform(train_df['evi_text'])

# Rows are L2-normalised (norm='l2'), so the cosine similarity of row i of the
# two matrices is simply their dot product. Computing it as an element-wise
# sparse product avoids densifying one row at a time in a Python loop.
# An all-zero row yields a similarity of 0.
cosines = np.asarray(
    claim_tfidf.multiply(evi_tfidf).sum(axis=1), dtype=float
).reshape(-1, 1)



#### Concat features together

In [None]:
# Feature matrix: claim TF | evidence TF | TF-IDF cosine similarity, densified
# for the dense Keras layers.
x_train = hstack([tf_features, cosines]).toarray()

# Select the one-hot label columns by name. Picking them by position
# (columns[0:3]) depends on how pandas orders the columns and returns the
# text columns "claim"/"evi_text" when the insertion order is preserved.
# Output unit i of the model corresponds to label_columns[i].
label_columns = ["SUP", "NOINFO", "REF"]
y_train = train_df[label_columns].values

## Build and Train model
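Build an MLP that takes the 10001-dimensional feature vector as input (5000 claim TF features + 5000 evidence TF features + 1 cosine similarity), with 1 hidden layer of 64 ReLU neurons and a softmax output layer over the 3 labels. 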

In [None]:
# import keras
# from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# from keras.layers import Bidirectional, GlobalMaxPool1D
import keras
from keras.models import Sequential
from keras.layers import Dense
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Two-layer perceptron: one ReLU hidden layer followed by a 3-way softmax.
# The input width is taken from the feature matrix built above.
model = Sequential([
    Dense(units=64, activation='relu', input_dim=x_train.shape[1]),
    Dense(units=3, activation='softmax'),
])

# SGD with Nesterov momentum, optimising the categorical cross-entropy.
optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=optimizer)

model.summary()
# Render the architecture graph inline as SVG.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Train for a single epoch with mini-batches of 128 samples, keeping 10% of
# the training data aside for validation.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=1,
    validation_split=0.1,
)

## Apply model

### Apply model on dev data

### Apply model on test data