## Predict Label with extracted evidence texts
This notebook builds the MLP model for the RTM step, following the FNC competition paper.  

### Data preprocess

#### Load data as pandas DF

In [1]:
import json
import numpy as np
import pandas as pd

# Directory that holds the claim/evidence JSON files read by this notebook.
data_dir = "./JSONFiles/"

train_file_path = data_dir + "train_with_text.json"

# Selects which file is loaded into `test`: test_with_text.json when True,
# dev_with_text.json otherwise.
use_test_file = False
if use_test_file:
    test_file_path = data_dir + 'test_with_text.json'
else:
    test_file_path = data_dir + 'dev_with_text.json'

# The evidence text contains non-ASCII characters, so the encoding is stated
# explicitly instead of relying on the platform default.
with open(train_file_path, mode='r', encoding='utf-8') as f:
    train = json.load(f)
with open(test_file_path, mode='r', encoding='utf-8') as f:
    test = json.load(f)

def load_training_data(dataset: dict) -> list:
    """Flatten the labelled JSON records into a list of row dictionaries.

    Args:
        dataset: mapping of record id -> record. Each record is read through
            its "claim", "evidence_texts" and "label" keys.

    Returns:
        One dict per record with the keys "claim", "evi_text" (all evidence
        texts concatenated in order) and the one-hot label flags "SUP",
        "NOINFO" and "REF". Any label other than "SUPPORTS" or "REFUTES"
        (including a missing label) is encoded as NOINFO.
    """
    dataset_list = []
    for record in dataset.values():
        label = record.get("label")
        # `or []` keeps records with a missing/None evidence list from crashing;
        # they simply get an empty evidence text.
        evi_texts = record.get("evidence_texts") or []
        dataset_list.append({
            "claim": record.get("claim"),
            "evi_text": "".join(evi_texts),
            "SUP": int(label == "SUPPORTS"),
            "NOINFO": int(label not in ("SUPPORTS", "REFUTES")),
            "REF": int(label == "REFUTES")
        })
    return dataset_list

def load_test_data(dataset: dict) -> list:
    """Flatten the unlabelled JSON records into a list of row dictionaries.

    Args:
        dataset: mapping of record id -> record. Each record is read through
            its "claim", "evidence" and "evidence_texts" keys.

    Returns:
        One dict per record with the keys "key" (the record id), "claim",
        "evidence" (passed through unchanged) and "evi_text" (all evidence
        texts concatenated in order).
    """
    dataset_list = []
    for key, record in dataset.items():
        # `or []` keeps records with a missing/None evidence list from crashing;
        # they simply get an empty evidence text.
        evi_texts = record.get("evidence_texts") or []
        dataset_list.append({
            "key": key,
            "claim": record.get("claim"),
            "evidence": record.get("evidence"),
            "evi_text": "".join(evi_texts)
        })
    return dataset_list

# Turn the flattened record lists into DataFrames (one row per claim).
train_records = load_training_data(train)
test_records = load_test_data(test)
train_df = pd.DataFrame(train_records)
test_df = pd.DataFrame(test_records)

# Preview the first ten training rows.
train_df.head(10)

Unnamed: 0,NOINFO,REF,SUP,claim,evi_text
0,0,1,0,Ireland does not have relatively low-lying mou...,Ireland 10 The island 's geography comprises r...
1,0,0,1,The drama Dark Matter stars Taylor Schilling.,Taylor_Schilling 2 She made her film debut in ...
2,0,0,1,"In 1932, Prussia was taken over.","Prussia 30 In the Weimar Republic , the state ..."
3,0,0,1,IZombie premiered in 2015.,IZombie_-LRB-TV_series-RRB- 2 The series premi...
4,0,0,1,Ronald Reagan had a nationality.,Ronald_Reagan 0 Ronald Wilson Reagan -LRB- -LS...
5,0,0,1,Samoa Joe wrestles professionally.,Samoa_Joe 0 Nuufolau Joel `` Joe '' Seanoa -LR...
6,0,0,1,University of Oxford is in the universe.,University_of_Oxford 0 The University of Oxfor...
7,1,0,0,The Renaissance began online.,Starwood_-LRB-nightclub-RRB- 1 Many punk bands...
8,0,0,1,Portia de Rossi appeared on Scandal.,Portia_de_Rossi 1 She appeared as a regular ca...
9,0,1,0,The Berlin Wall was only standing for 10 years.,Berlin_Wall 0 The Berlin Wall -LRB- Berliner M...


In [2]:
# Preview the first ten rows of the evaluation frame.
test_df.head(10)

Unnamed: 0,claim,evi_text,evidence,key
0,Ripon College's student number totaled in at a...,Ripon_College_-LRB-Wisconsin-RRB- 1 As of 2015...,"[[Ripon_College_-LRB-Wisconsin-RRB-, 1]]",100038
1,"Kesha was baptized on March 1st, 1987.",Kesha 0 Kesha Rose Sebert -LRB- -LSB- ˈkɛʃə_ro...,"[[Kesha, 0]]",100083
2,Birthday Song (2 Chainz song) was banned by So...,Birthday_Song_-LRB-2_Chainz_song-RRB- 1 The so...,"[[Birthday_Song_-LRB-2_Chainz_song-RRB-, 1]]",100169
3,The University of Illinois at Chicago is a col...,University_of_Illinois_at_Chicago 0 The Univer...,"[[University_of_Illinois_at_Chicago, 0]]",100234
4,French Indochina was officially known as the I...,"Harukawa 5 , Japanese actress\n","[[Harukawa, 5]]",100359
5,Damon Albarn has refused to ever work with Bri...,Damon_Albarn 17 His debut solo studio album Ev...,"[[Damon_Albarn, 17]]",100366
6,Lost (TV series) is a series of plays.,Lost_-LRB-TV_series-RRB- 0 Lost is an American...,"[[Lost_-LRB-TV_series-RRB-, 0]]",100429
7,Edison Machine Works was barely set up to prod...,List_of_professional_Magic-COLON-_The_Gatherin...,[[List_of_professional_Magic-COLON-_The_Gather...,100457
8,The human brain is set apart from mammalian br...,"Barn_River 1 Only 3 km long , it acts as the p...","[[Barn_River, 1]]",100461
9,"There are rumors that Augustus' wife, Livia, p...",The_Malpractice_-LRB-band-RRB- 1 His debutalbu...,"[[The_Malpractice_-LRB-band-RRB-, 1]]",100481


#### Tokenization and Lemmatization

In [3]:
import nltk
# Stop-word list: only needed if the stop-word filter in pre_process is re-enabled.
nltk.download('stopwords')
# WordNetLemmatizer looks words up in the WordNet corpus, so it must be
# available before the first lemmatize() call.
nltk.download('wordnet')

# r'\w+' keeps runs of letters, digits and underscores, so punctuation and
# hyphens act as separators (e.g. "-LRB-" becomes the token "LRB").
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(word):
    """Return the lemma of `word`, trying the verb reading before the noun one.

    The verb lemma is returned whenever it differs from the input; otherwise
    the word is lemmatized as a noun and that result is returned.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def pre_process(comment) -> str:
    """Normalise one text: lower-case, tokenize, lemmatize, re-join with spaces.

    Stop words are not removed; the original filter is kept below, disabled.
    """
    tokens = tokenizer.tokenize(comment.lower())
    lemmas = (lemmatize(token) for token in tokens)
    # remove stop words
#     stop_words = nltk.corpus.stopwords.words('english')
#     words = [w for w in words if not w in stop_words]
    return " ".join(lemmas)

def process_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    """Apply `pre_process` to the 'claim' and 'evi_text' columns.

    Each column is transformed directly, so the result does not depend on
    index alignment and also works when the frame's index is not unique.

    Args:
        dataset: frame with string columns 'claim' and 'evi_text'.

    Returns:
        The same DataFrame object; both columns are overwritten in place
        with their pre-processed text.
    """
    for column in ('claim', 'evi_text'):
        dataset[column] = dataset[column].apply(pre_process)
    return dataset

# Normalise the text columns of both frames, training frame first.
train_df, test_df = (process_dataset(frame) for frame in (train_df, test_df))

# Preview the pre-processed training rows.
train_df.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wenbin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,NOINFO,REF,SUP,claim,evi_text
0,0,1,0,ireland do not have relatively low lie mountain,ireland 10 the island s geography comprise rel...
1,0,0,1,the drama dark matter star taylor schilling,taylor_schilling 2 she make her film debut in ...
2,0,0,1,in 1932 prussia be take over,prussia 30 in the weimar republic the state of...
3,0,0,1,izombie premier in 2015,izombie_ lrb tv_series rrb 2 the series premie...
4,0,0,1,ronald reagan have a nationality,ronald_reagan 0 ronald wilson reagan lrb lsb ˈ...
5,0,0,1,samoa joe wrestle professionally,samoa_joe 0 nuufolau joel joe seanoa lrb bear ...
6,0,0,1,university of oxford be in the universe,university_of_oxford 0 the university of oxfor...
7,1,0,0,the renaissance begin online,starwood_ lrb nightclub rrb 1 many punk band a...
8,0,0,1,portia de rossi appear on scandal,portia_de_rossi 1 she appear a a regular cast ...
9,0,1,0,the berlin wall be only stand for 10 year,berlin_wall 0 the berlin wall lrb berliner mau...


### Feature extraction

#### TF Features

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix, hstack

# Size of the vocabulary kept by the term-frequency vectorizer.
max_features = 5000

# Claims and evidence texts share a single vocabulary, so both are stacked
# into one corpus per split.
train_corpus = pd.concat([train_df['claim'], train_df['evi_text']])
test_corpus = pd.concat([test_df['claim'], test_df['evi_text']])

# The TF vocabulary is learned from the training split only.
tf_vectorizer = CountVectorizer(max_features=max_features)
tf_vectorizer.fit(train_corpus)
train_claim_tf_features = tf_vectorizer.transform(train_df['claim'])
train_evi_tf_features = tf_vectorizer.transform(train_df['evi_text'])
test_claim_tf_features = tf_vectorizer.transform(test_df['claim'])
# Evidence features must come from the evidence text; the claim column was
# used here before, which made the two halves of every test row identical.
test_evi_tf_features = tf_vectorizer.transform(test_df['evi_text'])

# One row per sample: [claim TF | evidence TF].
train_tf_features = hstack([train_claim_tf_features, train_evi_tf_features])
test_tf_features = hstack([test_claim_tf_features, test_evi_tf_features])
# claim_tf_vectorizer = CountVectorizer(max_features=max_features)
# claim_tf = claim_tf_vectorizer.fit_transform(train_df['claim'])
# evi_text_tf_vectorizer = CountVectorizer(max_features=max_features)
# evi_text_tf = evi_text_tf_vectorizer.fit_transform(train_df['evi_text'])
# tf_features = hstack([claim_tf, evi_text_tf])

# tf_features

#### TF_IDF Cosine similarity

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vocabulary size for the TF-IDF vectorizer (re-assigns the same value used for the TF features).
max_features = 5000

# The TF-IDF vectorizer is fitted on the training AND evaluation text combined.
# NOTE(review): presumably this mirrors the FNC baseline; confirm it is intended.
all_corpus = pd.concat([train_corpus, test_corpus])

def calculate_cosines(claim_tfidf, evi_tfidf) -> np.ndarray:
    """Cosine similarity between row i of `claim_tfidf` and row i of `evi_tfidf`.

    The computation is vectorised on the sparse matrices, so rows are never
    densified and there is no per-row Python loop.

    Args:
        claim_tfidf: sparse matrix with one row per claim.
        evi_tfidf: sparse matrix with the same number of columns and at least
            as many rows; only the first `claim_tfidf.shape[0]` rows are used.

    Returns:
        Float array of shape (n_claims, 1). A pair in which either vector is
        all zeros gets similarity 0.
    """
    def _row_sums(matrix):
        # sum(axis=1) returns an (n, 1) matrix; flatten it to a 1-D float array.
        return np.asarray(matrix.sum(axis=1), dtype=float).reshape(-1)

    n_rows = claim_tfidf.shape[0]
    evi_tfidf = evi_tfidf[:n_rows]

    dot_products = _row_sums(claim_tfidf.multiply(evi_tfidf))
    claim_norms = np.sqrt(_row_sums(claim_tfidf.multiply(claim_tfidf)))
    evi_norms = np.sqrt(_row_sums(evi_tfidf.multiply(evi_tfidf)))

    denominators = claim_norms * evi_norms
    cosines = np.zeros_like(dot_products)
    # Rows with a zero denominator keep the initial 0 instead of dividing by zero.
    np.divide(dot_products, denominators, out=cosines, where=denominators > 0)
    return cosines.reshape(-1, 1)

# Learn the vocabulary and IDF weights from all_corpus; rows are L2-normalised.
tfidf_vectorizer = TfidfVectorizer(norm='l2', max_features=max_features).fit(all_corpus)

# Training split: embed claims and evidence, then take their row-wise cosine.
train_claim_tfidf, train_evi_tfidf = (
    tfidf_vectorizer.transform(train_df[column]) for column in ('claim', 'evi_text')
)
train_cosines = calculate_cosines(train_claim_tfidf, train_evi_tfidf)

# Evaluation split: same transformation with the already-fitted vectorizer.
test_claim_tfidf, test_evi_tfidf = (
    tfidf_vectorizer.transform(test_df[column]) for column in ('claim', 'evi_text')
)
test_cosines = calculate_cosines(test_claim_tfidf, test_evi_tfidf)



#### Concat features together

In [6]:
# Final design matrix per sample: [claim TF | evidence TF | TF-IDF cosine].
# .toarray() densifies the matrix, which needs a lot of memory on the full
# training set.
x_train = hstack([train_tf_features, train_cosines]).toarray()
# Pick the one-hot label columns by name rather than by position: the column
# order of train_df depends on the pandas version, and this order must match
# the argmax decoding used when writing results
# (0 = NOT ENOUGH INFO, 1 = REFUTES, 2 = SUPPORTS).
y_train = train_df[['NOINFO', 'REF', 'SUP']].values
x_test = hstack([test_tf_features, test_cosines]).toarray()

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(145449, 10001)
(145449, 3)
(5001, 10001)


## Build and Train model
Build an MLP that takes a 10001-dimensional feature vector as input, with one hidden layer of 100 neurons and a softmax output layer. 

### Simple MLP model prototype

In [7]:
# # from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# # from keras.layers import Bidirectional, GlobalMaxPool1D
# import keras
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# model = Sequential()
# model.add(Dense(units=100, activation='relu', input_dim=x_train.shape[1]))
# model.add(Dropout(0.3))
# model.add(Dense(units=3, activation='softmax'))
# model.compile(loss=keras.losses.categorical_crossentropy,
#               optimizer='adam', metrics=['accuracy'])

# model.summary()
# # SVG(model_to_dot(model).create(prog='dot', format='svg'))

# # callbacks
# filepath="best_weights.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# earlyStopping = EarlyStopping(monitor='val_acc', patience=1, verbose=0, mode='min')

# callbacks_list = [checkpoint, earlyStopping]

# model.fit(x=x_train, y=y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=callbacks_list)

### Tune hyper-parameters manually

In [8]:
import itertools

class Cartesian(object):
    """Builds the Cartesian product of several named value lists.

    Register each value list with `add_data`, then call `build` to get one
    dictionary per combination, keyed by the registered names.
    """

    def __init__(self):
        self._data_list = []
        self._name_list = []
        # Result of the most recent build() call.
        self.cartesian_result = []

    def add_data(self, data, name):
        """Register the iterable `data` under the key `name`."""
        self._data_list.append(data)
        self._name_list.append(name)

    def build(self):
        """Compute the Cartesian product of everything registered so far.

        The result is rebuilt from scratch on every call, so calling build()
        repeatedly does not accumulate duplicate combinations.

        Returns:
            List of dicts mapping each registered name to one of its values.
            The list is also stored in `self.cartesian_result`.
        """
        self.cartesian_result = [
            dict(zip(self._name_list, item))
            for item in itertools.product(*self._data_list)
        ]
        return self.cartesian_result


In [46]:
# from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
# from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam
import keras

# Fix NumPy's global random seed for reproducibility.
# NOTE(review): only NumPy is seeded here; the Keras backend may keep its own
# random state, so training runs could still differ — confirm.
seed = 7
np.random.seed(seed)

def create_model(units = 100, dropout = 0.5, lr = 0.001):
    """Build and compile a one-hidden-layer MLP with a 3-way softmax output.

    Args:
        units: number of neurons in the hidden ReLU layer.
        dropout: dropout rate applied after the hidden layer.
        lr: learning rate of the Adam optimizer.

    Returns:
        A compiled Keras `Sequential` model whose input width is taken from
        the module-level `x_train`.
    """
    model = Sequential()
    model.add(Dense(units=units, activation='relu', input_dim=x_train.shape[1]))
    model.add(Dropout(dropout))
    model.add(Dense(units=3, activation='softmax'))
    # Pass the configured optimizer object; the string 'adam' would build a
    # default Adam and silently ignore `lr`.
    optimizer = Adam(lr=lr)
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=optimizer, metrics=['accuracy'])
    return model

def fit_model(model, batch_size=32):
    """Train `model` on the module-level `x_train` / `y_train`.

    Training runs for at most 50 epochs, holds out 10% of the data for
    validation, and stops early when 'val_acc' has not improved for 4 epochs.

    Args:
        model: compiled Keras model to train.
        batch_size: mini-batch size passed to `model.fit`.

    Returns:
        The Keras History object returned by `model.fit`.
    """
    early_stopping = EarlyStopping(monitor='val_acc', patience=4,
                                   verbose=0, mode='auto')
    callbacks_list = [early_stopping]

    model_history = model.fit(x=x_train, y=y_train,
                              batch_size=batch_size, epochs=50,
                              validation_split=0.1, callbacks=callbacks_list, verbose=1)
    return model_history

# Candidate values for each tuned hyper-parameter.
units_list = [25, 50, 100, 250, 500]
dropout_list = [0.3, 0.4, 0.5, 0.6, 0.7]
batch_size_list = [16, 32, 64, 128, 256, 512]
lr_list = [0.0001, 0.001, 0.01, 0.1]

# Register every axis of the grid, then expand it into one dict per combination.
car_product = Cartesian()
for axis_name, axis_values in (
    ('units', units_list),
    ('dropout', dropout_list),
    ('batch_size', batch_size_list),
    ('lr', lr_list),
):
    car_product.add_data(axis_values, axis_name)
parameter_combinations = car_product.build()

# Train one model per hyper-parameter combination and record its best
# validation accuracy.
historys_list = []
for iternum, combination in enumerate(parameter_combinations):
    print("itertion: " + str(iternum))
    print(combination)
    # Drop the previous iteration's model from the backend session so memory
    # does not grow with every combination. Clearing before building keeps the
    # most recent model usable after the loop.
    keras.backend.clear_session()
    model = create_model(units=combination['units'],
                         dropout=combination['dropout'],
                         lr=combination['lr'])
    model_history = fit_model(model=model, batch_size=combination['batch_size'])
    max_val_acc = max(model_history.history['val_acc'])
    historys_list.append({
        'combination': combination,
        'max_val_acc': max_val_acc
    })
    print("result: " + str(max_val_acc))


itertion: 0
{'units': 25, 'dropout': 0.3, 'batch_size': 16, 'lr': 1e-05}
Train on 130904 samples, validate on 14545 samples
Epoch 1/50

KeyboardInterrupt: 

In [42]:
# Rank the tuning runs by best validation accuracy (highest first) and save them.
ordered_history = list(historys_list)
ordered_history.sort(key=lambda entry: entry['max_val_acc'], reverse=True)

historys_list_dict = dict(historys=ordered_history)

# The ranked list itself is what gets written to tune_hps.json.
with open('tune_hps.json', 'w') as hp_result:
    json.dump(ordered_history, hp_result, indent=4)
    

### Tune hyper-parameters with sklearn

In [None]:
# from sklearn.model_selection import GridSearchCV
# from keras.wrappers.scikit_learn import KerasClassifier
# from sklearn.model_selection import train_test_split
# import keras
# from keras.callbacks import ModelCheckpoint, EarlyStopping
# from keras.models import Sequential
# from keras.layers import Dense, Dropout
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot


# # # fix random seed for reproducibility
# seed = 7
# np.random.seed(seed)

# def create_model():
#     model = Sequential()
#     model.add(Dense(units=100, activation='relu', input_dim=x_train.shape[1]))
#     model.add(Dropout(0.3))
#     model.add(Dense(units=3, activation='softmax'))
#     model.compile(loss=keras.losses.categorical_crossentropy,
#                   optimizer='adam', metrics=['accuracy'])
#     return model

# model = KerasClassifier(build_fn=create_model, verbose=2)
# batch_size = [64, 128]
# # epochs = [1, 2]
# param_grid = dict(batch_size=batch_size)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=2)
# grid_result = grid.fit(X=x_train, y=y_train)


# # summarize results
# # print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# # means = grid_result.cv_results_['mean_test_score']
# # stds = grid_result.cv_results_['std_test_score']
# # params = grid_result.cv_results_['params']
# # for mean, stdev, param in zip(means, stds, params):
# #     print("%f (%f) with: %r" % (mean, stdev, param))

# grid_result.best_params_

In [None]:
def create_model(units=100, dropout=0.3, lr=0.001):
    """Build and compile the final one-hidden-layer MLP.

    Calling it without arguments builds the same network as before: 100
    hidden units, dropout 0.3 and Adam with a learning rate of 0.001. The
    parameters exist so that callers written for the earlier tuning version
    of this function keep working.

    Args:
        units: number of neurons in the hidden ReLU layer.
        dropout: dropout rate applied after the hidden layer.
        lr: learning rate of the Adam optimizer.

    Returns:
        A compiled Keras `Sequential` model whose input width is taken from
        the module-level `x_train`.
    """
    model = Sequential()
    model.add(Dense(units=units, activation='relu', input_dim=x_train.shape[1]))
    model.add(Dropout(dropout))
    model.add(Dense(units=3, activation='softmax'))
    # `verbose` is not a compile() option (it belongs to fit/predict), so it
    # is no longer passed here.
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=Adam(lr=lr), metrics=['accuracy'])
    return model

def fit_model():
    """Train the module-level `model` and checkpoint its best weights.

    Trains on the module-level `x_train` / `y_train` for at most 10 epochs
    with 10% of the data held out for validation. The weights with the
    highest 'val_acc' are saved to best_weights.hdf5, and training stops as
    soon as 'val_acc' fails to improve for one epoch.

    Returns:
        The Keras History object returned by `model.fit`.
    """
    # callbacks
    filepath="best_weights.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # Accuracy is better when higher, so improvement must be measured with
    # mode='max'; 'min' would treat a drop in accuracy as an improvement.
    earlyStopping = EarlyStopping(monitor='val_acc', patience=1, verbose=0, mode='max')

    callbacks_list = [checkpoint, earlyStopping]

    return model.fit(x=x_train, y=y_train, batch_size=128, epochs=10, validation_split=0.1, callbacks=callbacks_list)


## Apply model

### Apply model on test data

In [None]:
# NOTE(review): relies on a module-level `model` that no statement in this
# section creates; it must already exist in the kernel, with an architecture
# matching the saved weights — confirm before a Restart & Run All.
model.load_weights("best_weights.hdf5")
# Despite its name, y_test holds the model's predictions for x_test (one row
# of class scores per sample), not ground-truth labels.
y_test = model.predict(x_test, batch_size=128, verbose=1)
y_test

### Output result to file

In [None]:
# Label for each output position: 0 -> NOT ENOUGH INFO, 1 -> REFUTES,
# anything else -> SUPPORTS.
label_names = ("NOT ENOUGH INFO", "REFUTES", "SUPPORTS")

result_dict = {}

for i in range(len(test_df)):
    predicted_class = np.argmax(y_test[i])
    # min() sends every index above 1 to the last entry, like the original else-branch.
    label = label_names[min(predicted_class, 2)]
    key = test_df['key'][i]
    result_dict[key] = {
        "claim": test_df['claim'][i],
        "label": label,
        "evidence": test_df['evidence'][i]
    }

# Persist the predictions keyed by record id.
with open('result_on_dev.json', 'w') as outfile:
    json.dump(result_dict, outfile, indent=4)