## Deploy BERT server

Instruction website: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html  
Install the server and client packages:
``` bash
pip install -U bert-serving-server bert-serving-client  
```
Download and unzip the pretrained BERT model (BERT-Large, Uncased, 1024-dimensional output):  
``` bash
cd ${model_path}
wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip
unzip uncased_L-24_H-1024_A-16.zip  
```  

Start the BERT server on the local machine:
``` bash
bert-serving-start -model_dir ${model_path}/uncased_L-24_H-1024_A-16 -max_seq_len=100 -num_worker=1  
bert-serving-start -model_dir /share/ShareFolder/uncased_L-24_H-1024_A-16/ -max_seq_len=150 -gpu_memory_fraction=0.9 -num_worker=1
```
Then call the server from the client side in Python:
``` python
from bert_serving.client import BertClient
bc = BertClient()
bc.encode(['First do it', 'then do it right', 'then do it better'])
```


## Load data as Pandas dataframe

In [None]:
import json
import numpy as np
import pandas as pd

# Locations of the JSON datasets. `use_test_file` selects which file is
# loaded into `test`: the test split when True, the dev split otherwise.
train_file_path = "./JSONFiles/" + "train_with_text.json"
use_test_file = False
if use_test_file:
    test_file_path = './JSONFiles/' + 'test_with_text.json'
else:
    test_file_path = './JSONFiles/' + 'dev_with_text.json'

# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
with open(train_file_path, mode='r', encoding='utf-8') as f:
    train = json.load(f)
with open(test_file_path, mode='r', encoding='utf-8') as f:
    test = json.load(f)

def load_training_data(dataset: dict) -> list:
    """Flatten the labelled dataset into a list of per-claim records.

    Args:
        dataset: Mapping from a claim key to a record dict. The fields read
            from each record are ``"claim"``, ``"evidence_texts"`` and
            ``"label"``.

    Returns:
        One dict per entry of ``dataset``, in iteration order, with the keys
        ``"claim"``, ``"evi_text"``, ``"claim_with_evi_text"``, ``"SUP"``,
        ``"NOINFO"`` and ``"REF"`` (in that order). Exactly one of the three
        label flags is 1: ``"SUP"`` for the label ``"SUPPORTS"``, ``"REF"``
        for ``"REFUTES"``, and ``"NOINFO"`` for any other (or missing) label.
    """
    dataset_list = []
    for record in dataset.values():
        claim = record.get("claim")
        # A missing or null "evidence_texts" is handled like an empty list
        # instead of crashing inside str.join.
        evi_texts = record.get("evidence_texts") or []
        # Evidence sentences are concatenated with no separator; records
        # without any evidence text get the placeholder "no word".
        text = ''.join(evi_texts) or "no word"

        label = record.get("label")
        dataset_list.append({
            "claim": claim,
            "evi_text": text,
            "claim_with_evi_text": claim + " ||| " + text,
            "SUP": int(label == "SUPPORTS"),
            "NOINFO": int(label not in ("SUPPORTS", "REFUTES")),
            "REF": int(label == "REFUTES"),
        })
    return dataset_list

def load_test_data(dataset: dict) -> list:
    """Flatten the unlabelled dataset into a list of per-claim records.

    Args:
        dataset: Mapping from a claim key to a record dict. The fields read
            from each record are ``"claim"``, ``"evidence"`` and
            ``"evidence_texts"``.

    Returns:
        One dict per entry of ``dataset``, in iteration order, with the keys
        ``"key"``, ``"claim"``, ``"evidence"``, ``"claim_with_evi_text"`` and
        ``"evi_text"`` (in that order). ``"key"`` is the entry's key in
        ``dataset`` and ``"evidence"`` is passed through unchanged.
    """
    dataset_list = []
    for key, record in dataset.items():
        claim = record.get("claim")
        # A missing or null "evidence_texts" is handled like an empty list
        # instead of crashing inside str.join.
        evi_texts = record.get("evidence_texts") or []
        # Evidence sentences are concatenated with no separator; records
        # without any evidence text get the placeholder "no word".
        text = ''.join(evi_texts) or "no word"

        dataset_list.append({
            "key": key,
            "claim": claim,
            "evidence": record.get("evidence"),
            "claim_with_evi_text": claim + " ||| " + text,
            "evi_text": text,
        })
    return dataset_list

# Turn the flattened record lists into one DataFrame per split.
train_df = pd.DataFrame(data=load_training_data(train))
test_df = pd.DataFrame(data=load_test_data(test))

# Notebook preview: first ten rows of the training frame.
train_df.iloc[:10]

In [None]:
# Notebook preview: first ten rows of the test frame.
test_df.iloc[:10]

## Feature extraction

### Construct and save BERT features to file for reuse

In [None]:
from bert_serving.client import BertClient
# Client handle used by the encoding calls further down. It is created with
# default arguments, so it presumably connects to a bert-serving server
# running on this machine (see the deployment notes at the top of the
# notebook) -- TODO confirm host/ports if the server runs elsewhere.
bc = BertClient()

# result = bert_embedding(sentences)

In [None]:
import os

# Directory that receives every cached BERT encoding.
ENCODING_DIR = "./BERT_MLP_encodings"
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(ENCODING_DIR, exist_ok=True)


def _encode_and_save(texts, name):
    """Encode ``texts`` with the BERT client and cache the result on disk.

    Args:
        texts: Iterable of strings (e.g. a DataFrame column) to encode.
        name: File name, without extension, written under ``ENCODING_DIR``;
            ``np.save`` appends the ``.npy`` suffix.

    Returns:
        Whatever ``bc.encode`` returned for ``texts``.
    """
    encoding = bc.encode(list(texts))
    np.save(os.path.join(ENCODING_DIR, name), encoding)
    return encoding


# train, test claim encode
train_claim_encode = _encode_and_save(train_df['claim'], "train_claim_encode")
test_claim_encode = _encode_and_save(test_df['claim'], "test_claim_encode")

# train, test evidence encode
train_evi_encode = _encode_and_save(train_df['evi_text'], "train_evi_encode")
# NOTE: this file used to be written as "test_evi_endcode" (typo), which did
# not match the other file names; rename any previously cached copy.
test_evi_encode = _encode_and_save(test_df['evi_text'], "test_evi_encode")

# train, test claim+evidence pair encode
train_pair_encode = _encode_and_save(train_df['claim_with_evi_text'], "train_pair_encode")
test_pair_encode = _encode_and_save(test_df['claim_with_evi_text'], "test_pair_encode")

### Load BERT features from file

In [None]:
# from scipy.sparse import coo_matrix, hstack

# train_claim_features = np.load("train_claim_encode.npy")
# train_evi_features = np.load("train_evi_encode.npy")
# test_claim_features = np.load("test_claim_encode.npy")
# test_evi_features = np.load("test_evi_encode.npy")

# x_train = hstack([train_claim_features, train_evi_features])
# y_train = train_df[train_df.columns[0:3]].values
# test_features = hstack([test_claim_features, test_evi_features])