In [1]:
! pip install transformers
! pip install tensorflow==2.3.0
import tensorflow as tf
from transformers import RobertaConfig, AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification
# import tensorflow as tf
tf.config.list_physical_devices('GPU')

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.3 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 2.4 MB/s  eta 0:00:01
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.8 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.4 MB/s eta 0:00:01
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.1.2 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.5
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_tensorflow2_

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 320.4 MB 43 kB/s 
Collecting numpy<1.19.0,>=1.16.0
  Downloading numpy-1.18.5-cp36-cp36m-manylinux1_x86_64.whl (20.1 MB)
[K     |████████████████████████████████| 20.1 MB 116.9 MB/s eta 0:00:01
Collecting scipy==1.4.1
  Downloading scipy-1.4.1-cp36-cp36m-manylinux1_x86_64.whl (26.1 MB)
[K     |████████████████████████████████| 26.1 MB 114.6 MB/s eta 0:00:01
Installing collected packages: numpy, scipy, tensorflow
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
  Attempting uninstall: scipy
    Found existing installation: scipy 1.5.3
    Uninstalling scipy-1.5.3:
      Successfully uninstalled scipy-1.5.3
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.3.4
    Uninstalling tensorflow-2.3.4:
      Successfully uninstalled tensorflow-2.3.4
Successfully installed numpy-1.18.5 scipy-1.4.1 tensorflow-2.3.0
You sho

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
import numpy as np
import pandas as pd
import argparse
import logging
import os
import sys
import csv
import s3fs
import json
import re
import gc
import scipy.stats as st
from sklearn.metrics import precision_recall_curve


fs = s3fs.S3FileSystem()
MAX_LEN = 128
PEACE_COUNTRY = set(['Australia', 'New Zealand', 
                 'Belgium', 'Sweden', 'Denmark', 
                 'Norway', 'Finland', 'Czech Republic', 
                 'Netherlands', 'Austria'])
MAJOR_COUNTRY = set(['Australia', 'India'])

# Initialize tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_labels=1, #Binary Classification
    dropout=0.1,
    attention_dropout=0.1,
    output_hidden_states=False,
    output_attentions=False
)

def regular_encode(texts, tokenizer, maxlen=MAX_LEN):
    """
    Function to encode the word
    """
    # encode the word to vector of integer
    enc_di = tokenizer.encode_plus(
        texts, 
        return_attention_mask=True, 
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        max_length=maxlen)
    
    return np.array(enc_di['input_ids']), np.array(enc_di['attention_mask'])

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

## Initialize Model getter

In [4]:
def get_model(lr = 3e-5):
    bert_model = TFAutoModelForSequenceClassification.from_pretrained('roberta-base', trainable=True, config=config)
    input_ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
    input_masks_ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
    output_layer = bert_model(input_ids_in, input_masks_ids_in)[0]
    output_layer = tf.keras.layers.Activation(activation='sigmoid')(output_layer)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_ids_in], outputs = output_layer)
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
               tf.keras.metrics.Precision(name='precision', thresholds=0.5),
               tf.keras.metrics.Recall(name='recall', thresholds=0.5),
               tf.keras.metrics.TruePositives(name='TP', thresholds=0.5),
               tf.keras.metrics.TrueNegatives(name='TN', thresholds=0.5),
               tf.keras.metrics.FalsePositives(name='FP', thresholds=0.5),
               tf.keras.metrics.FalseNegatives(name='FN', thresholds=0.5)]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    
    return model

## Load Unshuffled, Original Data

In [5]:
# Load in data
train_count = 0  
val_count = 0  
train_label_counter = [0, 0]
val_label_counter = [0, 0]

train_label_count_max = 5e3 
val_label_count_max = 1e3
total_train = 2 * train_label_count_max
total_val = 2 * val_label_count_max

X_train = []
y_train = []

X_val = []
y_val = []

for line in fs.open('s3://compressed-data-sample/processed_train.json'):
    if train_count >= total_train and val_count >= total_val:
        break
    json_file = json.loads(line)
    country = json_file['country']
    label =  int(json_file['country'] in PEACE_COUNTRY)
    
    if not country in MAJOR_COUNTRY:
        if train_label_counter[label] < train_label_count_max :
            sent = json_file['content_cleaned']
            ids, msk = regular_encode(sent, tokenizer) # tokenize content_cleaned
            
            X_train.append({'input_ids': ids,'attention_mask':msk})
            y_train.append(json_file['country'])
            train_count += 1
            train_label_counter[label] += 1
            if sum(train_label_counter) % 1e3 == 0:
                print('Train', train_label_counter)
    else:
        if val_label_counter[label] < val_label_count_max :
            sent = json_file['content_cleaned']
            ids, msk = regular_encode(sent, tokenizer) # tokenize content_cleaned
            
            X_val.append({'input_ids': ids,'attention_mask':msk})
            y_val.append(json_file['country'])
            val_count += 1
            val_label_counter[label] += 1
            if sum(val_label_counter) % 1e3 == 0:
                print('Val', val_label_counter)

Val [532, 468]
Val [1000, 1000]
Train [240, 760]
Train [481, 1519]
Train [749, 2251]
Train [1028, 2972]
Train [1268, 3732]
Train [1522, 4478]
Train [2000, 5000]
Train [3000, 5000]
Train [4000, 5000]
Train [5000, 5000]


In [6]:
from collections import Counter
import statistics

median = statistics.median(list(Counter(y_train).values()))
median

287

## Prepare K-Fold for minority country

In [7]:
X_train = np.array(X_train)
X_val = np.array(X_val)

y_train = np.array(y_train)
y_val = np.array(y_val)

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import random

## Split minority samples to 10 folds
# skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
# minority_index = [val_split for _, val_split in skf.split(X_train,y_train)]

## draw meidan * 10 number of samples from each majoriy
australia_idx = np.where(np.array(y_val) == 'Australia')[0]
india_idx = np.where(np.array(y_val) == 'India')[0]
majority_index = []
for i in range(10):
    australia_sample = np.array(random.sample(list(australia_idx), median))
    india_sample = np.array(random.sample(list(india_idx), median))
    majority_index.append(np.hstack([australia_sample, india_sample]))

In [9]:
eval_results = []
metrics_names = None
    
for maj_index in majority_index:
    tf.keras.backend.clear_session()
    X_all_sample = np.hstack([X_train, X_val[maj_index]])
    y_all_sample = np.hstack([y_train, y_val[maj_index]])
    y_all_sample = np.array(list(map(lambda x: int(x in PEACE_COUNTRY), y_all_sample)))
    
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(X_all_sample, y_all_sample, test_size=0.2)
    
    
    model = get_model()
        
    train_input1 = np.vstack([x['input_ids'] for x in X_train_fold])
    train_input2 = np.vstack([x['attention_mask'] for x in X_train_fold])
    model.fit(x=[train_input1, train_input2], 
              y=np.asarray(y_train_fold),
              epochs = 1, 
              batch_size = 32,
              class_weight={0: 1., 1: 1.})

    eval_input1 = np.vstack([x['input_ids'] for x in X_val_fold])
    eval_input2 = np.vstack([x['attention_mask'] for x in X_val_fold])
    er = model.evaluate(x=[eval_input1, eval_input2], 
                        y=np.asarray(y_val_fold), return_dict=True)
    f1 = 2*er['precision']*er['recall'] / (er['precision']+er['recall'])

    precision_neg = er['TN'] / (er['TN'] + er['FN'])
    recall_neg = er['TN'] / (er['TN']+er['FP'])
    f1_neg = 2 * precision_neg * recall_neg / (precision_neg + recall_neg)

    er = list(er.values())
    er += [f1, precision_neg, recall_neg, f1_neg]
    eval_results.append(er)
    metrics_names = model.metrics_names
    print(er)

    
eval_results = np.array(eval_results)
metrics_names += ['f1', 'precision_neg', 'recall_neg', 'f1_neg']
eval_results = pd.DataFrame(eval_results, columns=metrics_names)
ci = eval_results.apply(lambda x: st.t.interval(0.95, len(x), loc=np.mean(x), scale=st.sem(x)))
ci.index = ['ci_lower', 'ci_upper']
eval_results = pd.concat([eval_results.describe(), ci])
eval_results

Downloading:   0%|          | 0.00/627M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.10004357993602753, 0.964066207408905, 0.9533146619796753, 0.9751671552658081, 1021.0, 1018.0, 50.0, 26.0, 0.9641170984165603, 0.975095785440613, 0.9531835205992509, 0.9640151515151515]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1253553181886673, 0.9503546357154846, 0.9737903475761414, 0.9244019389152527, 966.0, 1044.0, 26.0, 79.0, 0.9484536332830832, 0.9296527159394479, 0.9757009345794393, 0.9521203830369358]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1507667601108551, 0.9460992813110352, 0.9799618124961853, 0.9169642925262451, 1027.0, 974.0, 21.0, 93.0, 0.9474169686622305, 0.9128397375820057, 0.978894472361809, 0.9447138700290979]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.12536859512329102, 0.949409008026123, 0.9672293663024902, 0.9293892979621887, 974.0, 1034.0, 33.0, 74.0, 0.9479318522522303, 0.9332129963898917, 0.9690721649484536, 0.9508045977011494]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1363515704870224, 0.9460992813110352, 0.9463087320327759, 0.9444975852966309, 987.0, 1014.0, 56.0, 58.0, 0.9454022912430987, 0.9458955223880597, 0.9476635514018692, 0.9467787114845938]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.12244981527328491, 0.9560283422470093, 0.9715157747268677, 0.936274528503418, 955.0, 1067.0, 28.0, 65.0, 0.9535696584429428, 0.9425795053003534, 0.9744292237442922, 0.9582397844634036]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.12823329865932465, 0.9546099305152893, 0.9580078125, 0.9487427473068237, 981.0, 1038.0, 43.0, 53.0, 0.953352770028538, 0.9514207149404217, 0.9602220166512488, 0.9558011049723758]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.10388487577438354, 0.9593380689620972, 0.9414455890655518, 0.9790675640106201, 1029.0, 1000.0, 64.0, 22.0, 0.959888077890602, 0.9784735812133072, 0.9398496240601504, 0.9587727708533077]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.11397368460893631, 0.9579195976257324, 0.9622458815574646, 0.9521072506904602, 994.0, 1032.0, 39.0, 50.0, 0.9571497184556493, 0.9537892791127541, 0.9635854341736695, 0.9586623316302834]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1328258514404297, 0.9513002634048462, 0.966469407081604, 0.9342230558395386, 980.0, 1032.0, 34.0, 69.0, 0.95007269242448, 0.9373297002724795, 0.9681050656660413, 0.9524688509460083]


Unnamed: 0,loss,accuracy,precision,recall,TP,TN,FP,FN,f1,precision_neg,recall_neg,f1_neg
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.123925,0.953522,0.962029,0.944084,991.4,1025.3,39.4,58.9,0.952735,0.946029,0.963071,0.954238
std,0.015092,0.005906,0.012291,0.020476,26.022213,25.534508,13.841965,22.343033,0.006072,0.020023,0.012837,0.005962
min,0.100044,0.946099,0.941446,0.916964,955.0,974.0,21.0,22.0,0.945402,0.91284,0.93985,0.944714
25%,0.116093,0.949645,0.954488,0.930598,975.5,1015.0,29.25,50.75,0.948062,0.934242,0.954943,0.951134
50%,0.125362,0.952955,0.964358,0.940386,984.0,1032.0,36.5,61.5,0.951713,0.944238,0.965845,0.954135
75%,0.131678,0.957447,0.970444,0.951266,1014.25,1037.0,48.25,72.75,0.956255,0.953197,0.97309,0.958557
max,0.150767,0.964066,0.979962,0.979068,1029.0,1067.0,64.0,93.0,0.964117,0.978474,0.978894,0.964015
ci_lower,0.113291,0.949361,0.953369,0.929656,973.064765,1007.308402,29.64696,43.157113,0.948457,0.931921,0.954026,0.950037
ci_upper,0.134559,0.957684,0.970689,0.958511,1009.735235,1043.291598,49.15304,74.642887,0.957014,0.960137,0.972115,0.958439


In [18]:
# View results on training data
eval_results = np.array([
    [0.1000, 0.9641, 0.9533, 0.9752, 1021.0000, 1018.0000, 50.0000, 26.0000],
    [0.2332, 0.9025, 0.9021, 0.9036, 3833.0000, 3801.0000, 416.0000, 409.0000],
    [0.2263, 0.9006, 0.9074, 0.8889, 3704.0000, 3914.0000, 378.0000, 463.0000],
    [0.2038, 0.9119, 0.9091, 0.9158, 3882.0000, 3832.0000, 388.0000, 357.0000],
    [0.2083, 0.9149, 0.9011, 0.9326, 3956.0000, 3783.0000, 434.0000, 286.0000],
    [0.2180, 0.9102, 0.8917, 0.9356, 3992.0000, 3707.0000, 485.0000, 275.0000],
    [0.2247, 0.9091, 0.9034, 0.9172, 3901.0000, 3789.0000, 417.0000, 352.0000],
    [0.2132, 0.9094, 0.9079, 0.9117, 3862.0000, 3831.0000, 392.0000, 374.0000],
    [0.2083, 0.9161, 0.9152, 0.9177, 3894.0000, 3855.0000, 361.0000, 349.0000],
    [0.2200, 0.9076, 0.8885, 0.9325, 3952.0000, 3725.0000, 496.0000, 286.0000]
])
eval_results = pd.DataFrame(eval_results, columns=metrics_names[:8])
eval_results['F1'] =  2*eval_results['precision']*eval_results['recall'] / (eval_results['precision']+eval_results['recall'])
eval_results['precision_neg'] = eval_results['TN'] / (eval_results['TN'] + eval_results['FN'])
eval_results['recall_neg'] = eval_results['TN'] / (eval_results['TN']+eval_results['FP'])
eval_results['f1_neg'] = 2 * eval_results['precision_neg'] * eval_results['recall_neg'] / (eval_results['precision_neg'] + eval_results['recall_neg'])

ci = eval_results.apply(lambda x: st.t.interval(0.95, len(x), loc=np.mean(x), scale=st.sem(x)))
ci.index = ['ci_lower', 'ci_upper']
eval_results = pd.concat([eval_results.describe(), ci])
eval_results

Unnamed: 0,loss,accuracy,precision,recall,TP,TN,FP,FN,F1,precision_neg,recall_neg,f1_neg
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.20558,0.91464,0.90797,0.92308,3599.7,3525.5,381.7,317.7,0.915363,0.921933,0.90608,0.91383
std,0.038214,0.018037,0.017811,0.023205,909.569746,883.095094,124.501718,117.96426,0.018063,0.02208,0.019687,0.018152
min,0.1,0.9006,0.8885,0.8889,1021.0,1018.0,50.0,26.0,0.898055,0.89422,0.882492,0.9021
25%,0.2083,0.907975,0.90135,0.912725,3840.25,3739.5,380.5,286.0,0.909839,0.911988,0.898026,0.905509
50%,0.2156,0.9098,0.9054,0.91745,3888.0,3795.0,404.0,350.5,0.911343,0.91599,0.904263,0.908492
75%,0.223525,0.91415,0.9088,0.932575,3939.25,3831.75,429.75,369.75,0.915617,0.929458,0.910961,0.912681
max,0.2332,0.9641,0.9533,0.9752,3992.0,3914.0,496.0,463.0,0.964126,0.975096,0.953184,0.964015
ci_lower,0.178654,0.901931,0.895421,0.90673,2958.817729,2903.271752,293.976168,234.582456,0.902636,0.906375,0.892209,0.90104
ci_upper,0.232506,0.927349,0.920519,0.93943,4240.582271,4147.728248,469.423832,400.817544,0.92809,0.937491,0.919952,0.92662


## Load Shuffled Data

In [10]:
# Load in data
train_count = 0  
val_count = 0  
train_label_counter = [0, 0]
val_label_counter = [0, 0]

train_label_count_max = 5e3 
val_label_count_max = 1e3
total_train = 2 * train_label_count_max
total_val = 2 * val_label_count_max

X_train = []
y_train = []

X_val = []
y_val = []

for line in fs.open('s3://compressed-data-sample/shuffled_train.json'):
    if train_count >= total_train and val_count >= total_val:
        break
    json_file = json.loads(line)
    country = json_file['country']
    label =  int(json_file['country'] in PEACE_COUNTRY)
    
    if not country in MAJOR_COUNTRY:
        if train_label_counter[label] < train_label_count_max :
            sent = json_file['content_cleaned_shuffled']
            ids, msk = regular_encode(sent, tokenizer) # tokenize content_cleaned
            
            X_train.append({'input_ids': ids,'attention_mask':msk})
            y_train.append(json_file['country'])
            train_count += 1
            train_label_counter[label] += 1
            if sum(train_label_counter) % 1e3 == 0:
                print('Train', train_label_counter)
    else:
        if val_label_counter[label] < val_label_count_max :
            sent = json_file['content_cleaned_shuffled']
            ids, msk = regular_encode(sent, tokenizer) # tokenize content_cleaned
            
            X_val.append({'input_ids': ids,'attention_mask':msk})
            y_val.append(json_file['country'])
            val_count += 1
            val_label_counter[label] += 1
            if sum(val_label_counter) % 1e3 == 0:
                print('Val', val_label_counter)

Val [532, 468]
Val [1000, 1000]
Train [240, 760]
Train [481, 1519]
Train [749, 2251]
Train [1028, 2972]
Train [1268, 3732]
Train [1522, 4478]
Train [2000, 5000]
Train [3000, 5000]
Train [4000, 5000]
Train [5000, 5000]


In [11]:
median = statistics.median(list(Counter(y_train).values()))
median

287

## Prepare K-Fold for minority country

In [12]:
X_train = np.array(X_train)
X_val = np.array(X_val)

y_train = np.array(y_train)
y_val = np.array(y_val)

In [13]:
australia_idx = np.where(np.array(y_val) == 'Australia')[0]
india_idx = np.where(np.array(y_val) == 'India')[0]
majority_index = []
for i in range(10):
    australia_sample = np.array(random.sample(list(australia_idx), median))
    india_sample = np.array(random.sample(list(india_idx), median))
    majority_index.append(np.hstack([australia_sample, india_sample]))

In [14]:
eval_results = []
metrics_names = None
    
for maj_index in majority_index:
    tf.keras.backend.clear_session()
    X_all_sample = np.hstack([X_train, X_val[maj_index]])
    y_all_sample = np.hstack([y_train, y_val[maj_index]])
    y_all_sample = np.array(list(map(lambda x: int(x in PEACE_COUNTRY), y_all_sample)))
    
    X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(X_all_sample, y_all_sample, test_size=0.2)
    
    
    model = get_model()
        
    train_input1 = np.vstack([x['input_ids'] for x in X_train_fold])
    train_input2 = np.vstack([x['attention_mask'] for x in X_train_fold])
    model.fit(x=[train_input1, train_input2], 
              y=np.asarray(y_train_fold),
              epochs = 1, 
              batch_size = 32,
              class_weight={0: 1., 1: 1.})

    eval_input1 = np.vstack([x['input_ids'] for x in X_val_fold])
    eval_input2 = np.vstack([x['attention_mask'] for x in X_val_fold])
    er = model.evaluate(x=[eval_input1, eval_input2], 
                        y=np.asarray(y_val_fold), return_dict=True)
    f1 = 2*er['precision']*er['recall'] / (er['precision']+er['recall'])

    precision_neg = er['TN'] / (er['TN'] + er['FN'])
    recall_neg = er['TN'] / (er['TN']+er['FP'])
    f1_neg = 2 * precision_neg * recall_neg / (precision_neg + recall_neg)

    er = list(er.values())
    er += [f1, precision_neg, recall_neg, f1_neg]
    eval_results.append(er)
    metrics_names = model.metrics_names
    print(er)

    
eval_results = np.array(eval_results)
metrics_names += ['f1', 'precision_neg', 'recall_neg', 'f1_neg']
eval_results = pd.DataFrame(eval_results, columns=metrics_names)
ci = eval_results.apply(lambda x: st.t.interval(0.95, len(x), loc=np.mean(x), scale=st.sem(x)))
ci.index = ['ci_lower', 'ci_upper']
eval_results = pd.concat([eval_results.describe(), ci])
eval_results

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.18971237540245056, 0.9238770604133606, 0.9063948392868042, 0.9421965479850769, 978.0, 976.0, 101.0, 60.0, 0.9239490074091894, 0.9420849420849421, 0.9062209842154132, 0.9238050165641268]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.19686239957809448, 0.9196217656135559, 0.9180952310562134, 0.919847309589386, 964.0, 981.0, 86.0, 84.0, 0.9189704352098627, 0.9211267605633803, 0.9194001874414246, 0.9202626641651032]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.24253001809120178, 0.9073286056518555, 0.9504843950271606, 0.8547918796539307, 883.0, 1036.0, 46.0, 150.0, 0.9001019445076582, 0.8735244519392917, 0.9574861367837338, 0.9135802469135804]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.20869390666484833, 0.9125295281410217, 0.9126505851745605, 0.9026812314987183, 909.0, 1021.0, 87.0, 98.0, 0.9076385337234141, 0.9124218051831993, 0.9214801444043321, 0.9169286035024697]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1959303617477417, 0.9243499040603638, 0.9126838445663452, 0.938563346862793, 993.0, 962.0, 95.0, 65.0, 0.9254427045180253, 0.9367088607594937, 0.9101229895931883, 0.9232245681381958]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.25072550773620605, 0.8959810733795166, 0.8445512652397156, 0.9759259223937988, 1054.0, 841.0, 194.0, 26.0, 0.9054982706038119, 0.9700115340253749, 0.81256038647343, 0.8843322818086226]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.1947273313999176, 0.9167848825454712, 0.9126760363578796, 0.9213269948959351, 972.0, 967.0, 93.0, 83.0, 0.9169811124208188, 0.920952380952381, 0.9122641509433962, 0.9165876777251185]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.20444567501544952, 0.9210401773452759, 0.9494845271110535, 0.8864292502403259, 921.0, 1027.0, 49.0, 118.0, 0.9168740578831184, 0.896943231441048, 0.9544609665427509, 0.9248086447546151]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.20745012164115906, 0.908274233341217, 0.940464198589325, 0.873477041721344, 932.0, 989.0, 59.0, 135.0, 0.905733733566746, 0.8798932384341637, 0.9437022900763359, 0.9106813996316759]


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[0.2293669879436493, 0.914893627166748, 0.8882042169570923, 0.9500941634178162, 1009.0, 926.0, 127.0, 53.0, 0.9181073665330891, 0.9458631256384066, 0.879392212725546, 0.9114173228346456]


Unnamed: 0,loss,accuracy,precision,recall,TP,TN,FP,FN,f1,precision_neg,recall_neg,f1_neg
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.212044,0.914468,0.913569,0.916533,961.5,972.6,93.7,87.2,0.91393,0.919953,0.911709,0.914563
std,0.021355,0.008796,0.031371,0.03741,51.232043,57.063707,43.212781,38.686202,0.008582,0.030362,0.042176,0.011773
min,0.189712,0.895981,0.844551,0.854792,883.0,841.0,46.0,26.0,0.900102,0.873524,0.81256,0.884332
25%,0.196163,0.909338,0.907959,0.890492,923.75,963.25,65.75,61.25,0.90621,0.900813,0.907196,0.911958
50%,0.205948,0.915839,0.91268,0.920587,968.0,978.5,90.0,83.5,0.916928,0.92104,0.915832,0.916758
75%,0.224199,0.920686,0.934872,0.941288,989.25,1013.0,99.5,113.0,0.918755,0.940741,0.938147,0.922484
max,0.250726,0.92435,0.950484,0.975926,1054.0,1036.0,194.0,150.0,0.925443,0.970012,0.957486,0.924809
ci_lower,0.196997,0.90827,0.891465,0.890174,925.401935,932.392948,63.252302,59.941726,0.907883,0.89856,0.881992,0.906268
ci_upper,0.227092,0.920666,0.935673,0.942893,997.598065,1012.807052,124.147698,114.458274,0.919976,0.941346,0.941426,0.922858


In [17]:
# View results on training data
eval_results = np.array([
    [0.3107, 0.8656, 0.8514, 0.8873, 3770.0000, 3552.0000, 658.0000, 479.0000],
    [0.3373, 0.8497, 0.8198, 0.8974, 3804.0000, 3384.0000, 836.0000, 435.0000],
    [0.3271, 0.8557, 0.8429, 0.8764, 3728.0000, 3510.0000, 695.0000, 526.0000],
    [0.3015, 0.8670, 0.8547, 0.8881, 3801.0000, 3533.0000, 646.0000, 479.0000],
    [0.3183, 0.8635, 0.8488, 0.8844, 3740.0000, 3564.0000, 666.0000, 489.0000],
    [0.3758, 0.8365, 0.8006, 0.8940, 3761.0000, 3315.0000, 937.0000, 446.0000],
    [0.3054, 0.8690, 0.8494, 0.8972, 3797.0000, 3554.0000, 673.0000, 435.0000],
    [0.3014, 0.8665, 0.8553, 0.8837, 3754.0000, 3576.0000, 635.0000, 494.0000],
    [0.3171, 0.8612, 0.8426, 0.8877, 3746.0000, 3539.0000, 700.0000, 474.0000],
    [0.3019, 0.8645, 0.8369, 0.9051, 3824.0000, 3489.0000, 745.0000, 401.0000]
])
eval_results = pd.DataFrame(eval_results, columns=metrics_names[:8])
eval_results['F1'] =  2*eval_results['precision']*eval_results['recall'] / (eval_results['precision']+eval_results['recall'])
eval_results['precision_neg'] = eval_results['TN'] / (eval_results['TN'] + eval_results['FN'])
eval_results['recall_neg'] = eval_results['TN'] / (eval_results['TN']+eval_results['FP'])
eval_results['f1_neg'] = 2 * eval_results['precision_neg'] * eval_results['recall_neg'] / (eval_results['precision_neg'] + eval_results['recall_neg'])

ci = eval_results.apply(lambda x: st.t.interval(0.95, len(x), loc=np.mean(x), scale=st.sem(x)))
ci.index = ['ci_lower', 'ci_upper']
eval_results = pd.concat([eval_results.describe(), ci])
eval_results

Unnamed: 0,loss,accuracy,precision,recall,TP,TN,FP,FN,F1,precision_neg,recall_neg,f1_neg
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.31965,0.85992,0.84024,0.89013,3772.5,3501.6,719.1,465.8,0.864333,0.882668,0.829683,0.855186
std,0.023067,0.010079,0.017418,0.008318,32.104863,85.62087,96.526853,36.303658,0.008552,0.007365,0.022299,0.011938
min,0.3014,0.8365,0.8006,0.8764,3728.0,3315.0,635.0,401.0,0.844726,0.869673,0.779633,0.827405
25%,0.302775,0.857075,0.838325,0.885125,3748.0,3494.25,660.0,437.75,0.860633,0.879664,0.826713,0.853312
50%,0.3139,0.864,0.84585,0.8879,3765.5,3536.0,684.0,476.5,0.867607,0.881293,0.837826,0.859747
75%,0.3249,0.866275,0.8509,0.8964,3800.0,3553.5,733.75,486.5,0.869566,0.885043,0.843417,0.862498
max,0.3758,0.869,0.8553,0.9051,3824.0,3576.0,937.0,526.0,0.872646,0.896915,0.849204,0.865141
ci_lower,0.303397,0.852818,0.827967,0.884269,3749.878934,3441.271589,651.08724,440.220466,0.858307,0.877479,0.81397,0.846774
ci_upper,0.335903,0.867022,0.852513,0.895991,3795.121066,3561.928411,787.11276,491.379534,0.870359,0.887858,0.845395,0.863597
