In [1]:
import numpy as np
import pandas as pd
import ujson as json
import s3fs
import boto3
import io
import tarfile
import time
import gc

In [2]:
# Shared S3 filesystem handle used by the data-loading helpers below.
fs = s3fs.S3FileSystem()

# Fixed token-sequence length fed to the model (inputs are padded/truncated to it).
MAX_LEN = 128

# Countries whose articles are labelled as the positive ("Peace") class.
PEACE_COUNTRY = {
    'Australia', 'New Zealand',
    'Belgium', 'Sweden', 'Denmark',
    'Norway', 'Finland', 'Czech Republic',
    'Netherlands', 'Austria',
}

## Load the model

In [3]:
! pip install transformers
! pip install tensorflow
import tensorflow as tf
from transformers import BertConfig, AutoTokenizer, TFAutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained('./saved_roberta')
bert_model = TFAutoModelForSequenceClassification.from_pretrained('./saved_roberta')

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)


[?25l[K     |                                | 10 kB 47.2 MB/s eta 0:00:01[K     |▏                               | 20 kB 40.9 MB/s eta 0:00:01[K     |▎                               | 30 kB 46.7 MB/s eta 0:00:01[K     |▍                               | 40 kB 51.1 MB/s eta 0:00:01[K     |▌                               | 51 kB 35.1 MB/s eta 0:00:01[K     |▋                               | 61 kB 37.9 MB/s eta 0:00:01[K     |▊                               | 71 kB 31.7 MB/s eta 0:00:01[K     |▉                               | 81 kB 34.5 MB/s eta 0:00:01[K     |█                               | 92 kB 36.4 MB/s eta 0:00:01[K     |█                               | 102 kB 36.4 MB/s eta 0:00:01[K     |█▏                              | 112 kB 36.4 MB/s eta 0:00:01[K     |█▎                              | 122 kB 36.4 MB/s eta 0:00:01[K     |█▍                              | 133 kB 36.4 MB/s eta 0:00:01[K     |█▌                              | 143 kB 36.4 MB/s eta 0:

[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 110.4 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 13.3 MB/s  eta 0:00:01
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 102.3 MB/s eta 0:00:01
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.1.2 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.12.3
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m
Collecting numpy<1.19.0,>=1.16.0
  Downloading numpy-1.18.5-cp36-cp36m-manylinux1_x86_64

Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
Successfully installed numpy-1.18.5
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./saved_roberta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [5]:
# Wrap the loaded transformer in a Keras functional model that takes token ids and an
# attention mask (both of length MAX_LEN) and outputs sigmoid-activated scores.
input_ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
input_masks_ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
# The mask is passed as the second positional argument; element [0] of the model output
# is taken -- presumably the classification logits (TODO confirm against the
# transformers version in use).
X = bert_model(input_ids_in, input_masks_ids_in)[0]
# Sigmoid squashes the raw output into (0, 1); downstream cells threshold it at 0.5.
output_layer = tf.keras.layers.Activation(activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_ids_in], outputs = output_layer)

## Build input pipeline

In [12]:
def regular_encode(texts, tokenizer, maxlen=MAX_LEN):
    """
    Tokenize ``texts`` into fixed-length model inputs.

    Args:
        texts: Text to encode; passed straight to ``tokenizer.encode_plus``.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        maxlen: Length to which the encoding is padded and truncated.

    Returns:
        A two-element list ``[input_ids, attention_mask]`` of NumPy arrays.
    """
    # Pad to exactly ``maxlen`` tokens and truncate anything longer; token-type
    # ids are not requested.
    encode_options = dict(
        return_attention_mask=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )
    encoded = tokenizer.encode_plus(texts, **encode_options)

    token_ids = np.array(encoded['input_ids'])
    attention_mask = np.array(encoded['attention_mask'])
    return [token_ids, attention_mask]

def read_csv(file_path, ds_type, minor=False):
    """
    Stream (features, label) examples from a JSON-lines file on S3.

    Despite the name, each line of the file is parsed as a JSON record that must
    contain the keys ``'country'`` and ``'content_cleaned'``.

    Args:
        file_path: Path of the file, opened through the module-level ``fs``.
        ds_type: ``'train'`` stops after the first 8e4 lines, ``'test'`` after the
            first 1e4 lines; any other value reads the whole file.
        minor: When true, records whose country is India or Australia are skipped.
            Skipped lines still count towards the line limit.

    Yields:
        ``({'input_ids': ids, 'attention_mask': msk}, label)`` where ``label`` is
        1 if the record's country is in ``PEACE_COUNTRY`` and 0 otherwise.
    """
    if ds_type == 'train':
        line_limit = 8e4
    elif ds_type == 'test':
        line_limit = 1e4
    else:
        line_limit = None

    # The context manager closes the S3 handle when the generator finishes or is
    # closed early; the original left the handle open.
    with fs.open(file_path) as handle:
        for idx, line in enumerate(handle):
            if line_limit is not None and idx >= line_limit:
                return
            record = json.loads(line)
            if minor and record['country'] in ('India', 'Australia'):
                continue
            ids, msk = regular_encode(record['content_cleaned'], tokenizer)  # tokenize content_cleaned
            yield {'input_ids': ids, 'attention_mask': msk}, int(record['country'] in PEACE_COUNTRY)
        


# Each dataset lazily streams examples from S3 through read_csv and groups them
# into batches of one example.
# NOTE(review): a batch size of 1 makes inference slow -- consider a larger batch.

# Training set
train_ds = tf.data.Dataset.from_generator(
    lambda: read_csv('s3://compressed-data-sample/processed_train.json', ds_type='train'),
    output_types=({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int16),
).batch(1)

# Test set
test_ds = tf.data.Dataset.from_generator(
    lambda: read_csv('s3://compressed-data-sample/processed_test.json', ds_type='test'),
    output_types=({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int16),
).batch(1)

# Test set with India and Australia filtered out (minor=True)
test_minor_ds = tf.data.Dataset.from_generator(
    lambda: read_csv('s3://compressed-data-sample/processed_test.json', ds_type='test', minor=True),
    output_types=({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int16),
).batch(1)

In [13]:
# Collect the ground-truth labels by iterating each dataset once and joining the
# per-batch label arrays; order is train, test, reduced test.
y_train, y_test, y_test_minor = (
    np.concatenate([labels for _, labels in dataset], axis=0)
    for dataset in (train_ds, test_ds, test_minor_ds)
)

## Get predictions

In [14]:
# Run the model over each dataset, flatten the scores and threshold them at 0.5
# to obtain 0/1 class predictions; the whole pass is timed.
start_time = time.time()
pred_train, pred_test, pred_test_minor = (
    (model.predict(dataset, use_multiprocessing=True).reshape(-1) > 0.5).astype(int)
    for dataset in (train_ds, test_ds, test_minor_ds)
)

end_time = time.time()
gc.collect()
print(f'Prediction Time: {end_time-start_time} seconds')

Prediction Time: 1924.530820608139 seconds


## View Evaluation Details on Train

In [15]:
# Confusion-matrix counts for the training split (1 = Peace, 0 = NonPeace).
tn = ((pred_train == 0) & (y_train == 0)).sum()
fn = ((pred_train == 0) & (y_train == 1)).sum()
tp = ((pred_train == 1) & (y_train == 1)).sum()
fp = ((pred_train == 1) & (y_train == 0)).sum()
# Overall accuracy as a percentage.
p = (tp + tn) / (tp + tn + fp + fn) * 100

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Train Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
# Percentages are normalised per predicted column: each column sums to 100%.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "NonPeace",
    tn / (tn + fn) * 100, tn,
    fp / (tp + fp) * 100, fp,
))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Peace",
    fn / (tn + fn) * 100, fn,
    tp / (tp + fp) * 100, tp,
))


Overall Train Classification Rate: 98.4%

Predicted      NonPeace          Peace
Observed
NonPeace       98% (39335)     1% (580)
Peace           2% (705)     99% (39380) 



## View Evaluation Details on Test

In [19]:
# Confusion-matrix counts for the test split (1 = Peace, 0 = NonPeace).
tn = ((pred_test == 0) & (y_test == 0)).sum()
fn = ((pred_test == 0) & (y_test == 1)).sum()
tp = ((pred_test == 1) & (y_test == 1)).sum()
fp = ((pred_test == 1) & (y_test == 0)).sum()
# Overall accuracy as a percentage.
p = (tp + tn) / (tp + tn + fp + fn) * 100

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Test Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
# Percentages are normalised per predicted column: each column sums to 100%.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "NonPeace",
    tn / (tn + fn) * 100, tn,
    fp / (tp + fp) * 100, fp,
))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Peace",
    fn / (tn + fn) * 100, fn,
    tp / (tp + fp) * 100, tp,
))


Overall Test Classification Rate: 98.2%

Predicted      NonPeace          Peace
Observed
NonPeace       98% (5001)     2% (77)
Peace           2% (108)     98% (4814) 



## View Evaluation Details on Test (India & Australia Removed)

In [20]:
# Confusion-matrix counts for the reduced test split, i.e. the test data with
# India and Australia filtered out (1 = Peace, 0 = NonPeace).
tn = ((pred_test_minor == 0) & (y_test_minor == 0)).sum()
fn = ((pred_test_minor == 0) & (y_test_minor == 1)).sum()
tp = ((pred_test_minor == 1) & (y_test_minor == 1)).sum()
fp = ((pred_test_minor == 1) & (y_test_minor == 0)).sum()
# Overall accuracy as a percentage.
p = (tp + tn) / (tp + tn + fp + fn) * 100

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Test (Minority Group) Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NonPeace", "Peace"))
print("Observed")
# Percentages are normalised per predicted column: each column sums to 100%.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "NonPeace",
    tn / (tn + fn) * 100, tn,
    fp / (tp + fp) * 100, fp,
))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Peace",
    fn / (tn + fn) * 100, fn,
    tp / (tp + fp) * 100, tp,
))


Overall Test (Minority Group) Classification Rate: 95.7%

Predicted      NonPeace          Peace
Observed
NonPeace       90% (419)     2% (21)
Peace           10% (49)     98% (1144) 



In [21]:
# Precision / recall / F1 for the reduced test split (India & Australia removed).
# The counts are recomputed here so the result does not depend on which
# evaluation cell happened to run last and left its tp/fp/fn in the kernel.
tp = np.logical_and(pred_test_minor == 1, y_test_minor == 1).sum()
fp = np.logical_and(pred_test_minor == 1, y_test_minor == 0).sum()
fn = np.logical_and(pred_test_minor == 0, y_test_minor == 1).sum()

precision = tp /(tp+fp)  # share of predicted-Peace examples that are truly Peace
recall = tp /(tp+fn)     # share of truly-Peace examples that were predicted Peace
print(f'Precision: {100*precision:.4f} %')
print(f'Recall: {100*recall:.4f} %')
print(f'F1: {100*2*precision*recall/(precision+recall):.4f} %' )

Precision: 98.1974 %
Recall: 95.8927 %
F1: 97.0314 %
