In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf 
import transformers
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import re
import emoji
import gc

In [6]:
## Constants shared by the scoring cells below.
# Validation CSV; read into `val_data_df` by the next cell.
val_data_path = "./train/validation_data.csv"
# Column names used to pull the two comment columns out of `val_data_df`,
# and reused as column names when the scores are written out.
more_toxic_key = "more_toxic"
less_toxic_key = "less_toxic"
# Default batch size of CivilDataGenerator.
batch_size = 32
max_length = 512 # default tokenizer length for CivilDataGenerator; reassigned per model further down
output_path = "./output/scores/val_scores_%s_%s.csv" # %-template filled with (model name, more info)

In [4]:
# Load the validation CSV; the trailing expression displays (rows, columns).
val_data_df = pd.read_csv(val_data_path)
val_data_df.shape

(30108, 3)

In [6]:
def calculateAcc(preds_less_toxic, preds_more_toxic):
    """Fraction of pairs whose "more toxic" score is strictly the larger one.

    Args:
        preds_less_toxic: array of scores for the less-toxic comments.
        preds_more_toxic: array of scores for the more-toxic comments; its
            first dimension is used as the number of pairs.

    Returns:
        Count of positions where ``preds_more_toxic > preds_less_toxic``
        divided by ``preds_more_toxic.shape[0]``. Ties count as misses
        because the comparison is strict.
    """
    ranked_correctly = preds_more_toxic > preds_less_toxic
    n_pairs = preds_more_toxic.shape[0]
    return np.sum(ranked_correctly) / n_pairs

In [6]:
class CivilDataGenerator(tf.keras.utils.Sequence): # could optimize more like BucketIterator for padding
    """Keras ``Sequence`` that tokenizes raw texts one batch at a time.

    Args:
        texts: numpy array of strings (it is indexed with an index array).
        scores: array of targets aligned with ``texts``; may be ``None`` when
            ``include_targets`` is False.
        tokenizer: object exposing ``batch_encode_plus`` (a BERT tokenizer in
            this notebook).
        batch_size: number of texts per batch.
        shuffle: whether to permute the sample order at construction and at
            the end of every epoch.
        include_targets: when True, ``__getitem__`` also returns the scores.
        max_length: length every sample is padded/truncated to.
    """

    def __init__(self, texts, scores, tokenizer, batch_size=batch_size, shuffle=True, include_targets=True, max_length=max_length): # texts -> numpy array
        self.texts = texts
        self.scores = scores
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.include_targets = include_targets
        self.max_length = max_length
        # Tokenizer used to encode each batch of texts.
        self.tokenizer = tokenizer
        self.indexes = np.arange(len(self.texts))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (ceiling division).

        The previous conditional expression bound as
        ``(n // bs + 1) if n % bs != 0 else 0`` and therefore reported zero
        batches whenever ``n`` was an exact multiple of ``batch_size``.
        """
        n_texts = len(self.texts)
        return (n_texts + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Shuffle the index array in place when ``shuffle`` is enabled.

        A new ``RandomState(42)`` is created on every call, so the same
        permutation is applied to the current order each time.
        """
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

    def __getitem__(self, idx): # idx -> index batch
        """Tokenize and return batch number ``idx``.

        Returns:
            ``[input_ids, attention_masks]`` (int32 arrays), plus a float32
            array of scores as a second element when ``include_targets`` is
            True.
        """
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        texts = self.texts[indexes]

        # Each text is encoded on its own (single-sequence input), padded or
        # truncated to `max_length`.
        encoded = self.tokenizer.batch_encode_plus(
            texts.tolist(),
            add_special_tokens=True,
            max_length=self.max_length, # caller-provided length limit
            return_attention_mask=True, # needed because samples are padded
            return_token_type_ids=False, # only one sequence per sample
            padding='max_length',
            return_tensors="tf",
            truncation=True,
        )

        # Convert the encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")

        # Targets are attached only for training/validation use.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="float32")
            return [input_ids, attention_masks], scores
        else:
            return [input_ids, attention_masks]
        

## BERT + 2 BiLSTM 

In [7]:
## Constants for the BERT + 2 BiLSTM run.
load_model_path = "./output/bert-2-bilstm-fine-tuning/bert/final_model" # after 2nd epoch
# Passed explicitly to the generators below; reassigning it here does not
# change the default already bound in CivilDataGenerator's signature.
max_length = 350
bert_path =  "bert-base-uncased"
# `model_name` and `more_info` fill the two slots of `output_path`.
model_name = "bert-2-BiLstm"
more_info = "2-epoch"

In [8]:
# Tokenizer handed to both CivilDataGenerator instances below.
tokenizer = transformers.BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

In [9]:
# One inference generator per comment column. shuffle=False keeps the batches
# in row order; include_targets=False makes each batch inputs-only.
more_toxic_data = CivilDataGenerator(
    val_data_df[more_toxic_key].values,
    None, # no target while inferring
    tokenizer,
    batch_size=batch_size,
    shuffle=False,
    include_targets=False, # added for inference
    max_length=max_length
)

less_toxic_data = CivilDataGenerator(
    val_data_df[less_toxic_key].values,
    None, # no target while inferring
    tokenizer,
    batch_size=batch_size,
    shuffle=False,
    include_targets=False, # added for inference
    max_length=max_length
)

In [10]:
# Load the saved Keras model from `load_model_path`.
model = tf.keras.models.load_model(load_model_path)

2022-02-03 15:30:15.032914: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 15:30:15.102771: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 15:30:15.103136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 15:30:15.104573: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [11]:
# Score both generators; column 0 = less toxic, column 1 = more toxic.
predict_on = [less_toxic_data, more_toxic_data]
preds = np.zeros((val_data_df.shape[0], len(predict_on))) # (examples, generators)
with tf.device('/device:GPU:0'):
    for i, data_iter in enumerate(predict_on):
        # squeeze(1) drops the size-1 second axis of the prediction array.
        # NOTE(review): a negative `workers` value is not described in the
        # Keras `Model.predict` docs — confirm whether multiprocessing is
        # actually engaged with workers=-1.
        preds[:, i] = model.predict(
            data_iter,
            use_multiprocessing=True, # can only be used when x, y are generators
            workers=-1,
            verbose=1
        ).squeeze(1)

preds.shape

2022-02-03 15:30:33.671083: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-02-03 15:30:36.299397: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-03 15:30:37.690223: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201




(30108, 2)

In [13]:
# Pairwise accuracy: share of rows where column 1 (more toxic) > column 0 (less toxic).
print("BERT + 2 BiLSTM ", calculateAcc(preds[:, 0], preds[:, 1])) # BERT + 2 BiLSTM  0.6878902617244587 # BERT + 2 BiLSTM + fine tune  0.6896505912049954

BERT + 2 BiLSTM  0.6896505912049954


In [27]:
# Write both score columns to the CSV named by `output_path` % (model_name, more_info).
pd.DataFrame({less_toxic_key: preds[:, 0], more_toxic_key: preds[:, 1] }).to_csv(output_path%(model_name, more_info), index=False)

In [None]:
# Drop the references to the model and its predictions, then run the garbage
# collector before the next model is loaded.
del model, preds
gc.collect()

## BERT + 1 BiLSTM

In [28]:
## Constants for the BERT + 1 BiLSTM run.
load_model_path = "./output/bert-1-bilstm/bert/final_model" # after 2nd epoch
max_length = 350
bert_path =  "bert-base-uncased"
# `model_name` and `more_info` fill the two slots of `output_path`.
model_name = "bert-1-BiLstm"
more_info = "2-epoch"

In [29]:
# Load the saved Keras model from the `load_model_path` set in the previous cell.
model = tf.keras.models.load_model(load_model_path)

In [30]:
# Reuses the `less_toxic_data` / `more_toxic_data` generators built for the previous model.
# Column 0 = less toxic, column 1 = more toxic.
predict_on = [less_toxic_data, more_toxic_data]
preds = np.zeros((val_data_df.shape[0], len(predict_on))) # (examples, generators)
with tf.device('/device:GPU:0'):
    for i, data_iter in enumerate(predict_on):
        # squeeze(1) drops the size-1 second axis of the prediction array.
        # NOTE(review): a negative `workers` value is not described in the
        # Keras `Model.predict` docs — confirm whether multiprocessing is
        # actually engaged with workers=-1.
        preds[:, i] = model.predict(
            data_iter,
            use_multiprocessing=True, # can only be used when x, y are generators
            workers=-1,
            verbose=1
        ).squeeze(1)

preds.shape



(30108, 2)

In [31]:
# Pairwise accuracy: share of rows where column 1 (more toxic) > column 0 (less toxic).
print("BERT + 1 BiLSTM ", calculateAcc(preds[:, 0], preds[:, 1])) # BERT + 1 BiLSTM  0.6840042513617643

BERT + 1 BiLSTM  0.6840042513617643


In [32]:
# Write both score columns to the CSV named by `output_path` % (model_name, more_info).
pd.DataFrame({less_toxic_key: preds[:, 0], more_toxic_key: preds[:, 1] }).to_csv(output_path%(model_name, more_info), index=False)

In [None]:
# Drop the references to the model and its predictions, then run the garbage collector.
del model, preds
gc.collect()

## UnitaryAI Detoxify

In [1]:
from unitaryAI.detoxify import Detoxify
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
## Constants for the Detoxify run.
# Per-model arguments forwarded to the Detoxify constructor: the key is the
# model name, the values are its checkpoint file and Hugging Face config path.
model_info_dict = {
    'original' : {
        "checkpoint":"./unitaryAI/toxic_original-c1212f89.ckpt", "huggingface_config_path": "./unitaryAI/bert-base-uncased"
    },
    'unbiased' : {
        "checkpoint":"./unitaryAI/toxic_debiased-c7548aa0.ckpt", "huggingface_config_path":"./unitaryAI/roberta-base"
    },
    "multilingual" : {
        "checkpoint":"./unitaryAI/multilingual_debiased-0b549669.ckpt","huggingface_config_path": "./unitaryAI/xlm-roberta-base"
    }
}

test_data_path = "./train/comments_to_score.csv"
val_data_path = "./train/validation_data.csv"
comment_key = "text"
comment_id_key = "comment_id"
batch_size = 32

# Dict view over the model names, iterated in insertion order.
model_names = model_info_dict.keys()
# One "more info" tag per model, used when building the output file names.
more_infos = ["pretrained_only"]*len(model_info_dict)
dense_dim = 768 # size of the last axis of the captured dense-layer outputs

In [3]:
class CustomDataset(Dataset):
    """Torch ``Dataset`` over a sequence of comments with optional targets.

    Args:
        comments: indexable, sized collection of comments (numpy array, list, ...).
        targets: collection aligned with ``comments``; may be ``None`` when
            ``include_target`` is false.
        include_target: when truthy, items are ``(comment, target)`` tuples;
            otherwise only the comment is returned.
    """

    def __init__(self, comments, targets, include_target=True):
        self.comments = comments
        self.targets = targets
        self.include_target = include_target

    def __len__(self):
        # len() instead of .shape[0] so plain lists work as well as arrays.
        return len(self.comments)

    def __getitem__(self, idx):
        comment = self.comments[idx]
        if self.include_target:
            return comment, self.targets[idx]
        return comment


In [7]:
# Wrap each comment column in a target-less dataset.
less_toxic_dataset = CustomDataset(val_data_df[less_toxic_key].values, None, include_target=False)
more_toxic_dataset = CustomDataset(val_data_df[more_toxic_key].values, None, include_target=False)

# shuffle=False keeps the batches in row order.
less_toxic_dataloader = DataLoader(less_toxic_dataset, batch_size=batch_size, shuffle=False)
more_toxic_dataloader = DataLoader(more_toxic_dataset, batch_size=batch_size, shuffle=False)

In [8]:
# Instantiate one Detoxify model per entry of `model_info_dict`, in dict order;
# getScore looks models up by that same position.
models = []
for i, model_name in enumerate(model_info_dict):
    model = Detoxify(model_name, checkpoint=model_info_dict[model_name]["checkpoint"], huggingface_config_path=model_info_dict[model_name]["huggingface_config_path"])
    models.append(model)

BertForSequenceClassification
RobertaForSequenceClassification
XLMRobertaForSequenceClassification


In [9]:
# Module-level store: hook name -> most recently captured output (numpy array).
activation = {}


def get_activation(name):
    """Build a forward hook that records the hooked module's output.

    Args:
        name: key under which the output is stored in ``activation``.

    Returns:
        A callable with the ``(module, inputs, output)`` forward-hook
        signature. On every call it detaches ``output``, moves it to the CPU,
        converts it to a numpy array and overwrites ``activation[name]``.
    """
    def _capture(module, inputs, output):
        captured = output.detach().cpu()
        activation[name] = captured.numpy()

    return _capture

In [16]:
def getScore(examples_lenght, model_info_dict, test_dataloader, models):
    """Score every batch of ``test_dataloader`` with each model.

    Args:
        examples_lenght: total number of examples the dataloader yields.
        model_info_dict: mapping keyed by model name; only its key order and
            length are used here.
        test_dataloader: iterable yielding batches of texts.
        models: models in the same order as ``model_info_dict``.

    Returns:
        ``(preds, dense_output)`` where ``preds`` has shape
        ``(examples, models)`` and holds, per example, the sum of every value
        in the dict returned by ``model.predict``; ``dense_output`` has shape
        ``(examples, models, dense_dim)`` and holds the dense-layer output
        captured by the forward hook.
    """
    n_models = len(model_info_dict)
    preds = np.zeros((examples_lenght, n_models)) # (examples, model)
    dense_output = np.zeros((examples_lenght, n_models, dense_dim)) # (examples, model, dense_dim)
    for i, model_name in enumerate(model_info_dict):
        model = models[i]
        # The "original" model is hooked on its pooler's dense layer, the
        # others on their classifier's dense layer.
        if model_name == "original":
            dense_layer = model.model.bert.pooler.dense
        else:
            dense_layer = model.model.classifier.dense

        # Keep the handle so the hook can be removed afterwards; the models are
        # reused across calls and hooks would otherwise accumulate on them.
        hook_handle = dense_layer.register_forward_hook(get_activation("dense"))
        try:
            lastidx = 0
            for texts in tqdm(test_dataloader):
                preds_dict = model.predict(texts)
                batch_rows = slice(lastidx, lastidx + len(texts))
                dense_output[batch_rows, i, :] = activation["dense"]

                # Accumulate every entry returned for this batch.
                for key in preds_dict:
                    preds[batch_rows, i] += preds_dict[key]

                lastidx += len(texts)
        finally:
            hook_handle.remove()

    return preds, dense_output

In [17]:
# Run getScore once per dataloader; index 0 = less toxic, index 1 = more toxic.
predict_on = [less_toxic_dataloader, more_toxic_dataloader]
preds = np.zeros((len(predict_on), len(more_toxic_dataset), len(model_info_dict))) # (dataloaders, examples, models)
dense_output = np.zeros((len(predict_on), len(more_toxic_dataset), len(model_info_dict), dense_dim)) # (dataloaders, examples, models, dense_dim)
for i, m_data_loader in enumerate(predict_on):
    # len(more_toxic_dataset) is used as the example count for both loaders.
    preds[i, :, :], dense_output[i, :, :, :]   = getScore(len(more_toxic_dataset), model_info_dict, m_data_loader, models)

preds.shape, dense_output.shape

100%|██████████| 941/941 [04:14<00:00,  3.70it/s]
100%|██████████| 941/941 [03:58<00:00,  3.95it/s]
100%|██████████| 941/941 [04:00<00:00,  3.92it/s]
100%|██████████| 941/941 [04:13<00:00,  3.72it/s]
100%|██████████| 941/941 [03:56<00:00,  3.97it/s]
100%|██████████| 941/941 [04:00<00:00,  3.92it/s]


((2, 30108, 3), (2, 30108, 3, 768))

In [18]:
# Persist the captured dense-layer outputs as a .npy file.
with open('./output/unitaryAI_validation_data_dense_output.npy', 'wb') as f:
    np.save(f, dense_output) # (dataloaders, examples, models, dense_dim) # 1 gb

In [15]:
# Pairwise accuracy per model: preds[0] = less toxic scores, preds[1] = more toxic scores.
for i, model_name in enumerate(model_names):
    print(model_name, calculateAcc(preds[0, :, i], preds[1, :, i]))

# Recorded results:
# original 0.6933373189849874
# unbiased 0.6997143616314601
# multilingual 0.6979208183871396

original 0.6933373189849874
unbiased 0.6997143616314601
multilingual 0.6979208183871396


In [16]:
# Write one score CSV per model, named by `output_path` % (model_name, more_infos[i]).
for i, model_name in enumerate(model_names):
    pd.DataFrame({less_toxic_key: preds[0, :, i], more_toxic_key: preds[1, :, i] }).to_csv(output_path%(model_name, more_infos[i]), index=False)

## BiLSTM | ruddit only

In [1]:
## Constants for the BiLSTM (ruddit only) scores.
# `model_name` and `more_info` fill the two slots of `output_path`.
model_name = "2-bilstm"
more_info = "ruddit-only"
# CSV files of scores that are read in the next cell.
less_toxic_path = "./train/less_toxic_bilstm.csv"
more_toxic_path = "./train/more_toxic_bilstm.csv"

In [3]:
# Load the two score files from the paths defined above.
less_toxic_preds = pd.read_csv(less_toxic_path)
more_toxic_preds = pd.read_csv(more_toxic_path)

In [4]:
# Sort both frames by comment_id so that their rows line up position by position;
# the cells below compare the "score" columns positionally via `.values`.
less_toxic_preds = less_toxic_preds.sort_values(by=["comment_id"])
more_toxic_preds = more_toxic_preds.sort_values(by=["comment_id"])

In [17]:
# Display both sorted frames to eyeball that the comment_id columns line up.
less_toxic_preds, more_toxic_preds

(       comment_id     score
 4941            0  0.534476
 17967           1  0.496860
 14730           2  0.504504
 17176           3  0.518141
 3278            4  0.522528
 ...           ...       ...
 19422       30103  0.529898
 4279        30104  0.536020
 20400       30105  0.518016
 26648       30106  0.503946
 3179        30107  0.501173
 
 [30108 rows x 2 columns],
        comment_id     score
 10187           0  0.483939
 19806           1  0.481741
 24447           2  0.533612
 1532            3  0.521289
 25733           4  0.529794
 ...           ...       ...
 20721       30103  0.507247
 26107       30104  0.519268
 7781        30105  0.585937
 5032        30106  0.593413
 9619        30107  0.567748
 
 [30108 rows x 2 columns])

In [13]:
# Pairwise accuracy on the positionally aligned score arrays.
print("2 bilstm ruddit only ", calculateAcc(less_toxic_preds["score"].values, more_toxic_preds["score"].values)) # 0.6350139497807892

2 bilstm ruddit only  0.6350139497807892


In [19]:
## Pass `.values` when combining columns from different frames: if both columns
## are Series, pandas aligns them by index instead of by position (not wanted here).
pd.DataFrame({less_toxic_key: less_toxic_preds["score"].values, more_toxic_key: more_toxic_preds["score"].values }).to_csv(output_path%(model_name, more_info), index=False)