In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re
import os, sys
import datetime, time
import gc, operator 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pkg_resources

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from tqdm.notebook import tqdm

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported in this notebook.
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from config import SEED, PARTIAL_TRAIN, TEST_SIZE, NUM_LABELS
from config import MAX_SEQUENCE_LENGTH, NUM_EPOCH, LEARNING_RATE, BATCH_SIZE
from config import ACCUMULATION_STEPS, INPUT_DIR, WORK_DIR, TOXICITY_COLUMN, DATA_DIR
from config import BERT_MODEL_NAME, FINE_TUNED_MODEL_PATH

from utils import set_seed, convert_lines_onfly, preprocess
from utils import calculate_overall_auc, compute_bias_metrics_for_model, get_final_metric

from transformers import BertTokenizer, BertForSequenceClassification, AdamW


# Name of the label column; this assignment overrides the value imported from config.
TOXICITY_COLUMN = "target"

# Device that input batches, labels and weights are moved to before the forward pass.
device = torch.device('cuda')

In [12]:
# Seed via the project helper from utils before any sampling or shuffling happens.
# NOTE(review): presumably seeds python/numpy/torch RNGs - confirm in utils.set_seed.
set_seed(SEED)

In [13]:
%%time

# Load the training CSV and keep a reproducible random fraction of it.
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_df = train_df.sample(frac=PARTIAL_TRAIN, random_state=SEED)
print('loaded %d records' % len(train_df))

# Hold out the last TEST_SIZE sampled rows for validation, then trim the rest
# to a whole number of batches (the length-bucketing step reshapes the index
# array to (num_batches, BATCH_SIZE), which requires an exact multiple).
test_df = train_df.tail(TEST_SIZE)
train_df = train_df.head(((train_df.shape[0]-TEST_SIZE)//BATCH_SIZE)*BATCH_SIZE)

# Make sure all comment_text values are strings.
# fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
# string 'nan', after which fillna has nothing left to replace.
sentences = preprocess(train_df['comment_text'].fillna("DUMMY_VALUE").astype(str)).values
# Missing label / identity annotations are treated as 0.
train_df = train_df.fillna(0)

# List all identities
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# Model outputs: the main target, five auxiliary toxicity labels, then the identities.
y_columns = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'] + identity_columns
identity_sp = [ 'homosexual_gay_or_lesbian','muslim', 'black', 'white']

# The raw text now lives in `sentences`; drop it from the frame.
train_df = train_df.drop(['comment_text'],axis=1)

loaded 902437 records
CPU times: user 26.4 s, sys: 416 ms, total: 26.8 s
Wall time: 26.8 s


In [14]:
# Override the NUM_LABELS imported from config: the classifier head gets one
# output per entry of y_columns.
NUM_LABELS = len(y_columns)

In [15]:
def calculate_weights(train):
    """Compute one loss weight per row of ``train``.

    Two soft indicators are built with a sigmoid of slope 10 centred at 0.4:
    one from the row-wise maximum over the module-level ``identity_columns``
    (missing values counted as 0) and one from the ``'target'`` column.
    The weight is
    ``(1 + identity + identity * (1 - target) + target * (1 - identity)) / 4``.

    Args:
        train: DataFrame containing ``'target'`` and every column listed in
            ``identity_columns``.

    Returns:
        A 1-D ``torch.float32`` tensor with ``train.shape[0]`` entries.
    """
    identity_max = train[identity_columns].fillna(0).max(axis=1).values
    identity_score = torch.sigmoid(10 * (torch.tensor(identity_max) - 0.4))
    target_score = torch.sigmoid(10 * (torch.tensor(train['target'].values) - 0.4))

    # Accumulate in float64 and only cast down at the very end.
    baseline = torch.ones(train.shape[0], dtype=torch.float64)
    combined = (
        baseline
        + identity_score
        + identity_score * (1 - target_score)
        + target_score * (1 - identity_score)
    )
    return (combined / 4).to(dtype=torch.float32)

In [16]:
# Bucket the sentences by word count so that every batch holds texts of
# similar length, then shuffle the order of the batches.
word_counts = np.array([len(text.split()) for text in sentences])
batch_count = train_df.shape[0] // BATCH_SIZE
sort_idx = np.argsort(word_counts).reshape(batch_count, BATCH_SIZE)
# Shuffles along the first axis only: whole batches move, their contents stay together.
np.random.shuffle(sort_idx)
# Flatten back to one index per training row.
sort_idx = sort_idx.reshape(-1)

In [17]:
%%time
# Apply the batch-bucketed ordering to the label matrix and to the texts alike,
# so row i of `y` still belongs to sentence i of `X`.
# Not idempotent: re-running this cell permutes `sentences` a second time.
y = train_df[y_columns].values[sort_idx]
sentences = sentences[sort_idx]
X = sentences

CPU times: user 129 ms, sys: 12.1 ms, total: 141 ms
Wall time: 140 ms


In [18]:
# Per-sample weights in the same order as X / y, copied into one column per output.
label_count = len(y_columns)
sample_weights = calculate_weights(train_df)[sort_idx]
weights_tensor = sample_weights.repeat(label_count, 1).transpose(0, 1)
# Column 0 is 'target' (first entry of y_columns): scale it by label_count / 4.
weights_tensor[:,0] = weights_tensor[:,0] * (label_count)/4
# Columns 6 onward are the identity columns: keep a quarter of their weight.
weights_tensor[:,6:] = weights_tensor[:,6:] * 0.25

In [19]:
# Push every label away from 0.5 with a signed square root:
# 0, 0.5 and 1 are fixed points, everything else moves toward the nearer end.
sharpened_labels = (np.abs(2.0*y-1.0)**0.5*np.sign(y-0.5)+1)/2
# The dataset carries row indices rather than token ids; the training loop
# looks the text up in X and converts it batch by batch.
train = torch.utils.data.TensorDataset(
    torch.arange(len(X)),
    torch.tensor(sharpened_labels, dtype=torch.float),
    weights_tensor,
)

In [20]:
torch.cuda.empty_cache()

# Pretrained tokenizer and encoder; the classification head is sized to NUM_LABELS outputs.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=NUM_LABELS)

model.zero_grad()
_ = model.cuda()

# Split the parameters into two optimizer groups: names containing any of the
# `no_decay` substrings get weight_decay 0.0, all others get 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in param_optimizer:
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

# Validation statistics are only reported in the progress bar; they stay at 0
# because no validation pass runs during training.
avg_val_loss = 0.
avg_val_accuracy = 0.
# Number of optimizer updates over the whole run (not consumed in this cell).
num_train_optimization_steps = int(NUM_EPOCH*len(train)/BATCH_SIZE/ACCUMULATION_STEPS)

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fine-tune the model with gradient accumulation.
_ = model.train()
tq = tqdm(range(NUM_EPOCH))
for epoch in tq:

    torch.cuda.empty_cache()
    # shuffle=False on purpose: the dataset is already arranged in
    # length-bucketed batches whose order was shuffled beforehand.
    train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=False)
    steps_per_epoch = len(train_loader)

    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None            # exponentially smoothed loss shown in the progress bar
    optimizer.zero_grad()
    tk0 = tqdm(train_loader,leave = False)
    for i , (ind_batch, y_batch, w_batch) in enumerate(tk0):
        # The dataset yields row indices; fetch the texts and convert them now.
        # NOTE(review): convert_lines_onfly presumably tokenizes and pads to
        # at most MAX_SEQUENCE_LENGTH - confirm in utils.
        x_batch=torch.tensor(convert_lines_onfly(X[ind_batch.numpy()], MAX_SEQUENCE_LENGTH, tokenizer))
        y_pred = model(x_batch.to(device)).logits
        # Divide by ACCUMULATION_STEPS so the accumulated gradient is an average.
        loss =  F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device), weight=w_batch.to(device)) / ACCUMULATION_STEPS

        loss.backward()
        # Step after every full accumulation group, and also on the last batch
        # so gradients of a trailing incomplete group are not thrown away.
        if (i+1) % ACCUMULATION_STEPS == 0 or (i+1) == steps_per_epoch:
            optimizer.step()                            # Now we can do an optimizer step
            optimizer.zero_grad()

        # Undo the accumulation scaling so all reported losses share one scale.
        batch_loss = loss.item()*ACCUMULATION_STEPS
        # `is not None` rather than truthiness: a smoothed loss of exactly 0.0
        # must not restart the running average.
        if lossf is not None:
            lossf = 0.98*lossf+0.02*batch_loss
        else:
            lossf = batch_loss
        tk0.set_postfix(loss = lossf)
        avg_loss += batch_loss / steps_per_epoch
        # Accuracy of output 0 only: thresholded prediction vs. thresholded label.
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float) ).item()/steps_per_epoch
    tq.set_postfix(avg_val_loss=avg_val_loss,avg_val_accuracy=avg_val_accuracy,avg_loss=avg_loss,avg_accuracy=avg_accuracy)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/50152 [00:00<?, ?it/s]

In [14]:
# Persist only the weights (the state dict), not the module object itself.
fine_tuned_state = model.state_dict()
torch.save(fine_tuned_state, FINE_TUNED_MODEL_PATH)

In [21]:
# Run validation
# Note: this cell rebinds `sentences`, `sort_idx`, `X` and `model` from the training cells.
# fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
# string 'nan', after which fillna has nothing left to replace.
sentences = preprocess(test_df['comment_text'].fillna("DUMMY_VALUE").astype(str)).values
test_df=test_df.fillna(0)
# Process the longest texts first; org_idx is the inverse permutation that
# restores the original row order afterwards.
sort_idx=np.flip(np.argsort(np.array([len(x.split()) for x in sentences])))
org_idx=np.argsort(sort_idx)
X = sentences[sort_idx]
test_preds = torch.zeros((len(X)))
x_test = torch.arange(len(X))
test = torch.utils.data.TensorDataset(x_test)
# The following 2 lines are not needed when the trained model is still in
# memory, but show how to restore the fine-tuned weights for prediction.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL_NAME, num_labels=NUM_LABELS)
model.load_state_dict(torch.load(FINE_TUNED_MODEL_PATH))
for p in model.parameters():
    p.requires_grad = False

_ = model.cuda()
_ = model.eval()

torch.cuda.empty_cache()
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)
tk0 = tqdm(test_loader,leave=False)
tranct = 0   # counts samples in batches whose width reached MAX_SEQUENCE_LENGTH
with torch.no_grad():
    for i, (ind_batch,) in enumerate(tk0):
        x_batch=torch.tensor(convert_lines_onfly(X[ind_batch.numpy()], MAX_SEQUENCE_LENGTH, tokenizer))
        y_pred = model(x_batch.to(device)).logits
        # Output 0 is the main target; store its probability for this batch.
        test_preds[i * BATCH_SIZE:(i+1) * BATCH_SIZE] = torch.sigmoid(y_pred[:, 0].cpu())
        tranct = tranct + BATCH_SIZE * (x_batch.shape[1] == MAX_SEQUENCE_LENGTH)
        tk0.set_postfix(trunct=tranct,gpu_memory=torch.cuda.memory_allocated() // 1024 ** 2,batch_len=x_batch.shape[1])

MODEL_NAME = 'model1'
# test_preds already holds probabilities: restore the original row order and
# store them as-is. A second sigmoid would squash them into [0.5, 0.73].
test_df[MODEL_NAME]=test_preds[org_idx].numpy()
TOXICITY_COLUMN = 'target'
bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns, MODEL_NAME, 'target')
bias_metrics_df
get_final_metric(bias_metrics_df, calculate_overall_auc(test_df, MODEL_NAME))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

  0%|          | 0/6250 [00:00<?, ?it/s]

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,595,0.852852,0.902126,0.969667
6,black,715,0.882556,0.911155,0.972392
7,white,1269,0.894304,0.913037,0.975115
5,muslim,1085,0.905086,0.938026,0.968383
4,jewish,379,0.911916,0.948676,0.96378
8,psychiatric_or_mental_illness,223,0.936339,0.948994,0.972985
1,female,2727,0.942418,0.970372,0.961011
3,christian,1962,0.957535,0.97428,0.96156
0,male,2104,0.957539,0.964077,0.972543


0.9492865378389379