In [1]:
# Install NVIDIA Apex from the local source tree under /kaggle/input, building
# both the C++ and the CUDA extensions. `apex.amp` is imported in a later cell
# and used for loss scaling during training.
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /kaggle/input/nvidia-apex/apex-880ab92

  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-_ymsh3qe
Created temporary directory: /tmp/pip-req-tracker-68uaoz30
Created requirements tracker '/tmp/pip-req-tracker-68uaoz30'
Created temporary directory: /tmp/pip-install-b8hdcez_
Processing /kaggle/input/nvidia-apex/apex-880ab92
  Created temporary directory: /tmp/pip-req-build-rv247the
  Added file:///kaggle/input/nvidia-apex/apex-880ab92 to build tracker '/tmp/pip-req-tracker-68uaoz30'
    Running setup.py (path:/tmp/pip-req-build-rv247the/setup.py) egg_info for package from file:///kaggle/input/nvidia-apex/apex-880ab92
    Running command python setup.py egg_info
    torch.__version__  =  1.2.0
    running egg_info
    creating pip-egg-info/apex.egg-info
    writing pip-egg-info/apex.egg-info/PKG-INFO
    writing dependency_links to pip-egg-info/apex.egg-info/dependency_links.txt
    writing top-level names to pip-egg-info/apex.egg-info/top_level.txt
    writ

In [2]:
from __future__ import absolute_import, division, print_function

import os, sys, re, gc, pickle, operator, shutil, copy
import time, datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, Sampler

from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from apex import amp
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel
device = torch.device('cuda')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  return f(*args, **kwds)
  return f(*args, **kwds)
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])


In [4]:
# --- Paths -------------------------------------------------------------------
# WORK_DIR receives the converted PyTorch BERT weights and a copy of the config.
WORK_DIR = "./"
DATA_DIR = "/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/"
BERT_MODEL_PATH = '/kaggle/input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
# Base name of the file NetSolver.save_final_state() writes (a prefix is added at construction).
output_model_file = "bert_models.pt"

# Seed used by seed_torch() and by the StratifiedKFold split.
SEED = 2019

# --- Training hyper-parameters -----------------------------------------------
MAX_SEQUENCE_LENGTH = 220       # token budget per comment, including [CLS] and [SEP]
epochs_for_sched = 1            # epochs used to size the optimizer schedule (t_total)
checkpoint_iter = 20000         # passed to NetSolver as print_freq: evaluate every N iterations
lr = 2.5e-5
batch_size = 16                 # training mini-batch size (validation uses its own value)
warmup_proportion = 0.03        # forwarded to BertAdam(warmup=...)
grad_accumulation_steps = 2     # optimizer.step() is called once every N backward passes

# --- Column names ------------------------------------------------------------
# Identity subgroups used for the bias metrics and for the sample weights.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Auxiliary targets trained alongside the main label.
aux_columns = ['severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']

label_column = 'target'
pred_column = 'prediction'      # column check_unbias_auc() fills with model scores
text_column = 'comment_text'

In [5]:
# Seed for randomness in pytorch
def seed_torch(seed=SEED):
    """Seed every random number generator this notebook draws from.

    Besides PyTorch (CPU and CUDA), the data pipeline relies on NumPy's global
    generator: ``BucketSampler`` shuffles with ``sklearn.utils.shuffle`` and
    samples with ``np.random.choice``, both of which use ``np.random`` when no
    explicit random state is given. Seeding only torch therefore left the batch
    order different on every run.

    :param seed: integer seed applied to all generators.
    """
    import random  # local import keeps this block self-contained

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every visible GPU, not just the current one
    torch.backends.cudnn.deterministic = True
    
# Convert target and identity columns to booleans
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with label, identity and auxiliary columns thresholded.

    Each of those columns becomes ``True`` where the original value is
    ``>= 0.5`` and ``False`` otherwise. The input frame is left untouched.

    :param df: frame containing ``label_column``, ``identity_columns`` and ``aux_columns``.
    :return: a new DataFrame with those columns converted to booleans.
    """
    bool_df = df.copy()
    columns_to_threshold = [label_column] + identity_columns + aux_columns
    for column in columns_to_threshold:
        bool_df[column] = np.where(bool_df[column] >= 0.5, True, False)
    return bool_df

In [6]:
# Tokenizing the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(texts, max_seq_length, tokenizer):
    """Tokenize texts into variable-length lists of BERT token ids.

    Every text is tokenized, truncated to its LAST ``max_seq_length - 2`` tokens
    when too long, and wrapped in ``[CLS]`` / ``[SEP]`` before being mapped to
    ids. No padding is applied here; that happens per batch in the collate
    function.

    :param texts: iterable of strings.
    :param max_seq_length: maximum number of ids per text, special tokens included.
    :param tokenizer: object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
    :return: 1-D ``object`` ndarray with one Python list of ids per text.
    """
    # Reserve two positions for [CLS] and [SEP].
    body_len = max(max_seq_length - 2, 0)
    all_tokens = []

    for text in texts:
        tokens = tokenizer.tokenize(text)
        if len(tokens) > body_len:
            # Keep the tail of the comment. Slicing from an explicit start index
            # (instead of tokens[-body_len:]) stays correct when body_len == 0.
            tokens = tokens[len(tokens) - body_len:]
        all_tokens.append(tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]))

    # Build the object array element by element: np.array(list_of_lists) raises
    # on ragged input in recent NumPy and silently yields a 2-D int array when
    # all lists share one length, which breaks list-based padding downstream.
    out = np.empty(len(all_tokens), dtype=object)
    for i, ids in enumerate(all_tokens):
        out[i] = ids
    return out

In [7]:
# Prepare dataset and dataloader

class Toxic_comments(Dataset):
    """Dataset of tokenized comments with optional per-comment target vectors.

    :param tokenized_comments: indexable collection of token-id sequences
        (variable length, unpadded).
    :param targets: optional indexable collection of numeric target rows; when
        given, ``__getitem__`` returns ``(comment, FloatTensor(target))``.
    :param split: one of ``'train'``, ``'valid'``, ``'test'``. ``collate_fn``
        expects ``(comment, target)`` pairs for the first two and bare comments
        for ``'test'``.
    :param maxlen: cap applied to the lengths reported by ``get_lens``.
    """

    def __init__(self, tokenized_comments, targets=None, split=None, maxlen=256):
        assert split in {'train', 'valid', 'test'}
        self.comments = tokenized_comments
        self.targets = targets
        self.split = split
        self.maxlen = maxlen

    def __getitem__(self, index):
        comment = self.comments[index]
        if self.targets is None:
            return comment
        return comment, torch.FloatTensor(self.targets[index])

    def __len__(self):
        return len(self.comments)

    def get_lens(self):
        """Return an int32 array with each comment's length, capped at ``maxlen``."""
        return np.fromiter(
            (min(self.maxlen, len(seq)) for seq in self.comments),
            dtype=np.int32)

    def collate_fn(self, batch):
        """
        Collate function for sequence bucketing: pad to the longest comment
        in the batch (not to a global maximum) using id 0.

        :param batch: an iterable of N items from __getitem__()
        :return: a LongTensor of padded comments, plus the stacked targets for
            the 'train' and 'valid' splits
        """
        has_targets = self.split in ('train', 'valid')
        if has_targets:
            comments, targets = zip(*batch)
        else:
            comments = batch

        longest = max(len(c) for c in comments)
        # Copy each comment into a fresh list of Python ints before padding, so
        # ndarray rows or tuples work too (``ndarray + list`` would broadcast
        # or raise instead of concatenating).
        padded_comments = [
            [int(tok) for tok in c] + [0] * (longest - len(c))
            for c in comments
        ]

        if has_targets:
            return torch.LongTensor(padded_comments), torch.stack(targets)
        return torch.LongTensor(padded_comments)


class BucketSampler(Sampler):
    """Sampler that groups examples of similar length into the same batch.

    Examples are split into buckets of ``bucket_size``. Inside a bucket they are
    sorted by descending length and cut into runs of ``batch_size``; with
    shuffling enabled the runs are shuffled within their bucket. The single
    incomplete run, if any, is always emitted last.

    :param data_source: dataset being sampled (only forwarded to ``Sampler``).
    :param sort_lens: 1-D array with the length of every example.
    :param bucket_size: examples per bucket; defaults to the whole dataset.
        Must be a multiple of ``batch_size`` unless it equals the dataset size.
    :param batch_size: size of the length-homogeneous runs.
    :param shuffle_data: reshuffle on every ``__iter__`` when True. When False
        the order is computed once and ``original_indices`` is populated.
    """

    def __init__(self, data_source, sort_lens, bucket_size=None, batch_size=1024, shuffle_data=True):
        try:
            super().__init__(data_source)
        except TypeError:
            # Fall back to the no-argument form for torch versions whose
            # Sampler.__init__ does not accept ``data_source``.
            super().__init__()
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_lens = sort_lens
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_lens)
        self.weights = None

        # A deterministic order can be prepared once, up front.
        self.index = None if shuffle_data else self.prepare_buckets()

    def set_weights(self, weights):
        """Set per-example sampling probabilities (normalised to sum to 1).

        They are only consulted by ``__iter__`` when shuffling is enabled.
        Passing ``None`` clears them.
        """
        if weights is None:
            self.weights = None
            return
        weights = np.asarray(weights, dtype=np.float64)
        assert weights.ndim == 1 and len(weights) == len(self.sort_lens)
        # np.all is required here: asserting an array comparison directly
        # raises "truth value of an array is ambiguous".
        assert np.all(weights >= 0)
        total = weights.sum()
        assert total > 0
        self.weights = weights / total

    def __iter__(self):
        indices = None
        if self.weights is not None:
            total = len(self.sort_lens)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return, for each example index, its position in the current order."""
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_lens)

    def prepare_buckets(self, indices=None):
        """Compute the sampling order.

        :param indices: optional array of example indices to order (e.g. a
            weighted resample); defaults to all examples, shuffled if enabled.
        :return: array of example indices in emission order.
        """
        # Negate so that an ascending argsort yields descending length.
        lengths = -np.asarray(self.sort_lens)
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lengths)

        if indices is None:
            indices = np.arange(len(lengths), dtype=np.int32)
            if self.shuffle:
                indices = shuffle(indices)
                lengths = lengths[indices]
        else:
            # Caller-supplied indices: align the lengths with them, otherwise
            # the sort below would use lengths of unrelated examples.
            indices = np.asarray(indices)
            lengths = lengths[indices]

        #  bucket iterator: yields (positions, values) for consecutive chunks
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch_idx = None
        for chunk_index, chunk in divide_chunks(lengths, self.bucket_size):
            # sort positions in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk)]

            batch_idxes = []
            for _, batch_idx in divide_chunks(indices_sorted, self.batch_size):
                if len(batch_idx) == self.batch_size:
                    batch_idxes.append(batch_idx.tolist())
                else:
                    # Only one incomplete run may exist (guaranteed by the
                    # bucket/batch size assertion above).
                    assert extra_batch_idx is None
                    assert batch_idx is not None
                    extra_batch_idx = batch_idx.tolist()

            # shuffling batches within buckets
            if self.shuffle:
                batch_idxes = shuffle(batch_idxes)
            for batch_idx in batch_idxes:
                new_indices.extend(batch_idx)

        if extra_batch_idx is not None:
            new_indices.extend(extra_batch_idx)

        if not self.shuffle:
            # Inverse permutation of the full emission order, so that
            # ``outputs[original_indices]`` restores dataset order. Derived from
            # ``new_indices`` rather than the last bucket's sort so it remains
            # correct with several buckets.
            self.original_indices = np.argsort(new_indices).tolist()
        return indices[new_indices]


def prepare_loader(x, y=None, batch_size=1024, split=None):
    """Wrap tokenized comments in a length-bucketed DataLoader.

    :param x: tokenized comments.
    :param y: optional targets aligned with ``x``.
    :param batch_size: loader batch size; training buckets hold 50 batches each.
    :param split: ``'train'``, ``'valid'`` or ``'test'``.
    :return: the DataLoader alone for ``'train'``; otherwise a tuple
        ``(DataLoader, original_indices)`` where ``original_indices`` comes
        from the non-shuffling sampler.
    """
    assert split in {'train', 'valid', 'test'}
    dataset = Toxic_comments(x, y, split, MAX_SEQUENCE_LENGTH)
    seq_lens = dataset.get_lens()
    is_train = split == 'train'

    if is_train:
        sampler = BucketSampler(dataset, seq_lens,
                                bucket_size=batch_size*50, batch_size=batch_size)
    else:
        sampler = BucketSampler(dataset, seq_lens,
                                batch_size=batch_size, shuffle_data=False)

    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                        collate_fn=dataset.collate_fn)
    if is_train:
        return loader
    return loader, sampler.original_indices

In [8]:
# Metrics

# Column names of the per-subgroup metrics built by compute_bias_metrics_for_model()
# and read back by get_final_metric().
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """Return the ROC AUC, or NaN when ``roc_auc_score`` raises ``ValueError``."""
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, pred_column):
    """Computes the AUC restricted to rows where the boolean ``subgroup`` column is True."""
    in_subgroup = df[subgroup]
    members = df[in_subgroup]
    return compute_auc(members[label], members[pred_column])

def compute_bpsn_auc(df, subgroup, label, pred_column):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    :param df: frame with boolean ``subgroup`` and ``label`` columns and a score column.
    :param subgroup: name of the boolean subgroup column.
    :param label: name of the boolean label column.
    :param pred_column: name of the score column.
    :return: AUC over the union of both slices (NaN if it cannot be computed).
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[pred_column])

def compute_bnsp_auc(df, subgroup, label, pred_column):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    :param df: frame with boolean ``subgroup`` and ``label`` columns and a score column.
    :param subgroup: name of the boolean subgroup column.
    :param label: name of the boolean label column.
    :param pred_column: name of the score column.
    :return: AUC over the union of both slices (NaN if it cannot be computed).
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[pred_column])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    :param dataset: frame with boolean subgroup/label columns and the score column.
    :param subgroups: iterable of subgroup column names.
    :param model: name of the column holding this model's scores.
    :param label_col: name of the label column.
    :param include_asegs: accepted but not used by this implementation.
    :return: one row per subgroup (size, subgroup/BPSN/BNSP AUC), sorted by
        ascending subgroup AUC.
    """
    def describe(subgroup):
        # Keys are inserted in the same order as the resulting columns.
        return {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        }

    records = [describe(subgroup) for subgroup in subgroups]
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)

def calculate_overall_auc(df, pred_col, label_col):
    """Return the ROC AUC of ``df[pred_col]`` against ``df[label_col]`` over all rows."""
    return roc_auc_score(df[label_col], df[pred_col])

def power_mean(series, p):
    """Return the generalized (power) mean of ``series`` with exponent ``p``.

    Computed as ``(sum(x ** p) / n) ** (1 / p)``.
    """
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the averaged power means of the bias AUCs.

    :param bias_df: per-subgroup metrics frame with the three AUC columns.
    :param overall_auc: AUC over the full evaluation set.
    :param POWER: exponent passed to ``power_mean`` for each AUC column.
    :param OVERALL_MODEL_WEIGHT: weight of ``overall_auc``; the bias score gets the remainder.
    :return: the weighted final score.
    """
    per_metric_means = [
        power_mean(bias_df[metric_name], POWER)
        for metric_name in (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    ]
    bias_score = np.average(per_metric_means)
    overall_part = OVERALL_MODEL_WEIGHT * overall_auc
    bias_part = (1 - OVERALL_MODEL_WEIGHT) * bias_score
    return overall_part + bias_part

In [9]:
# Functions for the training process

class NetSolver(object):
    """Training driver: mixed-precision updates, gradient accumulation,
    periodic evaluation and in-memory checkpointing.

    :param model: network to train; moved to the global ``device``.
    :param criterion: loss module called as ``criterion(scores, targets)``.
    :param optimizer: optimizer already registered with ``amp``.
    :param print_freq: evaluate, log and possibly checkpoint every this many iterations.
    :param filepath: destination of ``save_final_state``.
    """

    def __init__(self, model, criterion, optimizer, print_freq, filepath):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.print_freq = print_freq
        self.filepath = filepath

        self.model = self.model.to(device)
        self.criterion = self.criterion.to(device)
        self._reset()

    def _reset(self):
        """Set up some book-keeping variables for optimization.
        """
        self.best_val_loss = 1e4
        self.best_val_auc = 0.
        self.loss_history = []
        self.val_loss_history = []
        self.auc_history = []
        self.val_auc_history = []
        self.val_unbias_auc_history = []
        self.val_preds = []
        self.models = {}

    def save_checkpoint(self, iteration):
        """Store a frozen CPU copy of the current weights under ``ckpt_<iteration>``.

        ``state_dict()`` returns tensors that share storage with the live
        parameters, so keeping it directly would make every stored checkpoint
        follow subsequent training and end up identical to the final weights.
        Each tensor is therefore copied (to host memory, to spare GPU memory).
        """
        from collections import OrderedDict  # local import keeps this block self-contained

        self.models[f'ckpt_{iteration}'] = OrderedDict(
            (name, tensor.detach().cpu().clone())
            for name, tensor in self.model.state_dict().items())

    def save_final_state(self):
        """Write all stored checkpoints and the optimizer state to ``filepath``.
        """
        state = {'model': self.models,
                 'optimizer': self.optimizer.state_dict()
                 }
        torch.save(state, self.filepath)

    def forward_pass(self, x, y):
        """Forward pass through the network.

        :return: ``(loss, sigmoid(scores))``.
        """
        x, y = x.to(device=device, dtype=torch.long), y.to(device=device, dtype=torch.float)
        scores = self.model(x)
        loss = self.criterion(scores, y)
        return loss, torch.sigmoid(scores)

    def train(self, loaders, iterations, val_df, val_original_indices):
        """Run ``iterations`` mini-batch steps with apex loss scaling, gradient
        accumulation and periodic evaluation/checkpointing.

        :param loaders: ``(train_loader, val_loader)``.
        :param iterations: number of mini-batches to process (the train loader
            is restarted when exhausted).
        :param val_df: frame used for the bias metrics on the validation set.
        :param val_original_indices: indices restoring validation scores to
            ``val_df`` row order.
        """
        train_loader, val_loader = loaders
        loader = iter(train_loader)
        running_loss = 0.
        self.optimizer.zero_grad()

        # start training for iterations
        for i in range(iterations):
            self.model.train()

            try:
                x, y = next(loader)
            except StopIteration:
                # Epoch finished: restart the loader. Only exhaustion is
                # handled; genuine data-loading errors must surface.
                loader = iter(train_loader)
                x, y = next(loader)
            loss, _ = self.forward_pass(x, y)

            # Backward through apex so the loss is scaled for mixed precision.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()

            # gradient accumulation for larger batch size effect with smaller memory usage
            # (gradients of the accumulated steps are summed, not averaged)
            if (i+1) % grad_accumulation_steps == 0:   # Wait for several backward steps
                self.optimizer.step()                  # Now we can do an optimizer step
                self.optimizer.zero_grad()

            running_loss += loss.item()

            # verbose and checkpoint
            if (i+1) % self.print_freq == 0 or (i+1) == iterations:
                print(f'Iteration {i+1}:')
                train_auc, _ = self.check_auc(train_loader, num_batches=50)
                print('{"metric": "Loss", "value": %.4f}' % (running_loss/(i+1),))
                print('{"metric": "AUC", "value": %.4f}' % (train_auc,))

                # The per-subgroup table is printed only on the final iteration.
                is_print = (i+1) == iterations
                val_auc, val_unbias_auc, val_loss, val_scores = self.check_auc(
                    val_loader, df=val_df, idxs=val_original_indices, is_print=is_print)
                print('{"metric": "Val. Loss", "value": %.4f}' % (val_loss,))
                print('{"metric": "Val. AUC", "value": %.4f}' % (val_auc,))
                print('{"metric": "Val. Unbiased AUC", "value": %.4f}' % (val_unbias_auc,))

                self.loss_history.append(running_loss/(i+1))
                self.val_loss_history.append(val_loss)
                self.auc_history.append(train_auc)
                self.val_auc_history.append(val_auc)
                self.val_unbias_auc_history.append(val_unbias_auc)
                self.val_preds.append(val_scores)

                if val_loss < self.best_val_loss:
                    print('updating best val loss...')
                    self.best_val_loss = val_loss
                if val_unbias_auc > self.best_val_auc:
                    print('updating best val auc...')
                    self.best_val_auc = val_unbias_auc
                    self.save_checkpoint(i+1)

                torch.cuda.empty_cache()
                print()

        self.save_final_state()

    def check_auc(self, loader, num_batches=None, df=None, idxs=None, is_print=False):
        """Evaluate the model on ``loader``.

        :param loader: loader yielding ``(x, y)`` batches.
        :param num_batches: stop after this many batches when given.
        :param df: when given, also compute the bias-aware metric on it.
        :param idxs: indices reordering the scores to ``df`` row order.
        :param is_print: print the per-subgroup table (only used with ``df``).
        :return: ``(auc, loss)`` or, when ``df`` is given,
            ``(auc, unbias_auc, loss, reordered_scores)``.
        """
        self.model.eval()
        targets, scores, losses = [], [], []
        with torch.no_grad():
            for t, (x, y) in enumerate(loader):
                l, score = self.forward_pass(x, y)
                # Column 0 is the main target / main prediction.
                targets.append((y[:,0].cpu().numpy()>=0.5).astype(int))
                scores.append(score[:,0].cpu().numpy())
                losses.append(l.item())
                if num_batches is not None and (t+1) == num_batches:
                    break

        targets = np.concatenate(targets)
        scores = np.concatenate(scores)
        auc = roc_auc_score(targets, scores)
        loss = np.mean(losses)

        if df is not None:
            unbias_auc = check_unbias_auc(df, scores[idxs], is_print)
            return auc, unbias_auc, loss, scores[idxs]

        return auc, loss
    

def check_unbias_auc(df, scores, print_table=False):
    """Compute the bias-aware final metric for one set of scores.

    The scores are attached to a copy of ``df`` rather than written into the
    caller's frame: the frame passed in may be a slice of a larger DataFrame,
    where in-place column assignment triggers pandas' SettingWithCopy warning
    and silently changes the caller's data.

    :param df: frame with the label and identity columns, rows aligned with ``scores``.
    :param scores: model scores, one per row of ``df``.
    :param print_table: print the per-subgroup metrics table when True.
    :return: the final weighted metric.
    """
    scored_df = df.assign(**{pred_column: scores})
    bias_metrics_df = compute_bias_metrics_for_model(scored_df, identity_columns, pred_column, label_column)
    unbias_auc = get_final_metric(bias_metrics_df, calculate_overall_auc(scored_df, pred_column, label_column))
    if print_table:
        print(bias_metrics_df)
    return unbias_auc

In [10]:
# Classifier
class BertForCustomClassification(BertPreTrainedModel):
    """BERT encoder followed by one linear head evaluated under five dropout masks.

    The pooled output goes through each of the five dropout modules, the shared
    linear classifier is applied to every result, and the logits are averaged.
    """

    def __init__(self, config, num_labels):
        super(BertForCustomClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropouts = nn.ModuleList(
            [nn.Dropout(config.hidden_dropout_prob) for _ in range(5)])
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return averaged logits, one row of ``num_labels`` values per input."""
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits_sum = None
        for dropout in self.dropouts:
            branch_logits = self.classifier(dropout(pooled_output))
            if logits_sum is None:
                logits_sum = branch_logits
            else:
                logits_sum = logits_sum + branch_logits
        return logits_sum / len(self.dropouts)

# Loss
class UnbiasLoss(nn.Module):
    """Weighted BCE on the main target plus plain BCE on the auxiliary targets.

    Expected layout, as indexed in ``forward``: ``labels[:, 0]`` is the main
    target, ``labels[:, 1:-1]`` the auxiliary targets and ``labels[:, -1]`` the
    per-sample weight; ``pred_scores[:, 0]`` is the main logit and
    ``pred_scores[:, 1:]`` the auxiliary logits.
    """

    def __init__(self, main_loss_weight=3.0):
        super(UnbiasLoss, self).__init__()
        self.alpha = main_loss_weight

    def forward(self, pred_scores, labels):
        bce = nn.functional.binary_cross_entropy_with_logits
        sample_weight = labels[:, -1]
        main_loss = bce(pred_scores[:, 0], labels[:, 0], weight=sample_weight)
        aux_loss = bce(pred_scores[:, 1:], labels[:, 1:-1])
        return self.alpha * main_loss + aux_loss


# Translate model from tensorflow to pytorch:
# reads the TF checkpoint and its config from BERT_MODEL_PATH and writes
# 'pytorch_model.bin' into WORK_DIR.
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    BERT_MODEL_PATH + 'bert_model.ckpt',
    BERT_MODEL_PATH + 'bert_config.json',
    WORK_DIR + 'pytorch_model.bin')

# Save config files in the same path as pretrained model
# for model reloading (inference, resume training etc.)
# NOTE: as the last expression of the cell, the returned destination path is echoed.
shutil.copyfile(BERT_MODEL_PATH + 'bert_config.json', WORK_DIR + 'bert_config.json')

Building PyTorch model from configuration: {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

Converting TensorFlow checkpoint from /kaggle/input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/bert_model.ckpt
Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]
Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]
Loading TF weight bert/embeddings/position_embeddings with shape [512, 768]
Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 768]
Loading TF weight bert/embeddings/word_embeddings with shape [30522, 768]
Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_0/attention/output/L

'./bert_config.json'

In [11]:
def train_val_split(train_x, train_y):
    """Return the ten stratified ``(train_idx, val_idx)`` folds for the data.

    Folds are shuffled with ``SEED`` as random state and stratified on ``train_y``.
    """
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    return list(splitter.split(train_x, train_y))

def load_and_preproc():
    """Load the training CSV and build texts, target matrix and metrics frame.

    Per-sample weights are ``1 + 3 * (sum of identity values) + 8 * target``,
    computed on the raw (not yet thresholded) values and divided by their maximum.

    :return: tuple ``(texts, targets, metrics_df)`` where ``texts`` is the
        comment-text column, ``targets`` is a float32 array whose columns are
        the label, the auxiliary columns, the identity columns and finally the
        sample weight, and ``metrics_df`` holds the boolean identity columns
        plus the label as uint8.
    """
    raw = pd.read_csv(DATA_DIR+'train.csv')
    # Missing identity annotations are treated as 0.
    raw[identity_columns] = raw[identity_columns].fillna(0)

    identity_mass = raw[identity_columns].values.sum(1)
    weights = np.ones(len(raw)) + identity_mass * 3
    weights = weights + raw[label_column].values * 8
    weights = weights / weights.max()

    target_columns = [label_column]+aux_columns+identity_columns
    targets = np.hstack([raw[target_columns].values, weights[:,None]]).astype('float32')

    # Thresholding happens only after the soft targets were extracted above.
    bool_df = convert_dataframe_to_bool(raw)
    metrics_df = bool_df[[label_column]+identity_columns].copy()
    metrics_df[label_column] = metrics_df[label_column].astype('uint8')

    return bool_df[text_column], targets, metrics_df

In [12]:
# %%time

# train_seq, train_tars, trn_df = load_and_preproc()
# cv_indices = train_val_split(train_seq, (train_tars[:,0]>=0.5).astype(int))
# trn_idx, val_idx = cv_indices[0]

# print('tokenizing...')
# t0 = time.time()
# tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, do_lower_case=True)
# train_seq = convert_lines(train_seq[trn_idx[:20000]], MAX_SEQUENCE_LENGTH, tokenizer)
# print('tokenizing complete in {:.0f} seconds.'.format(time.time()-t0))

# x_train, x_val = train_seq[:16000], train_seq[16000:]
# y_train, y_val = train_tars[trn_idx[:16000]], train_tars[trn_idx[16000:20000]]
# train_loader = prepare_loader(x_train, y_train, batch_size, split='train')
# val_loader, val_original_indices = prepare_loader(x_val, y_val, 32, split='valid')
# val_df = trn_df.iloc[trn_idx[16000:20000]]

In [13]:
%%time

# Load the data, then take the first of the stratified folds as the
# train/validation split (stratified on the thresholded main target).
train_seq, train_tars, trn_df = load_and_preproc()
cv_indices = train_val_split(train_seq, (train_tars[:,0]>=0.5).astype(int))
trn_idx, val_idx = cv_indices[0]

# Tokenize every comment once. NOTE: `train_seq` is rebound here from the raw
# text column to the array of token-id lists returned by convert_lines().
print('tokenizing...')
t0 = time.time()
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, do_lower_case=True)
train_seq = convert_lines(train_seq, MAX_SEQUENCE_LENGTH, tokenizer)
print('tokenizing complete in {:.0f} seconds.'.format(time.time()-t0))

tokenizing...
tokenizing complete in 2095 seconds.
CPU times: user 35min 3s, sys: 6.62 s, total: 35min 9s
Wall time: 35min 11s


In [14]:
# training preparation

# Slice tokens and targets by the fold indices chosen above.
x_train, x_val = train_seq[trn_idx], train_seq[val_idx]
y_train, y_val = train_tars[trn_idx], train_tars[val_idx]
# The 'train' split returns only the loader; other splits also return the
# indices that restore the sampler's order to dataset order.
train_loader = prepare_loader(x_train, y_train, batch_size, split='train')
val_loader, val_original_indices = prepare_loader(x_val, y_val, 32, split='valid')
# NOTE(review): this is a slice of trn_df; check_unbias_auc() later assigns a
# prediction column to it — consider `.copy()` to avoid SettingWithCopy issues.
val_df = trn_df.iloc[val_idx]

In [15]:
# Drop the full-size arrays/frames now that the train/validation slices exist,
# then run the garbage collector. (`gc` is already imported in the imports
# cell; the re-import here is redundant but harmless.)
import gc
del train_seq, train_tars, trn_df
gc.collect()

0

In [16]:
# GPU memory before the model is built: torch.cuda.memory_allocated and
# torch.cuda.memory_cached for `device`, divided by 1e6 (millions of bytes).
# NOTE(review): later PyTorch releases rename memory_cached to memory_reserved —
# confirm against the installed version before changing.
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')

0.0M
0.0M


In [17]:
seed_torch(SEED)
torch.cuda.empty_cache()

print("Building model...")
# Load the converted weights from WORK_DIR, the directory the conversion cell
# wrote 'pytorch_model.bin' and 'bert_config.json' to (previously a hard-coded
# '../working', which only matched WORK_DIR for one particular working directory).
pre_model = BertForSequenceClassification.from_pretrained(WORK_DIR, num_labels=16)
bert_config = BertConfig(WORK_DIR + 'bert_config.json')
# 16 outputs: the main target, 6 auxiliary columns and 9 identity columns.
model = BertForCustomClassification(bert_config, num_labels=16)
# Only the pretrained encoder is transplanted; the classifier head stays freshly initialised.
model.bert = copy.deepcopy(pre_model.bert)

del pre_model
gc.collect()

# Number of optimizer updates: one per `grad_accumulation_steps` mini-batches.
num_train_steps = int(epochs_for_sched * len(train_loader) / grad_accumulation_steps)
optimizer = BertAdam(model.parameters(),
                     lr=lr,
                     warmup=warmup_proportion,
                     t_total=num_train_steps)

# The model is moved to the GPU before amp.initialize wraps model and optimizer.
model = model.to(device)
criterion = UnbiasLoss().to(device)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
solver = NetSolver(model, criterion, optimizer, checkpoint_iter, 'epk_1_'+output_model_file)

Building model...


0

In [18]:
# GPU memory after the model and optimizer are set up (same two counters as
# before, divided by 1e6).
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')

439.112704M
494.927872M


In [19]:
# Convert optimizer updates back into mini-batch iterations: each update
# consumes `grad_accumulation_steps` mini-batches.
n_iter = num_train_steps * grad_accumulation_steps
solver.train((train_loader, val_loader), n_iter, val_df, val_original_indices)

Iteration 20000:
{"metric": "Loss", "value": 0.1619}
{"metric": "AUC", "value": 0.9542}
{"metric": "Val. Loss", "value": 0.1416}
{"metric": "Val. AUC", "value": 0.9641}
{"metric": "Val. Unbiased AUC", "value": 0.9290}
updating best val loss...
updating best val auc...

Iteration 40000:
{"metric": "Loss", "value": 0.1516}
{"metric": "AUC", "value": 0.9751}
{"metric": "Val. Loss", "value": 0.1401}
{"metric": "Val. AUC", "value": 0.9687}
{"metric": "Val. Unbiased AUC", "value": 0.9309}
updating best val loss...
updating best val auc...

Iteration 60000:
{"metric": "Loss", "value": 0.1477}
{"metric": "AUC", "value": 0.9618}
{"metric": "Val. Loss", "value": 0.1392}
{"metric": "Val. AUC", "value": 0.9698}
{"metric": "Val. Unbiased AUC", "value": 0.9383}
updating best val loss...
updating best val auc...

Iteration 80000:
{"metric": "Loss", "value": 0.1454}
{"metric": "AUC", "value": 0.9728}
{"metric": "Val. Loss", "value": 0.1382}
{"metric": "Val. AUC", "value": 0.9715}
{"metric": "Val. Unbi

In [20]:
# GPU memory after training (same two counters as before, divided by 1e6).
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')

1979.936256M
2283.798528M


In [21]:
torch.cuda.empty_cache()

# Blend the validation predictions recorded at every evaluation point, with
# weights 1, 2, 4, ... so that each later evaluation counts twice as much as
# the one before it, then score the blend with the bias-aware metric.
ckpt_weights = [2**e for e in range(len(solver.val_preds))]
val_preds = np.average(solver.val_preds, weights=ckpt_weights, axis=0)
val_unbias_auc = check_unbias_auc(val_df, val_preds, True)
print('{"metric": "Ckpt Val. Unbiased AUC", "value": %.4f}' % (val_unbias_auc,))

                        subgroup  subgroup_size  subgroup_auc  bpsn_auc  \
7                          white           2546      0.862757  0.895751   
6                          black           1508      0.864806  0.883130   
2      homosexual_gay_or_lesbian           1077      0.868429  0.893843   
5                         muslim           2091      0.890610  0.916314   
4                         jewish            750      0.917264  0.947738   
0                           male           4558      0.932948  0.954334   
1                         female           5465      0.936419  0.961837   
3                      christian           3974      0.943839  0.970238   
8  psychiatric_or_mental_illness            499      0.950060  0.947261   

   bnsp_auc  
7  0.967120  
6  0.970671  
2  0.967928  
5  0.965968  
4  0.956911  
0  0.961451  
1  0.957118  
3  0.950196  
8  0.973686  
{"metric": "Ckpt Val. Unbiased AUC", "value": 0.9415}
