In [1]:
from __future__ import absolute_import, division, print_function

import os, sys, re, gc, pickle, operator, shutil
import time, datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, Sampler

from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel
# Device that the model and every input batch are moved to further down in this
# notebook. Hard-coded to CUDA, so the notebook assumes a GPU runtime.
device = torch.device('cuda')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  return f(*args, **kwds)
  return f(*args, **kwds)
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])


In [3]:
# ---- Filesystem locations ----
WORK_DIR = "./"
DATA_DIR = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
CONFIG_PATH = '../input/jgs-bert-v1-it1/'
MODEL_PATH = '../input/jgs-bert-v1-it1/'

# ---- Run settings ----
SEED = 2019
MAX_SEQUENCE_LENGTH = 220
batch_size = 16

# ---- Column names in the competition CSVs ----
# Identity subgroups used for the bias metrics.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

# Auxiliary toxicity sub-type targets.
aux_columns = [
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]

label_column = 'target'
pred_column = 'prediction'
text_column = 'comment_text'

In [4]:
# Seed for randomness in pytorch
def seed_torch(seed=SEED):
    """Seed PyTorch's CPU and CUDA generators and request deterministic cuDNN.

    :param seed: integer seed; defaults to the notebook-level ``SEED``.

    Only PyTorch state is touched. Code that draws from NumPy's global RNG
    (e.g. the bucket sampler's shuffling) is not seeded here.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN may auto-tune to a different algorithm on
    # each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Convert target, identity and auxiliary columns to booleans
def convert_dataframe_to_bool(df):
    """Return a copy of ``df`` with the label, identity and auxiliary columns
    replaced by booleans (``True`` where the value is >= 0.5).

    The input frame is left unmodified.
    """
    thresholded = df.copy()
    columns_to_binarize = [label_column] + identity_columns + aux_columns
    for name in columns_to_binarize:
        thresholded[name] = np.where(thresholded[name] >= 0.5, True, False)
    return thresholded

In [5]:
# Tokenizing the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(texts, max_seq_length, tokenizer):
    """Tokenize ``texts`` into ``[CLS] ... [SEP]`` id sequences.

    :param texts: iterable of strings.
    :param max_seq_length: maximum length of each output sequence, including
        the two special tokens.
    :param tokenizer: object exposing ``tokenize(text)`` and
        ``convert_tokens_to_ids(tokens)``.
    :return: 1-D ``object`` ndarray with one id sequence per text. The
        sequences are NOT padded; their lengths may differ.

    Texts that exceed the budget keep their LAST tokens (the head is dropped).
    """
    # Two slots are reserved for [CLS] and [SEP].
    budget = max(max_seq_length - 2, 0)

    encoded = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        if len(tokens) > budget:
            # Explicit start offset: ``tokens[-0:]`` would keep everything.
            tokens = tokens[len(tokens) - budget:]
        encoded.append(tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]))

    # Fill an object array element by element. ``np.array(encoded)`` raises on
    # ragged input in recent NumPy releases and silently becomes a 2-D integer
    # array when all rows happen to share a length; both cases break code that
    # expects one Python sequence per sample.
    out = np.empty(len(encoded), dtype=object)
    for i, ids in enumerate(encoded):
        out[i] = ids
    return out

In [6]:
# Prepare dataset and dataloader

class Toxic_comments(Dataset):
    """Dataset over pre-tokenized comments with optional targets.

    :param tokenized_comments: indexable collection of token-id sequences
        (variable length, unpadded).
    :param targets: optional indexable collection of per-sample targets
        (a scalar or a 1-D row per sample); ``None`` for unlabeled data.
    :param split: one of ``'train'``, ``'valid'``, ``'test'``. ``collate_fn``
        expects ``(comment, target)`` items for train/valid and bare comments
        for test.
    :param maxlen: cap applied to the lengths reported by ``get_lens``.
    """

    def __init__(self, tokenized_comments, targets=None, split=None, maxlen=256):
        assert split in {'train', 'valid', 'test'}, \
            "split must be 'train', 'valid' or 'test'"
        self.comments = tokenized_comments
        self.targets = targets
        self.split = split
        self.maxlen = maxlen

    def __getitem__(self, index):
        comment = self.comments[index]
        if self.targets is None:
            return comment
        # as_tensor converts values; the legacy FloatTensor constructor would
        # interpret a bare integer target as a *size* and return garbage.
        return comment, torch.as_tensor(self.targets[index], dtype=torch.float32)

    def __len__(self):
        return len(self.comments)

    def get_lens(self):
        """Return an int32 array of sequence lengths, each capped at ``maxlen``."""
        return np.fromiter(
            (min(self.maxlen, len(seq)) for seq in self.comments),
            dtype=np.int32)

    def collate_fn(self, batch):
        """
        Collate function for sequence bucketing
        Note: this need not be defined in this Class, can be standalone.

        Pads every comment with zeros up to the longest comment in the batch.

        :param batch: an iterable of N sets from __getitem__()
        :return: a LongTensor of padded comments, plus the stacked targets
            when ``split`` is 'train' or 'valid'
        """
        with_targets = self.split in ('train', 'valid')
        if with_targets:
            comments, targets = zip(*batch)
        else:
            comments = batch

        lengths = [len(c) for c in comments]
        width = max(lengths)
        padded_comments = []
        for c, n in zip(comments, lengths):
            # Normalise to a plain list so tuples / ndarrays pad correctly
            # (``ndarray + list`` would broadcast instead of concatenate).
            row = c.tolist() if hasattr(c, 'tolist') else list(c)
            padded_comments.append(row + [0] * (width - n))

        padded = torch.tensor(padded_comments, dtype=torch.long)
        if with_targets:
            return padded, torch.stack(targets)
        return padded


class BucketSampler(Sampler):
    """Sampler that yields indices grouped into length-sorted batches.

    Positions are split into buckets of ``bucket_size``; inside each bucket
    they are sorted by descending length and cut into runs of ``batch_size``.
    When shuffling, the sample order and the order of the full batches inside
    each bucket are randomised; the single incomplete batch (if any) is always
    emitted last.

    :param data_source: the dataset being sampled (not used by this class
        beyond being forwarded to ``Sampler`` where that is required).
    :param sort_lens: array of per-sample lengths to sort by.
    :param bucket_size: bucket length; must be a multiple of ``batch_size``
        or equal to the number of samples. Defaults to the number of samples.
    :param batch_size: length of each run of similarly sized samples.
    :param shuffle_data: if False the order is computed once, and
        ``original_indices`` is set to the permutation that restores dataset
        order from sampler order.
    """

    def __init__(self, data_source, sort_lens, bucket_size=None, batch_size=1024, shuffle_data=True):
        # The base-class constructor signature differs between PyTorch
        # releases (``data_source`` is required in some, deprecated in
        # others), so try the argument-free form first.
        try:
            super().__init__()
        except TypeError:
            super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_lens = np.asarray(sort_lens)
        self.bucket_size = bucket_size if bucket_size is not None else len(self.sort_lens)
        self.weights = None

        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        """Set per-sample sampling weights (normalised to sum to 1).

        The weights are only consulted when the sampler shuffles.
        """
        weights = np.asarray(weights, dtype=np.float64)
        # Element-wise check; a bare ``assert weights >= 0`` raises on arrays.
        assert np.all(weights >= 0)
        total = np.sum(weights)
        assert total > 0
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        indices = None
        if self.weights is not None:
            total = len(self.sort_lens)
            # Sampling with replacement according to the weights.
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return an array mapping dataset index -> position in the last
        prepared order. Requires that an order has already been prepared."""
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_lens)

    def prepare_buckets(self, indices=None):
        """Build the bucketed order.

        :param indices: optional array of dataset indices to order; when
            omitted, all samples are used (shuffled first if shuffling).
        :return: array of dataset indices in emission order.
        """
        n_items = len(self.sort_lens)
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == n_items

        if indices is None:
            if self.shuffle:
                indices = shuffle(np.arange(n_items, dtype=np.int32))
            else:
                indices = np.arange(n_items, dtype=np.int32)
        else:
            indices = np.asarray(indices)

        # Length of the sample sitting at each *position*; this must follow
        # ``indices`` in every case, including caller-supplied ones. Negated
        # so that an ascending argsort gives descending length.
        lengths = -self.sort_lens[indices]

        #  bucket iterator
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                # looping till length l
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch_idx = None
        for chunk_index, chunk in divide_chunks(lengths, self.bucket_size):
            # sort indices in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk)]

            batch_idxes = []
            for _, batch_idx in divide_chunks(indices_sorted, self.batch_size):
                if len(batch_idx) == self.batch_size:
                    batch_idxes.append(batch_idx.tolist())
                else:
                    # At most one short batch can exist across all buckets.
                    assert extra_batch_idx is None
                    assert batch_idx is not None
                    extra_batch_idx = batch_idx.tolist()

            # shuffling batches within buckets
            if self.shuffle:
                batch_idxes = shuffle(batch_idxes)
            for batch_idx in batch_idxes:
                new_indices.extend(batch_idx)

        if extra_batch_idx is not None:
            new_indices.extend(extra_batch_idx)

        ordered = indices[np.asarray(new_indices, dtype=np.intp)]
        if not self.shuffle:
            # Inverse of the full emission order (not just the last bucket),
            # so ``scores[original_indices]`` is back in dataset order.
            self.original_indices = np.argsort(ordered).tolist()
        return ordered


def prepare_loader(x, y=None, batch_size=1024, split=None):
    """Wrap tokenized comments (and optional targets) in a bucketed DataLoader.

    :param x: tokenized comments.
    :param y: targets, or None.
    :param batch_size: DataLoader batch size (also the sampler's batch size).
    :param split: 'train', 'valid' or 'test'.
    :return: for 'train', the DataLoader alone (shuffling sampler with buckets
        of ``50 * batch_size``); otherwise a ``(DataLoader, original_indices)``
        pair built on a non-shuffling sampler.
    """
    assert split in {'train', 'valid', 'test'}
    dataset = Toxic_comments(x, y, split, MAX_SEQUENCE_LENGTH)
    seq_lens = dataset.get_lens()
    is_train = split == 'train'

    if is_train:
        sampler = BucketSampler(dataset, seq_lens,
                                bucket_size=batch_size*50, batch_size=batch_size)
    else:
        sampler = BucketSampler(dataset, seq_lens,
                                batch_size=batch_size, shuffle_data=False)

    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                        collate_fn=dataset.collate_fn)
    if is_train:
        return loader
    return loader, sampler.original_indices

In [7]:
# Metrics

# Column names of the three per-subgroup AUC metrics in the bias-metrics table.
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive

def compute_auc(y_true, y_pred):
    """ROC AUC of ``y_pred`` against ``y_true``; NaN if ``roc_auc_score``
    raises ``ValueError``."""
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = np.nan
    return auc

def compute_subgroup_auc(df, subgroup, label, pred_column):
    """AUC restricted to the rows where the boolean ``subgroup`` column is True."""
    membership = df[subgroup]
    members = df[membership]
    return compute_auc(members[label], members[pred_column])

def compute_bpsn_auc(df, subgroup, label, pred_column):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples.

    :param df: frame whose ``subgroup`` and ``label`` columns are boolean.
    :param subgroup: name of the subgroup-membership column.
    :param label: name of the label column.
    :param pred_column: name of the prediction-score column.
    :return: the AUC as returned by ``compute_auc``.
    """
    subgroup_negative_examples = df[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df[~df[subgroup] & df[label]]
    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    examples = pd.concat([subgroup_negative_examples, non_subgroup_positive_examples])
    return compute_auc(examples[label], examples[pred_column])

def compute_bnsp_auc(df, subgroup, label, pred_column):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples.

    :param df: frame whose ``subgroup`` and ``label`` columns are boolean.
    :param subgroup: name of the subgroup-membership column.
    :param label: name of the label column.
    :param pred_column: name of the prediction-score column.
    :return: the AUC as returned by ``compute_auc``.
    """
    subgroup_positive_examples = df[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df[~df[subgroup] & ~df[label]]
    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    examples = pd.concat([subgroup_positive_examples, non_subgroup_negative_examples])
    return compute_auc(examples[label], examples[pred_column])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model.

    ``model`` is the name of the prediction column in ``dataset``.
    ``include_asegs`` is accepted but not used.
    Returns one row per subgroup, sorted by ascending subgroup AUC.
    """
    def describe(subgroup):
        # Keys are inserted in the order the output columns should appear.
        return {
            'subgroup': subgroup,
            'subgroup_size': len(dataset[dataset[subgroup]]),
            SUBGROUP_AUC: compute_subgroup_auc(dataset, subgroup, label_col, model),
            BPSN_AUC: compute_bpsn_auc(dataset, subgroup, label_col, model),
            BNSP_AUC: compute_bnsp_auc(dataset, subgroup, label_col, model),
        }

    rows = [describe(subgroup) for subgroup in subgroups]
    table = pd.DataFrame(rows)
    return table.sort_values('subgroup_auc', ascending=True)

def calculate_overall_auc(df, pred_col, label_col):
    """ROC AUC of column ``pred_col`` against column ``label_col`` over all rows."""
    return roc_auc_score(df[label_col], df[pred_col])

def power_mean(series, p):
    """Generalised (power) mean of ``series`` with exponent ``p``."""
    powered = np.power(series, p)
    mean_of_powers = sum(powered) / len(series)
    return np.power(mean_of_powers, 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    """Blend the overall AUC with the average of the three power-mean bias AUCs.

    The bias score is the plain average of the power means (exponent
    ``POWER``) of the subgroup, BPSN and BNSP AUC columns of ``bias_df``; the
    result weights ``overall_auc`` by ``OVERALL_MODEL_WEIGHT`` and the bias
    score by the remainder.
    """
    metric_columns = (SUBGROUP_AUC, BPSN_AUC, BNSP_AUC)
    per_metric_means = [power_mean(bias_df[name], POWER) for name in metric_columns]
    bias_score = np.average(per_metric_means)
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)

In [8]:
# Functions for the validation

def validate(val_loader, model, val_df, val_original_indices):
    """Score the validation loader, print plain and bias-aware AUC, and return
    the scores re-indexed by ``val_original_indices``.

    Only column 0 of the targets and of the model output is evaluated.
    """
    model.eval()
    batch_targets = []
    batch_scores = []

    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.float)
            # Padding id is 0, so the mask marks non-padding positions.
            logits = model(x, attention_mask=(x>0))
            batch_targets.append((y[:,0].cpu().numpy()>=0.5).astype(int))
            batch_scores.append(torch.sigmoid(logits[:,0]).cpu().numpy())

    targets = np.concatenate(batch_targets)
    scores = np.concatenate(batch_scores)
    val_auc = roc_auc_score(targets, scores)
    print('{"metric": "Val. AUC", "value": %.4f}' % (val_auc, ))

    # Reorder from loader order using the supplied index list before
    # computing the per-subgroup metrics against val_df.
    val_scores = scores[val_original_indices]
    val_unbias_auc = check_unbias_auc(val_df, val_scores)
    print('{"metric": "Val. Unbiased AUC", "value": %.4f}' % (val_unbias_auc, ))

    return val_scores

def check_unbias_auc(df, scores, print_table=True):
    """Compute the bias-aware final metric for ``scores`` against ``df``.

    :param df: frame holding the label column and the boolean identity columns.
    :param scores: prediction scores, one per row of ``df``.
    :param print_table: when True, print the per-subgroup metrics table.
    :return: the final metric from ``get_final_metric``.

    The caller's frame is not modified: predictions are attached to a derived
    frame, which also avoids SettingWithCopy warnings when ``df`` is a slice
    of another frame.
    """
    scored = df.assign(**{pred_column: scores})
    bias_metrics_df = compute_bias_metrics_for_model(scored, identity_columns, pred_column, label_column)
    unbias_auc = get_final_metric(bias_metrics_df, calculate_overall_auc(scored, pred_column, label_column))
    if print_table:
        print(bias_metrics_df)
    return unbias_auc

def eval_model(model, test_loader):
    """Return sigmoid scores of output column 0 for every batch of
    ``test_loader``, concatenated in loader order."""
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device=device, dtype=torch.long)
            logits = model(batch, attention_mask=(batch>0))
            probs = torch.sigmoid(logits[:,0])
            collected.append(probs.cpu().numpy())
    return np.concatenate(collected)

In [9]:
def train_val_split(train_x, train_y):
    """Return the (train_idx, valid_idx) pairs of a seeded, shuffled
    10-fold stratified split of ``train_x`` on ``train_y``."""
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    return list(splitter.split(train_x, train_y))

def load_and_preproc():
    """Read the train/test CSVs and build the training targets.

    :return: tuple of
        - training comment texts,
        - test comment texts,
        - float32 target matrix: label, auxiliary and identity columns
          followed by one per-sample weight column,
        - frame with the uint8 label and the boolean identity columns,
        - test ids.
    """
    train_df = pd.read_csv(DATA_DIR+'train.csv')
    test_df = pd.read_csv(DATA_DIR+'test.csv')
    train_df[identity_columns] = train_df[identity_columns].fillna(0)

    # Per-sample weight: base 1, plus 3x the summed identity values, plus 8x
    # the label value, scaled so the largest weight is 1. Computed on the raw
    # (not yet thresholded) values.
    identity_mass = train_df[identity_columns].values.sum(1)
    label_values = train_df[label_column].values
    sample_weights = np.ones(len(train_df)) + identity_mass * 3 + label_values * 8
    sample_weights = sample_weights / sample_weights.max()

    target_columns = [label_column]+aux_columns+identity_columns
    train_tars = np.hstack(
        [train_df[target_columns].values, sample_weights[:,None]]
    ).astype('float32')

    train_df = convert_dataframe_to_bool(train_df)
    df = train_df[[label_column]+identity_columns].copy()
    df[label_column] = df[label_column].astype('uint8')

    return train_df[text_column], test_df[text_column], train_tars, df, test_df['id']

In [10]:
# Load the CSVs, build the stratified folds (stratified on label >= 0.5) and
# tokenize the test comments.
train_seq, x_test, train_tars, trn_df, test_id = load_and_preproc()
cv_indices = train_val_split(train_seq, (train_tars[:,0]>=0.5).astype(int))
# Fold 0 indices; in this notebook they are referenced only by the
# commented-out validation lines.
trn_idx, val_idx = cv_indices[0]

print('tokenizing...')
t0 = time.time()
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, do_lower_case=True)
# x_val = convert_lines(train_seq[val_idx], MAX_SEQUENCE_LENGTH, tokenizer)
# Note: x_test is rebound from raw texts to tokenized id sequences here.
x_test = convert_lines(x_test, MAX_SEQUENCE_LENGTH, tokenizer)
print('tokenizing complete in {:.0f} seconds.'.format(time.time()-t0))

tokenizing...
tokenizing complete in 114 seconds.


In [11]:
# Build the test loader. For non-train splits prepare_loader also returns the
# index list used later to put scores back in dataset order.
# The validation loader is disabled (kept commented out below).
# y_val = train_tars[val_idx]
# val_loader, val_original_indices = prepare_loader(x_val, y_val, batch_size, split='valid')
test_loader, test_original_indices = prepare_loader(x_test, batch_size=batch_size, split='test')
# val_df = trn_df.iloc[val_idx]

In [12]:
# Drop the training-side objects that inference does not need and reclaim the
# memory. `gc` is already imported in the first cell, so it is not re-imported.
del train_seq, train_tars, trn_df
gc.collect()

7

In [13]:
# GPU memory before the model is built, in units of 10**6 bytes: currently
# allocated tensors, then memory held by the caching allocator.
# `memory_cached` is the deprecated name of `memory_reserved`; prefer the new
# name and fall back only on PyTorch builds that predate it.
_cuda_reserved = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
print(str(_cuda_reserved(device)/1000000 ) + 'M')

0.0M
0.0M


In [14]:
# BERT configuration file for model loading
bert_config = BertConfig(CONFIG_PATH + 'bert_config.json')

# model setup
torch.cuda.empty_cache()

print("Building model...")
model = BertForSequenceClassification(bert_config, num_labels=16)
model = model.to(device)

Building model...


In [15]:
# GPU memory after the model has been moved to the device, in units of 10**6
# bytes: currently allocated tensors, then memory held by the caching allocator.
# `memory_cached` is the deprecated name of `memory_reserved`; prefer the new
# name and fall back only on PyTorch builds that predate it.
_cuda_reserved = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
print(str(_cuda_reserved(device)/1000000 ) + 'M')

439.111168M
494.927872M


In [16]:
# class WeightAvg(object):
#     def __init__(self, model):
#         self.weight_copy = {}
#         for name, param in model.named_parameters():
#             if param.requires_grad:
#                 self.weight_copy[name] = param.data

#     def average_models(self, models):
#         nb_models = len(models)+1
#         for name in self.weight_copy.keys():
#             self.weight_copy[name] *= (1./nb_models)
#         for md in models:
#             for name, param in md.named_parameters():
#                 if param.requires_grad:
#                     self.weight_copy[name] += (1./nb_models) * param.data

#     def set_weights(self, avg_model):
#         for name, param in avg_model.named_parameters():
#             if param.requires_grad:
#                 param.data = self.weight_copy[name]

In [17]:
# model.load_state_dict(torch.load(MODEL1_PATH+'epk_1_finetuned_bert_pytorch.bin'))

# import copy
# model_2 = copy.deepcopy(model)
# model_2.load_state_dict(torch.load(MODEL2_PATH+'epk_2_finetuned_bert_pytorch.bin'))

# model_3 = copy.deepcopy(model_2)
# model_3.load_state_dict(torch.load(MODEL3_PATH+'epk_3_finetuned_bert_pytorch.bin'))

# avg_model = copy.deepcopy(model)
# wa = WeightAvg(model)
# wa.average_models([model_2, model_3])
# wa.set_weights(avg_model)
# avg_model = avg_model.to(device)

In [18]:
# Load the saved checkpoints. `models` maps a key to a state dict per checkpoint.
# SECURITY: torch.load unpickles the file -- only load checkpoints you trust.
# map_location='cpu' keeps the stored state dicts in host memory instead of
# restoring them to whatever device they were saved from; load_state_dict
# copies the values into the model's own (GPU) parameters.
checkpoint = torch.load(MODEL_PATH+'epk_1_bert_models.pt', map_location='cpu')
models = checkpoint['model']
# Drop the outer dict so anything else stored in the file can be freed.
del checkpoint
# Sanity check: the last checkpoint must match the model's parameter names.
model.load_state_dict(models[list(models.keys())[-1]])

<All keys matched successfully>

In [19]:
# validation
t0 = time.time()
torch.cuda.empty_cache()

# val_preds = validate(val_loader, model, val_df, val_original_indices)
# test_preds = eval_model(model, test_loader)[test_original_indices]
test_preds = []
ckpt_weights = [2**e for e in range(len(models))]
for state in models.values():
    model.load_state_dict(state)
    test_preds.append(eval_model(model, test_loader)[test_original_indices])

time_elapsed = time.time() - t0
print('time = {:.0f}m {:.0f}s.'.format(time_elapsed // 60, time_elapsed % 60))

<All keys matched successfully>

<All keys matched successfully>

<All keys matched successfully>

<All keys matched successfully>

<All keys matched successfully>

<All keys matched successfully>

time = 24m 11s.


In [20]:
# Weighted average of the per-checkpoint test predictions, written out as the
# submission file.
blended_preds = np.average(test_preds, weights=ckpt_weights, axis=0)
submission = pd.DataFrame.from_dict({'id': test_id, 'prediction': blended_preds})
submission.to_csv('submission.csv', index=False)

  return f(*args, **kwds)


In [21]:
# val_preds = []
# test_preds = []

In [22]:
# # validation
# t0 = time.time()
# torch.cuda.empty_cache()

# model.load_state_dict(torch.load(MODEL1_PATH+'epk_1_finetuned_bert_pytorch.bin'))
# val_scores = validate(val_loader, model, criterion, val_df, 1, val_original_indices)
# test_scores = eval_model(model, test_loader)[test_original_indices]
# val_preds.append(val_scores)
# test_preds.append(test_scores)

# time_elapsed = time.time() - t0
# print('time = {:.0f}m {:.0f}s.'.format(time_elapsed // 60, time_elapsed % 60))

In [23]:
# print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
# print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')

In [24]:
# # validation
# t0 = time.time()
# torch.cuda.empty_cache()

# model.load_state_dict(torch.load(MODEL2_PATH+'epk_2_finetuned_bert_pytorch.bin'))
# val_scores = validate(val_loader, model, criterion, val_df, 1, val_original_indices)
# test_scores = eval_model(model, test_loader)[test_original_indices]
# val_preds.append(val_scores)
# test_preds.append(test_scores)

# time_elapsed = time.time() - t0
# print('time = {:.0f}m {:.0f}s.'.format(time_elapsed // 60, time_elapsed % 60))

In [25]:
# print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
# print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')

In [26]:
# # validation
# t0 = time.time()
# torch.cuda.empty_cache()

# model.load_state_dict(torch.load(MODEL3_PATH+'epk_3_finetuned_bert_pytorch.bin'))
# val_scores = validate(val_loader, model, criterion, val_df, 1, val_original_indices)
# test_scores = eval_model(model, test_loader)[test_original_indices]
# val_preds.append(val_scores)
# test_preds.append(test_scores)

# time_elapsed = time.time() - t0
# print('time = {:.0f}m {:.0f}s.'.format(time_elapsed // 60, time_elapsed % 60))

In [27]:
# print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
# print(str(torch.cuda.memory_cached(device)/1000000 ) + 'M')
# torch.cuda.empty_cache()

In [28]:
# val_preds = np.mean(val_preds, 0)
# print(check_unbias_auc(val_df.copy(), val_preds))

In [29]:
# test_preds = np.mean(test_preds, 0)
# submission = pd.DataFrame.from_dict({
#     'id': test_id,
#     'prediction': test_preds
# })
# submission.to_csv('submission.csv', index=False)