In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
import os
import time
from tqdm import tnrange, tqdm_notebook
# https://nbviewer.jupyter.org/github/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb?source=post_page-------

In [64]:

# Locate the training CSV under ./data/<competition folder>/ relative to the current working directory.
WD = os.getcwd()
_TRAIN_CSV_PARTS = ('data', 'jigsaw-toxic-comment-classification-challenge', 'train.csv')
# NOTE: despite its name, DATA_DIR is the full path of the CSV file, not a directory.
DATA_DIR = os.path.join(WD, *_TRAIN_CSV_PARTS)

In [65]:
# Read the training CSV found at DATA_DIR into a DataFrame.
data = pd.read_csv(DATA_DIR)
# NOTE(review): the disabled drop below refers to a 'synopsis_source' column that does not
# appear in this dataset's header shown further down — it looks like a leftover; confirm and remove.
# data = data.drop(['synopsis_source'],axis=1)
# Display (rows, columns) as the cell output.
data.shape

(159571, 8)

In [66]:
# Keep only the comments that carry more than one toxicity label.
_label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Per-row label count: start from the first label column and add the others left to right.
data['total'] = sum((data[name] for name in _label_names[1:]), data[_label_names[0]])
data2 = data.loc[data['total'] > 1]
# The helper column is only needed for the filter above, so drop it again.
data = data2.drop(columns=['total'])

In [67]:
# Preview the first rows that remain after the "more than one label" filter.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0
51,001dc38a83d420cf,GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK T...,1,0,1,0,0,0
55,0020e7119b96eeeb,Stupid peace of shit stop deleting my stuff as...,1,1,1,0,1,0


In [10]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
train_data, valid_data = train_test_split(data, test_size=0.20, random_state=42)

# NOTE(review): the saved output under this cell totals 159571 rows, which equals the size of the
# unfiltered CSV rather than the filtered frame — it looks stale; re-run the notebook top to bottom.
train_data.shape, valid_data.shape

((127656, 8), (31915, 8))

# BERT Modeling

In [20]:
import torch
from pytorch_transformers import *
from pytorch_transformers.modeling_bert import BertPreTrainedModel
from pytorch_transformers.optimization import AdamW

from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss

# The original cell probed for CUDA and then unconditionally overwrote the result with "cpu",
# which made the probe dead code and silently forced CPU training.
# Set FORCE_CPU = True to pin everything to the CPU on purpose (e.g. while debugging).
FORCE_CPU = False
device = torch.device("cuda" if torch.cuda.is_available() and not FORCE_CPU else "cpu")

In [12]:
# Load the 'bert-base-uncased' tokenizer, base model and configuration via from_pretrained.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): bert_model is not referenced by any other cell visible in this notebook —
# confirm it is still needed before keeping it in memory.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_config = BertConfig.from_pretrained('bert-base-uncased')

In [14]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0


In [52]:
class ToxicDataset(Dataset):
    """Map-style dataset that turns rows of a toxic-comment DataFrame into BERT inputs.

    Every item is a 4-tuple of tensors ``(input_ids, input_mask, segment_ids, labels)``.
    The first three have length ``max_seq_length``; ``labels`` is a float vector with
    one entry per label column.

    Args:
        dataframe: Frame with a ``comment_text`` column. Unless ``label_columns`` is
            given, every column from position 2 onwards is treated as a label.
        tokenizer: Object exposing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Fixed length of each encoded sequence, including the
            ``[CLS]`` and ``[SEP]`` markers. Must be at least 2.
        transform: Stored on the instance; this class does not apply it.
        label_columns: Optional explicit sequence of label column names. When omitted
            the positional rule above is used.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length, transform=None, label_columns=None):
        # Two positions are always taken by [CLS] and [SEP]; fail early with a clear
        # message instead of tripping the length assertions on the first item.
        if max_seq_length < 2:
            raise ValueError(
                "max_seq_length must be at least 2 to fit [CLS] and [SEP], got {}".format(max_seq_length))
        self.dataframe = dataframe
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        # A list (not a tuple) so pandas treats it as several keys rather than one.
        self.label_columns = list(label_columns) if label_columns is not None else None

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def get_sample_features(self, sample):
        """Encode one text as fixed-length BERT inputs.

        Args:
            sample: Raw comment text.

        Returns:
            Tuple ``(input_ids, input_mask, segment_ids)`` of Python lists, each of
            length ``max_seq_length``. ``input_mask`` is 1 for real tokens and 0 for
            padding; ``segment_ids`` is all zeros.
        """
        tokenized_sample = self.tokenizer.tokenize(sample)

        # Truncate so the two special tokens still fit inside max_seq_length.
        tokenized_sample = ["[CLS]"] + tokenized_sample[:self.max_seq_length - 2] + ["[SEP]"]

        input_ids = self.tokenizer.convert_tokens_to_ids(tokenized_sample)
        segment_ids = [0] * len(input_ids)
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (self.max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length

        return input_ids, input_mask, segment_ids

    def __getitem__(self, idx):
        """Return ``(input_ids, input_mask, segment_ids, labels)`` tensors for row ``idx``.

        ``idx`` is positional (``iloc``), so it is independent of the frame's index.
        """
        # Materialise the row once; it is needed for both the text and the labels.
        row = self.dataframe.iloc[idx]

        if self.label_columns is None:
            label_values = row.iloc[2:]
        else:
            label_values = row[self.label_columns]

        # A row of a mixed-type frame is an object-dtype Series. Converting the values
        # to plain floats avoids handing that Series to torch.tensor, which would
        # otherwise index it with integers (deprecated positional access in pandas).
        labels = torch.tensor([float(value) for value in label_values], dtype=torch.float)

        input_ids, input_mask, segment_ids = self.get_sample_features(row['comment_text'])

        return torch.tensor(input_ids), torch.tensor(input_mask), torch.tensor(segment_ids), labels
        

In [53]:
# class BertForMultiLabelClassification(BertPreTrainedModel):
#     """BERT model for classification.
#     This module is composed of the BERT model with a linear layer on top of
#     the pooled output.
#     Params:
#         `config`: a BertConfig class instance with the configuration to build a new model.
#         `num_labels`: the number of labels predicted by the classifier. Default = 6.
#     Inputs:
#         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
#             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
#             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
#         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
#             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
#             a `sentence B` token (see BERT paper for more details).
#         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
#             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
#             input sequence length in the current batch. It's the mask that we typically use for attention when
#             a batch has varying length sentences.
#         `labels`: labels for the classification output: torch.FloatTensor of shape [batch_size, num_labels]
#             with a 0/1 indicator per label.
#     Outputs:
#         if `labels` is not `None`:
#             Outputs a `(loss, logits)` tuple, where the loss is the BCEWithLogits loss of the logits against the labels.
#         if `labels` is `None`:
#             Outputs the classification logits of shape [batch_size, num_labels].
#     Example usage:
#     ```python
#     # Already been converted into WordPiece token ids
#     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
#     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
#     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
#     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
#         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
#     num_labels = 2
#     model = BertForSequenceClassification(config, num_labels)
#     logits = model(input_ids, token_type_ids, input_mask)
#     ```
#     """
#     def __init__(self, config, num_labels=6):
#         super(BertForMultiLabelClassification, self).__init__(config)
#         self.num_labels = num_labels
#         self.bert = BertModel(config)
#         self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
#         self.apply(self.init_weights)
        
#     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
#                 position_ids=None, head_mask=None):
        
#         outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
#                             attention_mask=attention_mask, head_mask=head_mask)
# #         print(len(outputs))
# #         print(outputs[0].shape)
#         pooled_output = outputs[1]
        
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         if labels is not None:
#             loss_fct = BCEWithLogitsLoss()
#             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
#             return loss, logits
#         else:
#             return logits
        
#     def freeze_bert_encoder(self):
#         for param in self.bert.parameters():
#             param.requires_grad = False
    
#     def unfreeze_bert_encoder(self):
#         for param in self.bert.parameters():
#             param.requires_grad = True

In [62]:
def train_model(dataloaders, model, optimizer, criterion, scheduler, num_epochs=2):
    """Train ``model`` and print per-epoch loss and sample-averaged Jaccard score.

    Each epoch runs a 'train' phase followed by a 'valid' phase. Batches are moved to
    the module-level ``device``. During training the optimizer and the scheduler are
    both stepped once per batch.

    Args:
        dataloaders: Dict with 'train' and 'valid' iterables yielding
            ``(input_ids, input_mask, segment_ids, label_ids)`` tensor batches.
        model: Module called as ``model(input_ids, token_type_ids=..., attention_mask=...)``
            whose first output element holds the logits.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable taking ``(logits, labels)``.
        scheduler: Learning-rate scheduler stepped after every optimizer step.
        num_epochs: Number of passes over the training data.

    Returns:
        The same ``model`` object after training.
    """
    step_sizes = {'train': len(dataloaders['train']),
                  'valid': len(dataloaders['valid'])}

    for epoch in tnrange(int(num_epochs), desc="Epoch"):
        for phase in ['train', 'valid']:
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            running_loss = 0
            running_acc = 0

            for batch in tqdm_notebook(dataloaders[phase], desc=phase):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Only build the autograd graph while training; validation does not need it.
                with torch.set_grad_enabled(is_training):
                    # Pass by keyword: the positional order of token_type_ids and
                    # attention_mask is not the same in every library version.
                    output = model(input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask)
                    logits = output[0]

                    # Take the label count from the batch instead of hard-coding it.
                    num_labels = label_ids.size(-1)
                    loss = criterion(logits.view(-1, num_labels), label_ids.view(-1, num_labels))

                running_loss += loss.item()

                logits_numpy = logits.sigmoid().detach().cpu().numpy()
                labels_numpy = label_ids.detach().cpu().numpy()

                # Sample-averaged Jaccard between the 0/1 labels and the rounded probabilities.
                acc = jaccard_score(labels_numpy, logits_numpy.round(), average='samples')
                running_acc += acc

                if is_training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    scheduler.step()

            # Guard against an empty loader so the averages never divide by zero.
            n_batches = max(step_sizes[phase], 1)
            if is_training:
                train_loss = running_loss / n_batches
                train_acc = running_acc / n_batches
            else:
                valid_loss = running_loss / n_batches
                valid_acc = running_acc / n_batches

                # 'valid' always follows 'train' in the same epoch, so the train_* values are set.
                print('Epoch [{}/{}] train loss: {:.4f} acc: {:.4f} '
                      'valid loss: {:.4f} acc: {:.4f}'.format(
                          epoch + 1, num_epochs,
                          train_loss, train_acc,
                          valid_loss, valid_acc))

    return model

In [55]:
# Shared encoding / batching settings for both splits.
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 16

train_ds = ToxicDataset(train_data, bert_tokenizer, MAX_SEQ_LENGTH)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

val_ds = ToxicDataset(valid_data, bert_tokenizer, MAX_SEQ_LENGTH)
# Validation order has no effect on learning; keeping it fixed makes the
# per-batch averaged validation metrics reproducible from run to run.
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

dloaders = {'train': train_dl, 'valid': val_dl}

In [56]:
EPOCHS = 10
LEARNING_RATE = 3e-5
ADAM_EPSILON = 1e-6
WARMUP_STEPS = 0
NUM_LABELS = 6  # toxic, severe_toxic, obscene, threat, insult, identity_hate

# The scheduler is stepped once per training batch, so it spans every batch of every epoch.
t_total = len(train_dl) * EPOCHS

# model = BertForMultiLabelClassification(bert_config)
# Load the pretrained 'bert-base-uncased' weights with a NUM_LABELS-wide classification head.
# Building the model from the bare config instead would start from randomly initialised
# weights and keep the config's default label count.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS)
model.to(device)

# Multi-label targets: an independent sigmoid / binary cross-entropy per label.
criterion = BCEWithLogitsLoss()

# Apply weight decay to everything except biases and LayerNorm parameters.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPSILON)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total=t_total)



In [57]:
# Run the full training loop and report the wall-clock duration in minutes.
start_time = time.time()
# train_model requires the loss function as its fourth argument; leaving it out raises
# "TypeError: missing 1 required positional argument: 'criterion'" on a fresh kernel.
model = train_model(dloaders, model, optimizer, criterion=criterion, scheduler=scheduler, num_epochs=EPOCHS)
print('Training time: {:10f} minutes'.format((time.time()-start_time)/60))

HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='train', max=7979, style=ProgressStyle(description_width='init…

torch.Size([16, 6]) 6
torch.Size([16, 6]) 6


  'true or predicted', average, ('jaccard',))


torch.Size([16, 6]) 6
torch.Size([16, 6]) 6
torch.Size([16, 6]) 6
torch.Size([16, 6]) 6


KeyboardInterrupt: 

In [58]:
# Inspect the first encoded training example: (input_ids, input_mask, segment_ids, labels).
train_ds[0]

(tensor([  101, 13055, 26568,  2323,  6402,  1999, 11669, 13055, 26568,  2003,
         11669,  1012,  1045,  5223, 13055, 26568,  1012,  1042,  1003,  1003,
          1047,  2014,  2000,  3109,   999,  6390,  1012,  6356,  1012,  6146,
          1012,  2871,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [61]:
# Look up the vocabulary id the tokenizer assigns to the "[SEP]" token.
bert_tokenizer.convert_tokens_to_ids("[SEP]")

102

In [1]:
from torch.utils.data import WeightedRandomSampler

In [16]:
# Scratch check of WeightedRandomSampler: draw 5 indices with replacement from 6 weights.
# Index 4 has weight 30.0, far larger than the rest, so it dominates the draws.
list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 30.0, 0.6], 5, replacement=True))

[4, 4, 4, 4, 4]