In [None]:
#!pip install iterative-stratification
#!pip install pytorch-pretrained-bert
#!pip install fast-bert
#!pip install tensorboardX
#!pip freeze

In [None]:
#!git clone https://github.com/NVIDIA/apex
#%cd apex
#!ls
#!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
#%cd ..

In [6]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM, BertForSequenceClassification
from pathlib import Path
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from fastai.text import Tokenizer, Vocab
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime
    
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc
from sklearn.metrics import classification_report, hamming_loss, roc_auc_score

import logging

In [7]:
torch.cuda.empty_cache()

In [8]:
pd.set_option('display.max_colwidth', -1)
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [10]:
DATA_PATH = Path("../datasets")
BERT_DATA_PATH = Path("data/")
BERT_PATH = Path(".")
LABEL_PATH = Path(".")
LOG_PATH = Path("logs/")
OUTPUT_PATH = Path("models/")

model_state_dict = None
LOG_PATH.mkdir(exist_ok=True)
OUTPUT_PATH.mkdir(exist_ok=True)

# Model parameters

In [38]:
args = {
    "run_text": "multilabel sdgs with freezable layers - more epochs",
    "train_size": -1,
    "val_size": -1,
    "log_path": BERT_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "sdgs_mutilabel-1_epochs",
    "no_cuda": False,
    "bert_model": 'bert-base-uncased',
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 256,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "learning_rate": 5e-6,
    "num_train_epochs": 1,
    "warmup_proportion": 0.1,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": True,
    "loss_scale": 128
}

In [39]:
import logging

logfile = str(BERT_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout)
    ])

logger = logging.getLogger()

In [40]:
logger.info(args)

08/07/2019 12:33:47 - INFO - root -   {'run_text': 'multilabel sdgs with freezable layers - more epochs', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('.'), 'full_data_dir': PosixPath('../datasets'), 'data_dir': PosixPath('../datasets'), 'task_name': 'sdgs_mutilabel-1_epochs', 'no_cuda': False, 'bert_model': 'bert-base-uncased', 'output_dir': PosixPath('models'), 'max_seq_length': 256, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 16, 'eval_batch_size': 16, 'learning_rate': 5e-06, 'num_train_epochs': 1, 'warmup_proportion': 0.1, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'loss_scale': 128}


In [41]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=args['do_lower_case'])

08/07/2019 12:33:48 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/jupyter/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [42]:
device = torch.device('cuda')
if torch.cuda.device_count() > 1:
    multi_gpu = True
else:
    multi_gpu = False

In [43]:
label_cols = [str(i) for i in range(1,18)]

In [44]:
"""
data_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))
data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

mlb = MultiLabelBinarizer()
text = data_df[['text']].values # text
labels = mlb.fit_transform(data_df.labels) # labels
columns = ['text'] + label_cols

data = pd.DataFrame(np.hstack((text,labels)))
count = 0

mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)
for train_index, test_index in mskf.split(text, labels):
    count += 1
    x_train = text[train_index]
    y_train = labels[train_index]
    x_test = text[test_index]
    y_test = labels[test_index]
    train = pd.DataFrame(np.hstack((x_train,y_train)))
    test = pd.DataFrame(np.hstack((x_test,y_test)))
    train.columns = columns
    test.columns = columns
    
    train.to_csv(BERT_DATA_PATH/f'train{count}.csv')
    test.to_csv(BERT_DATA_PATH/f'test{count}.csv')
print('Finished')
"""

"\ndata_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))\ndata_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])\n\nmlb = MultiLabelBinarizer()\ntext = data_df[['text']].values # text\nlabels = mlb.fit_transform(data_df.labels) # labels\ncolumns = ['text'] + label_cols\n\ndata = pd.DataFrame(np.hstack((text,labels)))\ncount = 0\n\nmskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)\nfor train_index, test_index in mskf.split(text, labels):\n    count += 1\n    x_train = text[train_index]\n    y_train = labels[train_index]\n    x_test = text[test_index]\n    y_test = labels[test_index]\n    train = pd.DataFrame(np.hstack((x_train,y_train)))\n    test = pd.DataFrame(np.hstack((x_test,y_test)))\n    train.columns = columns\n    test.columns = columns\n    \n    train.to_csv(BERT_DATA_PATH/f'train{count}.csv')\n    test.to_csv(BERT_DATA_PATH/f'test{count}.csv')\nprint('Finished')\n"

In [45]:
metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
metrics.append({'name': 'accuracy_single', 'function': accuracy_multilabel})

In [46]:
results = []
output_dir = OUTPUT_PATH/args['task_name']
output_dir.mkdir(exist_ok=True)

for count in range(1,11):
    print(f"Fold {count}")
    fold_dir = output_dir/f"fold_{count}"
    fold_dir.mkdir(exist_ok=True)
    databunch = BertDataBunch(BERT_DATA_PATH, LABEL_PATH, tokenizer='bert-base-uncased', train_file=f'train{count}.csv', val_file=f'test{count}.csv',
                          test_data=None,
                          text_col="text", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=multi_gpu, multi_label=True, model_type='bert')
    learner = BertLearner.from_pretrained_model(databunch, 
                                            pretrained_path='bert-base-uncased', 
                                            metrics=metrics, 
                                            device=device, 
                                            logger=logger, 
                                            finetuned_wgts_path=None, 
                                            warmup_steps=500,
                                            output_dir=fold_dir,
                                            is_fp16=args['fp16'],
                                            loss_scale=args['loss_scale'],
                                            multi_gpu=True,  
                                            multi_label=True,
                                            logging_steps=50)
    learner.fit(args['num_train_epochs'], lr=args['learning_rate'], schedule_type="warmup_linear")
    learner.save_model()

Fold 1
08/07/2019 12:33:50 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/jupyter/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
08/07/2019 12:33:53 - INFO - root -   Loading features from cached file data/cache/cached_train_multi_label_256
08/07/2019 12:33:53 - INFO - root -   Loading features from cached file data/cache/cached_dev_multi_label_256
08/07/2019 12:33:54 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/jupyter/.cache/torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
08/07/2019 12:33:54 - INFO - pytorch_transf

08/07/2019 12:34:42 - INFO - root -   Running evaluation
08/07/2019 12:34:42 - INFO - root -     Num examples = 1711
08/07/2019 12:34:42 - INFO - root -     Batch size = 16


08/07/2019 12:35:10 - INFO - root -   eval_loss after step 50: 0.6220082306416235: 
08/07/2019 12:35:10 - INFO - root -   eval_accuracy_thresh after step 50: 0.7193247675895691: 
08/07/2019 12:35:10 - INFO - root -   eval_roc_auc after step 50: 0.5120465478379657: 
08/07/2019 12:35:10 - INFO - root -   eval_fbeta after step 50: 0.278542160987854: 
08/07/2019 12:35:10 - INFO - root -   eval_accuracy_single after step 50: 0.07656341320864991: 
08/07/2019 12:35:10 - INFO - root -   lr after step 50: 5.000000000000001e-07
08/07/2019 12:35:10 - INFO - root -   train_loss after step 50: 0.6352895820140838
08/07/2019 12:35:52 - INFO - root -   Running evaluation
08/07/2019 12:35:52 - INFO - root -     Num examples = 1711
08/07/2019 12:35:52 - INFO - root -     Batch size = 16


08/07/2019 12:36:20 - INFO - root -   eval_loss after step 100: 0.5934532117620807: 
08/07/2019 12:36:20 - INFO - root -   eval_accuracy_thresh after step 100: 0.7813456058502197: 
08/07/2019 12:36:20 - INFO - root -   eval_roc_auc after step 100: 0.5150970877458608: 
08/07/2019 12:36:20 - INFO - root -   eval_fbeta after step 100: 0.2761010527610779: 
08/07/2019 12:36:20 - INFO - root -   eval_accuracy_single after step 100: 0.07364114552893045: 
08/07/2019 12:36:20 - INFO - root -   lr after step 100: 1.0000000000000002e-06
08/07/2019 12:36:20 - INFO - root -   train_loss after step 100: 0.614641934633255
08/07/2019 12:37:03 - INFO - root -   Running evaluation
08/07/2019 12:37:03 - INFO - root -     Num examples = 1711
08/07/2019 12:37:03 - INFO - root -     Batch size = 16


08/07/2019 12:37:32 - INFO - root -   eval_loss after step 150: 0.5537107107795287: 
08/07/2019 12:37:32 - INFO - root -   eval_accuracy_thresh after step 150: 0.8325024843215942: 
08/07/2019 12:37:32 - INFO - root -   eval_roc_auc after step 150: 0.512167242506002: 
08/07/2019 12:37:32 - INFO - root -   eval_fbeta after step 150: 0.27154773473739624: 
08/07/2019 12:37:32 - INFO - root -   eval_accuracy_single after step 150: 0.061367621274108705: 
08/07/2019 12:37:32 - INFO - root -   lr after step 150: 1.5e-06
08/07/2019 12:37:32 - INFO - root -   train_loss after step 150: 0.5790952491760254
08/07/2019 12:38:14 - INFO - root -   Running evaluation
08/07/2019 12:38:14 - INFO - root -     Num examples = 1711
08/07/2019 12:38:14 - INFO - root -     Batch size = 16


08/07/2019 12:38:43 - INFO - root -   eval_loss after step 200: 0.5044232208595097: 
08/07/2019 12:38:43 - INFO - root -   eval_accuracy_thresh after step 200: 0.8895382881164551: 
08/07/2019 12:38:43 - INFO - root -   eval_roc_auc after step 200: 0.5106752310172833: 
08/07/2019 12:38:43 - INFO - root -   eval_fbeta after step 200: 0.26902636885643005: 
08/07/2019 12:38:43 - INFO - root -   eval_accuracy_single after step 200: 0.05201636469900643: 
08/07/2019 12:38:43 - INFO - root -   lr after step 200: 2.0000000000000003e-06
08/07/2019 12:38:43 - INFO - root -   train_loss after step 200: 0.5348264163732529
08/07/2019 12:38:58 - INFO - root -   Running evaluation
08/07/2019 12:38:58 - INFO - root -     Num examples = 1711
08/07/2019 12:38:58 - INFO - root -     Batch size = 16


08/07/2019 12:39:27 - INFO - root -   eval_loss after epoch 1: 0.4853118323834143: 
08/07/2019 12:39:27 - INFO - root -   eval_accuracy_thresh after epoch 1: 0.9129852056503296: 
08/07/2019 12:39:27 - INFO - root -   eval_roc_auc after epoch 1: 0.5114115754526454: 
08/07/2019 12:39:27 - INFO - root -   eval_fbeta after epoch 1: 0.2674858272075653: 
08/07/2019 12:39:27 - INFO - root -   eval_accuracy_single after epoch 1: 0.049678550555230856: 
Fold 2
08/07/2019 12:39:28 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/jupyter/.cache/torch/pytorch_transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
08/07/2019 12:39:31 - INFO - root -   Loading features from cached file data/cache/cached_train_multi_label_256
08/07/2019 12:39:32 - INFO - root -   Loading features from cached file dat

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 7.43 GiB total capacity; 6.89 GiB already allocated; 24.94 MiB free; 37.93 MiB cached)

In [None]:
learner.save_model()

In [None]:
learner.validate()

In [None]:
test = pd.read_csv(BERT_DATA_PATH/'test1.csv')

In [None]:
validation = learner.predict_batch(test.text.values.tolist())

In [None]:
converted_preds = []

for row in validation:
    row_scores = sorted(row, key=lambda i: (int(i[0])))
    final = [y for x,y in row_scores]
    converted_preds.append(final)

In [None]:
preds = np.array(converted_preds); preds

In [None]:
len(test[label_cols])

In [None]:
results = classification_report(test[label_cols], preds>0.25, target_names=databunch.labels, output_dict=True)
results

In [None]:
hamming_loss(test[label_cols], preds>0.25)

In [None]:
roc_auc_score(test[label_cols], preds>0.25)