In [None]:
#!pip install pytorch-pretrained-bert
#!pip install fast-bert
#!pip install tensorboardX
#!pip freeze

In [None]:
#!git clone https://github.com/NVIDIA/apex
#%cd apex
#!ls
#!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
#%cd ..

In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM, BertForSequenceClassification
from pathlib import Path
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from fast_bert.prediction import BertClassificationPredictor

from fastai.text import Tokenizer, Vocab
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
import re

import datetime
    
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam

from fast_bert.modeling import BertForMultiLabelSequenceClassification
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc
from sklearn.metrics import classification_report, hamming_loss, roc_auc_score

import logging
import os

In [None]:
# Ask PyTorch to release GPU memory it is caching but not currently using.
# Presumably here so a re-run in the same kernel starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

In [None]:
# Show full cell contents when displaying DataFrames (texts are long).
# `None` is the supported "no limit" value in pandas >= 1.0; the legacy `-1`
# is deprecated there and rejected by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts integers for this option; -1 means "no limit" there.
    pd.set_option('display.max_colwidth', -1)

# Timestamp identifying this run; used in the log file name.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [None]:
# Inputs: the labelled dataset and the saved cross-validation index arrays.
DATA_PATH = Path("..") / "datasets"
CROSS_FOLDS = DATA_PATH / "cross_validation"

# Working locations, all relative to the notebook's current directory.
BERT_PATH = Path()      # same as Path(".")
LABEL_PATH = Path()
BERT_DATA_PATH = BERT_PATH / "data"
LOG_PATH = BERT_PATH / "logs"
OUTPUT_PATH = BERT_PATH / "models"

model_state_dict = None

# Create the log and model directories up front; a no-op when they already exist.
OUTPUT_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

# Model parameters

In [None]:
# Run configuration.
# Keys read by the cells visible in this notebook: run_text (log file name),
# task_name (output sub-directory), bert_model, train_batch_size, max_seq_length,
# fp16, loss_scale, num_train_epochs and learning_rate (training cell).
# The remaining keys are not referenced in the visible cells.
args = dict(
    run_text="multilabel sdgs with freezable layers - more epochs",
    train_size=-1,
    val_size=-1,
    log_path=BERT_PATH,
    full_data_dir=DATA_PATH,
    data_dir=DATA_PATH,
    task_name="final-3epochs",
    no_cuda=False,
    bert_model='bert-large-uncased',
    output_dir=OUTPUT_PATH,
    max_seq_length=512,
    do_train=True,
    do_eval=True,
    do_lower_case=True,
    train_batch_size=4,
    eval_batch_size=4,
    learning_rate=1e-3,  # 1e-3 with three epochs, 0.07 loss
    num_train_epochs=3,
    warmup_proportion=0.1,
    local_rank=-1,
    seed=42,
    gradient_accumulation_steps=1,
    optimize_on_cpu=False,
    fp16=True,
    loss_scale=128,
)

In [None]:
# One log file per run, named after the run timestamp and the run description.
logfile = str(BERT_PATH / f'log-{run_start_time}-{args["run_text"]}.txt')

# Send INFO-and-above records both to the log file and to the notebook output.
# NOTE(review): the file is written under BERT_PATH although LOG_PATH is created
# earlier and not otherwise used in the visible cells - confirm the intended location.
logging.basicConfig(
    handlers=[logging.FileHandler(logfile), logging.StreamHandler(sys.stdout)],
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    level=logging.INFO,
)

# Root logger; handed to BertLearner in the training cell.
logger = logging.getLogger()

In [None]:
#logger.info(args)

In [None]:
# Training always targets CUDA; multi-GPU mode is switched on when more than one device is visible.
device = torch.device('cuda')
multi_gpu = torch.cuda.device_count() > 1

# Create cross validation files

In [None]:
# Label column names "1" .. "17", one per class.
labels_index = list(map(str, range(1, 18)))

# The string literal below is disabled code (it is never executed). When enabled it
# reads cleanup_labelled.csv, replaces text matched by `pattern` with ' SDGLABEL ',
# and for every fold under CROSS_FOLDS writes train/val/test `*_masked.csv` files
# into BERT_DATA_PATH/<fold> using the saved index arrays.
# NOTE(review): the training cell sets is_masked = "" and so reads train.csv / val.csv,
# not the *_masked.csv files this code would produce - confirm which inputs are intended.
"""
data_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))
data_df.labels = data_df.labels.str.split('|').apply(lambda x: [int(i) for i in x])

mlb = MultiLabelBinarizer()

pattern = r"(indicator)(\s+\d+\.[\d+a-d]\.\d+)|(target)(\s+\d+\.[\d+a-d])|(sdgs|sdg|goals|goal)\W*\s+(,?\s*\b\d{1,2}\b[and\s\b\d{1,2}\b]*)"
masked_df = data_df.text.str.replace(pattern, ' SDGLABEL ', regex=True, flags=re.IGNORECASE)
masked_df = pd.DataFrame(masked_df.str.replace('  ', ' ', regex=True, flags=re.IGNORECASE))

x = masked_df[['text']].values # text
y = mlb.fit_transform(data_df.labels) # labels

columns = ['text'] + labels_index

for fold in os.listdir(CROSS_FOLDS):
    print(f"Creating {fold}")
    train_index = np.load(f"{CROSS_FOLDS}/{fold}/train.npy")
    val_index = np.load(f"{CROSS_FOLDS}/{fold}/val.npy")
    test_index = np.load(f"{CROSS_FOLDS}/{fold}/test.npy")
    
    x_train, x_val, x_test = x[train_index], x[val_index], x[test_index]
    y_train, y_val, y_test = y[train_index], y[val_index], y[test_index]
    
    train = pd.DataFrame(np.hstack((x_train, y_train)))
    val = pd.DataFrame(np.hstack((x_val, y_val)))
    test = pd.DataFrame(np.hstack((x_test, y_test)))
    
    fold_dir = Path(BERT_DATA_PATH/fold)
    fold_dir.mkdir(exist_ok=True)
    
    for split, name in [(train, "train"), (val, "val"), (test, "test")]:
        split.columns = columns
        split.to_csv(fold_dir/f'{name}_masked.csv')
        
print('Finished creating all cross validation sets.')
"""

In [None]:
# Metrics passed to BertLearner.from_pretrained_model, each as {'name', 'function'}.
# Only the multilabel accuracy is enabled; accuracy_thresh, roc_auc and fbeta are
# imported above and can be added to this list in the same form.
metrics = [
    {'name': 'accuracy_single', 'function': accuracy_multilabel},
]

In [None]:
# Suffix inserted into the train/val CSV names ("" selects train.csv / val.csv).
is_masked = ""
output_dir = OUTPUT_PATH/args['task_name']
output_dir.mkdir(exist_ok=True)

for fold in sorted(os.listdir(BERT_DATA_PATH)):
    # Only folds whose directory name starts with "fold_5" are trained in this run.
    if not fold.startswith("fold_5"):
        continue
    print(f"Processing {fold} {is_masked}")

    # Apply the configured seed before building the data and the model, so that
    # shuffling and randomly initialised weights are repeatable across runs.
    random.seed(args['seed'])
    np.random.seed(args['seed'])
    torch.manual_seed(args['seed'])
    torch.cuda.manual_seed_all(args['seed'])

    # Per-fold directory handed to the learner as its output_dir.
    fold_dir = output_dir/fold
    fold_dir.mkdir(exist_ok=True)

    databunch = BertDataBunch(data_dir=BERT_DATA_PATH/fold,
                              label_dir=LABEL_PATH,
                              tokenizer=args['bert_model'],
                              train_file=f'train{is_masked}.csv',
                              val_file=f'val{is_masked}.csv',
                              test_data=None,
                              text_col="text",
                              label_col=labels_index,
                              batch_size_per_gpu=args['train_batch_size'],
                              max_seq_length=args['max_seq_length'],
                              multi_gpu=multi_gpu,
                              multi_label=True,
                              model_type='bert')

    # NOTE(review): warmup_steps is fixed at 500 here; args['warmup_proportion'] is not used.
    learner = BertLearner.from_pretrained_model(databunch,
                                                pretrained_path=args['bert_model'],
                                                metrics=metrics,
                                                device=device,
                                                logger=logger,
                                                finetuned_wgts_path=None,
                                                warmup_steps=500,
                                                output_dir=fold_dir,
                                                is_fp16=args['fp16'],
                                                loss_scale=args['loss_scale'],
                                                multi_gpu=multi_gpu,
                                                multi_label=True,
                                                logging_steps=50)
    learner.fit(args['num_train_epochs'], lr=args['learning_rate'], schedule_type="warmup_linear")
    learner.save_model()

# Load and evaluate results

In [None]:
def metrics_avg(models_testx_testy, labels_, thres=0.3):
    """Average multi-label test metrics over several (model, test_x, test_y) triples.

    Each model predicts on its own test split. The per-split classification
    report, Hamming loss and micro-averaged ROC AUC are summed and divided by
    the number of splits.

    Args:
        models_testx_testy: Non-empty sequence of ``(model, test_x, test_y)``.
            ``model`` must provide ``predict_batch(texts)`` returning, for each
            text, an iterable of ``(label, score)`` pairs whose labels are
            convertible with ``int``. ``test_x`` is iterated row by row and the
            first element of each row is used as the text. ``test_y`` is passed
            unchanged to the scikit-learn metrics (presumably the multi-hot
            label matrix; its columns are assumed to follow ascending label order).
        labels_: Names passed as ``target_names`` to ``classification_report``.
        thres: Scores strictly greater than this value count as predicted labels.

    Returns:
        Tuple ``(report_df, hamming, roc_auc)``: the element-wise mean of the
        classification-report DataFrames, the mean Hamming loss and the mean
        micro-averaged ROC AUC.

    Raises:
        ValueError: If ``models_testx_testy`` is empty.
    """
    n = len(models_testx_testy)
    if n == 0:
        raise ValueError("metrics_avg needs at least one (model, test_x, test_y) triple")

    def calc(model, test_x, test_y):
        """Compute report DataFrame, Hamming loss and ROC AUC for one split."""
        texts = [row[0] for row in test_x]
        predictions = model.predict_batch(texts)

        # Order each text's scores by numeric label so the score columns follow
        # ascending label order.
        scores = np.array([
            [score for _, score in sorted(row, key=lambda pair: int(pair[0]))]
            for row in predictions
        ])
        preds = scores > thres

        report = classification_report(test_y, preds, target_names=labels_, output_dict=True)
        report_df = pd.DataFrame.from_dict(report)
        h = hamming_loss(test_y, preds)
        # ROC AUC is a ranking metric: it must see the raw scores. Feeding it the
        # thresholded booleans would reduce the curve to a single operating point.
        roc = roc_auc_score(test_y, scores, average='micro')
        return report_df, h, roc

    first_model, first_x, first_y = models_testx_testy[0]
    metrics_agg, ham, roc = calc(first_model, first_x, first_y)

    for count, (model, test_x, test_y) in enumerate(models_testx_testy[1:], start=1):
        split_metrics, h, r = calc(model, test_x, test_y)
        metrics_agg += split_metrics
        ham += h
        roc += r
        print(count)  # progress: number of additional splits processed so far

    return metrics_agg/n, ham/n, roc/n

In [None]:
# Rebuild the full text / label arrays once; every fold's test split is then
# selected from them with that fold's saved index array.
data_df = pd.read_csv(os.path.join(DATA_PATH, 'cleanup_labelled.csv'))
data_df['labels'] = data_df['labels'].str.split('|').apply(lambda parts: [int(part) for part in parts])

mlb = MultiLabelBinarizer()
x = data_df[['text']].values  # text, kept as a single-column 2-D array
y = mlb.fit_transform(data_df['labels'])  # labels

# One (predictor, test texts, test labels) triple per trained fold directory.
loaded_models = []
for fold in sorted(os.listdir(OUTPUT_PATH / args['task_name'])):
    if not fold.startswith("fold"):
        continue
    print(f"Processing {fold}")

    # Load model
    fold_dir = OUTPUT_PATH / args['task_name'] / fold / "model_out"
    model = BertClassificationPredictor(
        model_path=fold_dir,
        label_path=LABEL_PATH,
        multi_label=True,
    )

    # Load test data
    test_index = np.load(f"{CROSS_FOLDS}/{fold}/test.npy")
    x_test, y_test = x[test_index], y[test_index]

    loaded_models.append((model, x_test, y_test))
print("Finished loading the Bert models.")

In [None]:
# Score every loaded fold model on its own test split and average the results.
# avg_results is a tuple: (report DataFrame, Hamming loss, ROC AUC); default threshold is used.
avg_results = metrics_avg(loaded_models, labels_index)

In [None]:
# Persist the averaged classification-report table as a semicolon-separated CSV.
avg_results[0].to_csv('results.csv', sep=';')

In [None]:
# Display the averaged classification-report table as the cell output.
avg_results[0]

In [None]:
# Averaged scalar metrics, rounded to four decimals for reporting.
# The ROC AUC value gets its own name so it does not overwrite the `roc_auc`
# metric function imported from fast_bert.metrics at the top of the notebook.
hl = round(avg_results[1],4)
roc_auc_avg = round(avg_results[2],4)
print(f"hl;{hl}")
print(f"roc-auc;{roc_auc_avg}")