## UHH-LT at SemEval-2020 Task 12: Fine-Tuning of Pre-Trained Transformer Networks for Offensive Language Detection

#### Step 1: Installation of requirements

Run the command pip -r install requirements.txt

In [None]:
!pip install tensorboardX
!pip install spacy
!pip install flair
!pip install nltk

In [None]:
import os
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
torch.cuda.current_device()

In [2]:
import pandas as pd
import numpy as np
import pickle
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
from tensorboardX import SummaryWriter
from os import listdir
from os.path import isfile, join

In [3]:
# Use this code when on GPU
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=7


In [4]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [5]:
torch.cuda.empty_cache()

In [6]:
#Load dataset
import datasets as ds
#For Task A
dataset = ds.OffensEvalData2020('datasets/OffensEval20', n_max=-1)
s_train, s_test = dataset.getData()

Counter({'NOT': 9460, 'OFF': 4640})


In [7]:
#Task A Labels
id2label = {0:'NOT', 1:'OFF'}
label2id = {'NOT':0, 'OFF':1}

In [8]:
dev_n = -1
X = []
for i, t in enumerate(s_train.texts):
    if i == dev_n:
        break
    X.append((t, s_train.labels[i]))
X_test = []
for i, t in enumerate(s_test.texts):
    X_test.append((t, s_test.ids[i]))

In [9]:
X_train = X
X_dev = X_test

In [10]:
import importlib
#importlib.reload(utils)
import utils
convert_examples_to_features = utils.convert_examples_to_features
importlib.reload(utils)

<module 'utils' from '/raid/seid/par4sem/argmining19-same-side-classification/sharedtask/uhhlt-offenseval2020/utils.py'>

In [11]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from tensorboardX import SummaryWriter

from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                                  AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
                                  T5Config, T5Tokenizer,
                                  XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer)

from transformers import AdamW, get_linear_schedule_with_warmup

from utils_classification import (convert_examples_to_features, output_modes, processors)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

#### Step 2: Select the model by changing the index

In [12]:
#Only for the best performing model Albert
dict_model_names = {1: 'albert-base-v1',
                    2: 'albert-large-v1',
                    3: 'albert-xlarge-v1',
                    4: 'albert-xxlarge-v1', 
                    
                    5: 'albert-base-v2', 
                    6: 'albert-large-v2', 
                    7: 'albert-xlarge-v2', 
                    8: 'albert-xxlarge-v2'}

In [13]:
args = {
    'data_dir': 'data/',
    'model_type':  'albert',
    'model_name': str(dict_model_names[4]), #As per the required model, change the index
    'task_name': 'offensiveA',
    'output_dir': 'oe2020-albert-A/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 128,
    'output_mode': 'classification',
    'train_batch_size': 4,
    'eval_batch_size': 4,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 6,
    'weight_decay': 0,
    'learning_rate': 5e-6,
    'adam_epsilon': 1e-9,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,


    'logging_steps': 0,
    'evaluate_during_training': True,
    'save_steps': 1000,
    'eval_all_checkpoints': True,
    'overwrite_output_dir': False,
    'reprocess_input_data': True,
    'notes': 'Offensive language classification task'
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Output dir: " + str(args['output_dir']))
print("Model Name: " + str(args['model_name']))
args['output_dir'] = args['output_dir']+args['task_name']+"/"+args['model_name']+"/"
print("Output Dir: " + str(args['output_dir']))

Output dir: oe2020-albert-A/
Model Name: albert-xxlarge-v1
Output Dir: oe2020-albert-A/offensiveA/albert-xxlarge-v1/


In [14]:
device

device(type='cuda')

In [15]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    't5': (T5Config, T5Tokenizer),
    'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [16]:
config = config_class.from_pretrained(args['model_name'], num_labels=2, finetuning_task=args['task_name']) #Task A
tokenizer = tokenizer_class.from_pretrained((args['model_name']))

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-config.json from cache at /srv/home/yimam/.cache/torch/transformers/9209cfee61d6481030a59afb22ba82701a38c5d9d67640d5a21bd0fc029b8292.15578b4c95b8547919f1737e7133bbfc2fd26dc7a62a94855f828bff4407000d
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "finetuning_task": "offensiveA",
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hid

In [17]:
task = args['task_name']
processor = processors[task](X_train, X_dev)
label_list = processor.get_labels()
num_labels = len(label_list)

In [18]:
#For Task A: ['NOT', 'OFF']
print("Number of labels is: " + str(num_labels))
print("The labels for Task A are: " + str(label_list))

Number of labels is: 2
The labels for Task A are: ['NOT', 'OFF']


# Training

#### Step 3: Load model and train them on the loaded dataset

In [19]:
model = model_class.from_pretrained(args['model_name'], num_labels=num_labels)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-config.json from cache at /srv/home/yimam/.cache/torch/transformers/9209cfee61d6481030a59afb22ba82701a38c5d9d67640d5a21bd0fc029b8292.15578b4c95b8547919f1737e7133bbfc2fd26dc7a62a94855f828bff4407000d
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layer

In [20]:
print(torch.cuda.current_device())

0


In [21]:
%%capture
model.to(device)

In [22]:
def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task](X_train, X_dev)
    output_mode = args['output_mode']
    
    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(args['data_dir'], f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")
    
    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
               
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args['data_dir']) if evaluate else processor.get_train_examples(args['data_dir'])
        
        features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            pad_on_left=bool(args['model_type'] in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

                                        
from pprint import pprint
                                        
def train(train_dataset, model, tokenizer):
    tb_writer = SummaryWriter()
    
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)
    
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])
        
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")
    
    epoch_i = 0
    max_metric = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        epoch_i += 1
        print("Training Epoch %d" % epoch_i)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

        # Save model checkpoint
        output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(epoch_i))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        logger.info("Saving model checkpoint to %s", output_dir)


    return global_step, tr_loss / global_step

In [23]:
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import pearsonr

def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples(args['data_dir'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]
    
    return wrong

def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    return {
        "mcc": mcc,
        #These are uncommented in Seid's code
         "tp": tp,
         "tn": tn,
         "fp": fp,
         "fn": fn,
        "acc" : acc,
        "f1" : f1
    }, get_mismatched(labels, preds)

def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def evaluate(model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(EVAL_TASK, tokenizer, evaluate=True)
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)


    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    # print(preds)
    result, wrong = compute_metrics(EVAL_TASK, preds, out_label_ids)
    results.update(result) 

    return results, wrong

In [24]:
if args['do_train']:
    train_dataset = load_and_cache_examples(task, tokenizer)
    global_step, tr_loss = train(train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

INFO:__main__:Creating features from dataset file at data/
100%|██████████| 14100/14100 [00:01<00:00, 10739.82it/s]
INFO:__main__:Saving features into cached file data/cached_train_albert-xxlarge-v1_128_offensiveA
INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 14100
INFO:__main__:  Num Epochs = 6
INFO:__main__:  Total train batch size  = 4
INFO:__main__:  Gradient Accumulation steps = 1
INFO:__main__:  Total optimization steps = 21150


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=6.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 1
0.278557

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-1/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-1/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-1


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 2
0.010202

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 3
0.571457

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 4
0.014926

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 5
0.002159

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3525.0, style=ProgressStyle(description_w…

Training Epoch 6
0.000094

INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6/config.json





INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6/pytorch_model.bin
INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6
INFO:__main__: global_step = 21150, average loss = 0.40186470678018904





#### Step 4. Save the pre-trained check points under the respective folders

In [25]:
#run this only for training
if args['do_train']:
    if not os.path.exists(args['output_dir']):
            os.makedirs(args['output_dir'])
    logger.info("Saving model checkpoint to %s", args['output_dir'])
    
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])
    torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

INFO:__main__:Saving model checkpoint to oe2020-albert-A/offensiveA/albert-xxlarge-v1/
INFO:transformers.configuration_utils:Configuration saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/config.json
INFO:transformers.modeling_utils:Model weights saved in oe2020-albert-A/offensiveA/albert-xxlarge-v1/pytorch_model.bin


# Predict sentences

#### Step 5: Evaluate the pre-trained model on the test set

In [26]:
def prepare_prediction(task, X_predict, tokenizer):
    processor = processors[task](X_predict, None)
    output_mode = args['output_mode']
    examples = processor.get_train_examples(None)
    features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
        cls_token_at_end=bool(args['model_type'] in ['xlnet']),            # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
        pad_on_left=bool(args['model_type'] in ['xlnet']),                 # pad on the left for xlnet
        pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
    
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

In [27]:
def predict_sentences(sentences):
    X = [(s, 'OFF') for s in sentences]
    predict_dataset = prepare_prediction(task, X, tokenizer)
    eval_sampler = SequentialSampler(predict_dataset)
    eval_dataloader = DataLoader(predict_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])
    prefix = ""
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(predict_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    
    sm = torch.nn.Softmax(dim=1)
    probabilities = sm(torch.from_numpy(preds)).numpy()
    # relevancy_scores = probabilities[:,1]

    return probabilities

In [28]:
#run this for evaluation
test_sentences, test_ids = zip(*X_test)
fold_model_dirs = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
emsemble_preds = np.empty((len(test_sentences), len(fold_model_dirs)))
for i, fold_dir in enumerate(fold_model_dirs):
    print(fold_dir)
    model = model_class.from_pretrained(fold_dir)
    model.to(device)
    prob_scores = predict_sentences(test_sentences)
    predicted_labels = [a.argmax() for a in prob_scores]
    emsemble_preds[:, i] = predicted_labels

oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-1


INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-1/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA

HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA


oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-2.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 11228.01it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA


oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-3.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 12250.82it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA


oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-4.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 11509.71it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA


oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-5.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 6012.05it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA


oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1/checkpoint-6.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 11054.03it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…

INFO:transformers.configuration_utils:loading configuration file oe2020-albert-A/offensiveA/albert-xxlarge-v1/config.json
INFO:transformers.configuration_utils:Model config AlbertConfig {
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 16384,
  "layer_norm_eps": 1e-12,
  "layers_to_keep": [],
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 64,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30000
}

INFO:transformers.modeling_utils:loading weights file oe2020-albert-A/offensiveA/albert-xxlar


oe2020-albert-A/offensiveA/albert-xxlarge-v1


INFO:transformers.modeling_utils:All model checkpoint weights were used when initializing AlbertForSequenceClassification.

INFO:transformers.modeling_utils:All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at oe2020-albert-A/offensiveA/albert-xxlarge-v1.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
100%|██████████| 3887/3887 [00:00<00:00, 11521.67it/s]
INFO:__main__:***** Running evaluation  *****
INFO:__main__:  Num examples = 3887
INFO:__main__:  Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=972.0, style=ProgressStyle(description_w…




In [29]:
#run this if the previous code is snippet is run. This saves the ensemble predictions to the file
pickle.dump(emsemble_preds, file=open(os.path.join(args['output_dir'], "testset_predictions.p"), "wb"))
emsemble_preds

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [30]:
mean_preds = emsemble_preds.mean(axis=1)
mean_preds.tolist()

[1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.7142857142857143,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.5714285714285714,
 1.0,
 0.0,
 0.0,
 1.0,
 0.8571428571428571,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.8571428571428571,
 1.0,
 0.0,
 1.0,
 0.0,
 0.8571428571428571,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.8571428571428571,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.14285714285714285,
 1.0,
 0.0,
 0.8571428571428571,
 0.0,
 0.8571428571428571,
 0.0,
 0.0,
 0.14285714285714285,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0

# Import all the Predictions and Majority Vote

Once all the model are available, run only this section to ensemble the models and generated the majority vote as mentioned in the paper. 

In [31]:
def get_eval_report2(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc = accuracy_score(labels, preds)
    
    # macro
    f1 = f1_score(labels, preds, average='macro')
    p = precision_score(labels, preds, average='macro')
    r = recall_score(labels, preds, average='macro')
    
    # not
    f1_0 = f1_score(labels, preds, average='binary', pos_label=0)
    p_0 = precision_score(labels, preds, average='binary', pos_label=0)
    r_0 = recall_score(labels, preds, average='binary', pos_label=0)
    
    # off
    f1_1 = f1_score(labels, preds, average='binary', pos_label=1)
    p_1 = precision_score(labels, preds, average='binary', pos_label=1)
    r_1 = recall_score(labels, preds, average='binary', pos_label=1)
    
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc" : acc,
        "f1" : f1,
        "precision" : p,
        "recall" : r,
        "p_not" : p_0,
        "r_not" : r_0,
        "f1_not" : f1_0,
        "p_off" : p_1,
        "r_off" : r_1,
        "f1_off" : f1_1
    }, get_mismatched(labels, preds)

#### Step 6: Load this pre-trained models and ensemble them

In [32]:
# Loading all files in the path with predictions
import os
path = 'oe2020-albert-A/oe2020_A/'
files = []
for r, d, f in os.walk(path):
    for file in f:
        if '.p' in file:
            files.append(os.path.join(r, file))
print(len(files))
for f in files:
    print(f)

0


In [33]:
test_sentences, test_ids = zip(*X_test)

In [34]:
len(test_sentences)

3887

# Only Albert

#### Step 7: Using the majority vote option, the predictions are computed

In [35]:
import pickle
preds = []
for f in files:
    if 'albert-xxlarge-v2' in f:
        preds.append(pickle.load(open( f, "rb" )))
    if 'albert-xxlarge-v1' in f:
        preds.append(pickle.load(open( f, "rb" )))
        
import numpy as np
merged_preds = np.concatenate(preds, axis = 1)
from collections import Counter
majority_preds = []
for i in range(merged_preds.shape[0]):
    majority_preds.append(Counter(merged_preds[i].astype(int)).most_common(1)[0][0])
mean_preds = merged_preds.mean(axis=1)
final_preds = majority_preds

ValueError: need at least one array to concatenate

In [None]:
lables = []
for i, t in enumerate(s_test.texts):
      lables.append(label2id[s_test.labels[i]])

#### Step 8: The Precision, Recall and F1 scores for labels {NOT, OFF} and overall Macro F1 and Accuracy are determined. 

In [None]:
#This code evaluated the predictions and calculates Precision, Recall and F1 scores for NOT and OFF labels.
# Also the Macro F1 and Accuracy for the entire ensemble
result, wrong = get_eval_report2(np.array(lables), final_preds)

In [None]:
print("NOT:" + "\t" +  "P: %s" %(str(round(result["p_not"]*100, 2))) + "\t" +  "R: %s" %(str(round(result["r_not"]*100, 2))) + "\t" +  "F1: %s" %(str(round(result["f1_not"]*100, 2))))
print("OFF:" + "\t" +  "P: %s" %(str(round(result["p_off"]*100, 2))) + "\t" +  "R: %s" %(str(round(result["r_off"]*100, 2))) + "\t" +  "F1: %s" %(str(round(result["f1_off"]*100, 2))))

In [None]:
print("F1: %s" %(str(round(result["f1"]*100, 2))) + "\t" + "ACC: %s" %(str(round(result["acc"]*100, 2))))

# Confusion Matrix Plots

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(np.array(lables), final_preds)
print(cm)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(np.array(lables), final_preds)
print(cm)

In [None]:
true_labels = [label2id[l] for l in test_ids]

In [None]:
true_labels = [label2id[l] for l in test_ids]

In [None]:
import sklearn.metrics as sklm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from sklearn.base import BaseEstimator, ClassifierMixin

class IdentityEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.is_fitted_ = True
        self.classes_ = [0, 1]

    def predict(self, X):
        return X
estim = IdentityEstimator()

def plot_conf_mat(true_labels, predicted_labels, label_names, plot_title):
    matplotlib.rcParams.update({'font.size': 20})
    disp = sklm.plot_confusion_matrix(estim, np.array(predicted_labels), true_labels, 
                                      cmap=plt.cm.Blues, values_format = '.5g',
                                      display_labels=label_names)
    disp.ax_.set_title(plot_title)
    plt.show()

In [None]:
plot_conf_mat(np.array(lables), final_preds, ['NOT', 'OFF'], 'Task A')