<a href="https://colab.research.google.com/github/walexi/mila_dump/blob/master/Copy_of_BiasCorp_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U pytorch-lightning comet-ml transformers optuna wandb

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.nn.functional as F
import wandb
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, r2_score, average_precision_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from dataclasses import dataclass
import random
import pandas as pd
import numpy as np
import math
import optuna
from ast import literal_eval
import copy
# from datasets import  load_metric
from transformers import BertModel, BertTokenizerFast, BertLayer
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from hopfield import HopfieldLayer, Hopfield, HopfieldPooling, HopfieldCore
# %matplotlib inline
# plt.set_cmap('jet')

In [None]:
# Path of the CSV file loaded into DATA below.
DATA_ROOT = 'bias_corp.csv'
# Batch size handed to Trainer (and from there to every DataLoader).
BATCH_SIZE = 8
# Number of DataLoader worker processes.
NUM_WORKERS = 3
# Token length every text is padded/truncated to by BiasDataset.
MAX_LEN = 512
# Number of optuna trials run by main().
TRIALS = 3
# Checkpoint name used for both the tokenizer and the BERT encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
TOKENIZER = BertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)
# NOTE(review): this single module instance is passed to every BackBone built
# in objective(), so all trials share the same encoder object.
BASE_MODEL = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Raw dataset; `df` is an alias of the same DataFrame object, not a copy.
DATA = pd.read_csv(DATA_ROOT)
df = DATA

In [None]:
# Per-row total of the three confidence columns (used as the normaliser below).
df['confidence_sum'] = df.apply(
    lambda rec: sum(rec[col] for col in ('confidence_1', 'confidence_2', 'confidence_3')),
    axis=1,
)
# Per-row array holding the three bias labels, in column order.
df['class_indices'] = df.apply(
    lambda rec: np.asarray([rec[col] for col in ('bias_1', 'bias_2', 'bias_3')]),
    axis=1,
)
# Per-row array of confidences divided by that row's confidence_sum.
df['conf_scores'] = df.apply(
    lambda rec: np.asarray(
        [rec[col] for col in ('confidence_1', 'confidence_2', 'confidence_3')]
    ) / rec['confidence_sum'],
    axis=1,
)

In [None]:
import re

# Matches everything that is stripped from the raw text:
#   * @mentions that start the string or follow a character outside
#     [a-zA-Z0-9-_.] (so e-mail-like "a@b" is left alone),
#   * newline, carriage-return and tab characters,
#   * anything between '<' and the next '>' (markup tags).
# Written as a raw string so the backslash escapes reach the regex engine
# unchanged; the compiled pattern is the same as before.
regex = re.compile(
    r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)|(\n)|(\r)|(\t)|(\<+?.+?\>)",
    re.IGNORECASE,
)
# Matches a backslash-escaped apostrophe (\') so it can be replaced by a plain
# apostrophe. The previous pattern compiled to a bare apostrophe match, which
# made the substitution a no-op.
# NOTE(review): assumes the intent was to unescape \' in the text - confirm
# against the data.
regex2 = re.compile(r"\\'")

In [None]:
# Clean every text in one pass: first drop whatever `regex` matches, then
# replace whatever `regex2` matches with a plain apostrophe.
df.loc[:, 'text'] = df['text'].apply(
    lambda raw: regex2.sub("'", regex.sub("", raw))
)

In [None]:
# Rebind DATA to the cleaned frame; DATA is the name objective() passes to Trainer.
DATA = df

In [None]:
@dataclass
class Data:
    """Container for the train/validation/test frames returned by ``shuffle_dist``."""
    # Rows used for fitting.
    train: pd.DataFrame
    # Rows used for per-epoch validation.
    val: pd.DataFrame
    # Held-out rows (the 'Youtube' source in ``shuffle_dist``).
    test: pd.DataFrame

In [None]:
def shuffle_dist(dataframe: pd.DataFrame, split_ratio: float=0.6) -> (Data, str):
    """Split the dataset by its ``source`` column into train/val/test frames.

    A coin flip picks which of the two sources 'Fox' and 'BB' is divided
    between training and validation; the other one goes to training in full.
    Rows with source 'Youtube' always form the test set. Rows are taken in
    their existing order - nothing is shuffled inside a source.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataset with a ``source`` column.
    split_ratio : float
        Fraction of the divided source (leading rows) that goes to training;
        the remaining rows of that source become the validation set.

    Returns
    -------
    (Data, str)
        The three splits, and the name of the source that was divided
        (i.e. the source the validation rows come from).
    """
    # Single RNG draw decides which source is divided between train and val.
    if random.choice([0, 1]):
        divided, whole = 'Fox', 'BB'
    else:
        divided, whole = 'BB', 'Fox'

    source_col = dataframe['source']
    divided_rows = dataframe[source_col == divided]
    cut = int(len(divided_rows) * split_ratio)

    train = pd.concat([dataframe[source_col == whole], divided_rows[:cut]])
    val = divided_rows[cut:]
    test = dataframe[source_col == 'Youtube']

    return Data(train, val, test), divided


In [None]:
def one_hot(conf_scores, cls_indices, num_classes=6):
  """Build a soft multi-label target row from annotator votes.

  Each confidence in ``conf_scores`` is added to the slot named by the
  matching entry of ``cls_indices``; votes for the same class accumulate.

  Parameters
  ----------
  conf_scores : sequence or array of float
      One confidence per vote.
  cls_indices : sequence or array of int
      Class index of each vote, same length as ``conf_scores``; every value
      must be in ``[0, num_classes)``.
  num_classes : int
      Width of the returned row (defaults to the 6 classes used so far).

  Returns
  -------
  torch.Tensor
      Float tensor of shape ``(1, num_classes)``.
  """
  # as_tensor converts numpy arrays directly instead of going through a
  # Python list of arrays.
  index = torch.as_tensor(cls_indices, dtype=torch.long).reshape(1, -1)
  weights = torch.as_tensor(conf_scores, dtype=torch.float).reshape(1, -1)
  target = torch.zeros((1, num_classes), dtype=torch.float)

  # scatter_add_ is the supported way to accumulate a tensor src;
  # scatter_(..., reduce='add') with a tensor src is deprecated.
  return target.scatter_add_(1, index, weights)


In [None]:
class BiasDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column of a dataframe.

    Each item is a dict with the raw text and the tokenizer's ``input_ids``
    and ``attention_mask``. When the row also carries ``conf_scores`` and
    ``class_indices`` a ``target`` entry built by ``one_hot`` is added;
    rows from unlabeled frames simply omit it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``text`` column and, optionally, the label columns.
    tokenizer : callable
        Called as ``tokenizer(text, **config)``; the result must support
        ``['input_ids']`` and ``['attention_mask']``.
    max_len : int
        Length every text is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Keyword arguments for every tokenizer call. The key is 'max_length'
        # (the name the tokenizer is called with), not 'max_len'.
        self.config = {
            'max_length': max_len,
            'return_attention_mask': True,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
            'return_token_type_ids': False
        }

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        entry = self.df.iloc[item]
        text = entry.text
        encoding = self.tokenizer(text, **self.config)

        sample = {
            'text': text,
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
        }

        # Attribute access on a row raises AttributeError (not KeyError) for a
        # missing column, so test for the label columns explicitly instead of
        # relying on an exception.
        if 'conf_scores' in entry.index and 'class_indices' in entry.index:
            sample['target'] = one_hot(entry['conf_scores'], entry['class_indices'])

        return sample

In [None]:
class BackBone(nn.Module):
    """Encoder plus a small classification head producing 6 logits.

    The encoder's pooled output is passed through either a Hopfield pooling
    layer or a linear projection, then dropout and a final linear layer.

    Parameters
    ----------
    base : nn.Module
        Encoder exposing ``config.hidden_size`` and returning an object with
        ``pooler_output`` and ``attentions`` attributes.
    trials : optuna trial or None
        When given, the dropout rate is sampled from it under the name
        ``'dropout_rate'`` in [0.2, 0.5]; when ``None`` the fixed
        ``DEFAULT_DROPOUT`` is used.
    isFrozen : bool
        If true, gradients are disabled for every encoder parameter.
    useHopfieldPool : bool
        Selects the Hopfield pooling head instead of the linear projection.
    """

    # Used when no trial is supplied: midpoint of the sampled range.
    DEFAULT_DROPOUT = 0.35

    def __init__(self, base, trials=None, isFrozen=True, useHopfieldPool=False):
        super().__init__()

        self.bert = base
        self.useHopfieldPool = useHopfieldPool

        if isFrozen:
            for param in self.bert.parameters():
                param.requires_grad = False

        hidden_size = self.bert.config.hidden_size

        if useHopfieldPool:
            output_size = hidden_size
            self.hopfield_pool = HopfieldPooling(hidden_size, hidden_size=32, output_size=output_size, num_heads=1)
        else:
            output_size = 512
            self.lin0 = nn.Linear(hidden_size, output_size)

        # `trials` defaults to None, so it must not be dereferenced blindly.
        if trials is None:
            dropout_rate = self.DEFAULT_DROPOUT
        else:
            # suggest_float is the non-deprecated equivalent of suggest_uniform.
            dropout_rate = trials.suggest_float('dropout_rate', 0.2, 0.5)
        self.dropout = nn.Dropout(dropout_rate)

        self.lin1 = nn.Linear(output_size, 6)

    def forward(self, input_ids, attention_mask = None):
        """Return ``(logits, attentions)`` for a batch of token ids.

        ``attentions`` is whatever the encoder reports; the encoder is called
        with ``output_attentions=False``.
        """
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=False
            )

        out = output.pooler_output

        if self.useHopfieldPool:
            # Add a length-1 axis at dim 1 before pooling.
            out = self.hopfield_pool(out.unsqueeze(1))
        else:
            out = self.lin0(out)

        out = self.lin1(self.dropout(out))

        return out, output.attentions


In [None]:
class Trainer:
    """Fit and validate a model on the bias dataset, logging to wandb.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(input_ids, attention_mask)`` and expected to return
        ``(logits, attentions)``.
    tokenizer : callable
        Forwarded to ``BiasDataset``.
    dataframe : pd.DataFrame
        Full dataset; split by ``shuffle_dist``.
    criterion : callable
        Loss called as ``criterion(logits, targets)``.
    batch_size : int
        Batch size of every DataLoader.
    trial : optuna trial or None
        When given, its ``number`` is attached to every wandb record.
    """

    # Number of top-k cut-offs evaluated (k = 1..TOP_K).
    TOP_K = 3
    # Number of target classes.
    NUM_CLASSES = 6
    # Batch-level losses are logged once per this many steps.
    LOG_EVERY = 25

    def __init__(self, model, tokenizer, dataframe, criterion, batch_size, trial=None):

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.model = model.to(self.device)

        self.trial = trial

        self.batch_size = batch_size

        self.bce_loss = criterion

        self.label_binarizer = MultiLabelBinarizer(classes=[0,1,2,3,4,5])

        self._init(dataframe, tokenizer)

    def _init(self, df, tokenizer):
        """Split ``df`` and build the train/val/test DataLoaders."""
        # s_type records which source the validation rows came from; it is
        # logged with every epoch.
        dataset, self.s_type = shuffle_dist(df)

        train_ds = BiasDataset(dataset.train, max_len=MAX_LEN, tokenizer=tokenizer)
        val_ds = BiasDataset(dataset.val, max_len=MAX_LEN, tokenizer=tokenizer)
        test_ds = BiasDataset(dataset.test, max_len=MAX_LEN, tokenizer=tokenizer)

        self.train_dl = DataLoader(train_ds, num_workers=NUM_WORKERS, batch_size=self.batch_size, shuffle=True)
        # Evaluation loaders keep the dataset order.
        self.test_dl = DataLoader(test_ds, num_workers=NUM_WORKERS, batch_size=self.batch_size, shuffle=False)
        self.val_dl = DataLoader(val_ds, num_workers=NUM_WORKERS, batch_size=self.batch_size, shuffle=False)

    def _trial_tag(self):
        """Return ``{'trial_number': n}`` for wandb records, or ``{}`` without a trial."""
        if self.trial is None:
            return {}
        return {'trial_number': self.trial.number}

    def _epoch_metrics(self, topk_preds, topk_targs):
        """Compute ranking metrics from accumulated top-k index arrays.

        Both arguments are lists with one array of class indices per cut-off
        k; each row is turned into a 0/1 label-indicator vector first.

        Returns ``(average_precision, map_, prec_recall_f1)``: per-class
        average precision for each k, macro-averaged precision for each k,
        and per-class (precision, recall, f1) triples for each k.
        """
        ks = range(self.TOP_K)
        pred_matrix = [self.label_binarizer.fit_transform(topk_preds[k]) for k in ks]
        targ_matrix = [self.label_binarizer.fit_transform(topk_targs[k]) for k in ks]

        average_precision = [
            average_precision_score(targ_matrix[k], pred_matrix[k], average=None) for k in ks
        ]
        map_ = [
            average_precision_score(targ_matrix[k], pred_matrix[k], average='macro') for k in ks
        ]
        prec_recall_f1 = [
            [
                precision_recall_fscore_support(
                    targ_matrix[k][:, c], pred_matrix[k][:, c], average='micro'
                )[0:3]
                for c in range(self.NUM_CLASSES)
            ]
            for k in ks
        ]

        return average_precision, map_, prec_recall_f1

    def compute_metrics(self, logits, targets, end_of_epoch=False, isTrain=True):
        """Compute the loss and, outside training, ranking information.

        * ``isTrain=True``: returns the loss only.
        * ``isTrain=False``: returns ``(loss, topk_preds, topk_targs)`` where
          the two lists hold, for k = 1..TOP_K, the indices of the k largest
          softmax probabilities and of the k largest target values per row.
        * ``end_of_epoch=True, isTrain=False``: ``logits``/``targets`` are the
          accumulated top-k lists and the result of ``_epoch_metrics`` is
          returned.
        """
        if end_of_epoch and not isTrain:
            return self._epoch_metrics(logits, targets)

        loss = self.bce_loss(logits, targets)

        if isTrain:
            return loss

        prob = F.softmax(logits, dim=1).cpu()
        targets_cpu = targets.cpu()

        topk_preds = [prob.topk(k).indices.numpy() for k in range(1, self.TOP_K + 1)]
        topk_targs = [targets_cpu.topk(k).indices.numpy() for k in range(1, self.TOP_K + 1)]

        return loss, topk_preds, topk_targs

    def fit(self, epochs, lr):
        """Train for ``epochs`` epochs, validating and logging after each.

        Returns ``(val_loss, map_, flops_count)`` from the last epoch:
        the list of per-batch validation losses, the list of macro average
        precisions (one per top-k cut-off) and the model's FLOPs estimate.

        Raises
        ------
        ValueError
            If ``epochs`` is smaller than 1 (there would be nothing to return).
        """
        if epochs < 1:
            raise ValueError("epochs must be at least 1")

        config = wandb.config
        config.batch_size = self.batch_size
        config.epochs = epochs
        config.lr = lr
        config.no_cuda = False
        config.log_interval = 10

        optimizer = AdamW(self.model.parameters(), lr=config.lr, correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(self.train_dl)* epochs)

        trial_tag = self._trial_tag()

        for epoch in tqdm(range(config.epochs)):

            train_loss = self.train(optimizer, scheduler)

            val_loss, avg_prec, map_, prec_rec_f1 = self.val()

            # commit=False: the training record is merged into the validation
            # record logged right after it.
            wandb.log({'train_loss_epoch': np.mean(train_loss), 'epoch': epoch, **trial_tag}, commit=False)

            wandb.log({
                'val_loss_epoch': np.mean(val_loss),
                'average_precision': avg_prec,
                'mean_avg_precision': map_,
                'precision_recall_f1': prec_rec_f1,
                'epoch': epoch,
                **trial_tag,
                'source_val': self.s_type,
            })

        # Probe the model once with a real batch to estimate its FLOPs. The
        # probe must live on the same device as the model.
        sample = next(iter(self.train_dl))
        probe_ids = sample['input_ids'].squeeze(1).to(self.device)
        # NOTE(review): get_model_complexity_info is not imported anywhere in
        # the visible source (presumably ptflops) - confirm it is in scope.
        flops_count, params_count = get_model_complexity_info(self.model,
                                         tuple(probe_ids.size()),
                                         input_constructor=lambda _shape: { 'input_ids': probe_ids },
                                         print_per_layer_stat=False, as_strings=False, verbose=False)

        return val_loss, map_, flops_count

    def train(self, optimizer, scheduler):
        """Run one training epoch and return the list of per-batch losses."""
        self.model.train()

        total_loss = []
        steps = 0
        trial_tag = self._trial_tag()

        for entry in self.train_dl:

            input_ids = entry['input_ids'].to(self.device)
            attention_mask = entry['attention_mask'].to(self.device)
            targets = entry['target'].to(self.device)

            optimizer.zero_grad()

            # The dataset yields (1, seq) encodings and (1, classes) targets
            # per item; drop that axis after batching.
            logits, _ = self.model(input_ids.squeeze(1), attention_mask.squeeze(1))

            loss = self.compute_metrics(logits, targets.squeeze(1))

            loss.backward()
            optimizer.step()
            scheduler.step()

            steps += 1

            if (steps + 1) % self.LOG_EVERY == 0:
                wandb.log({'train_loss_batch': loss.item(), 'steps': steps, **trial_tag})

            total_loss.append(loss.item())

        return total_loss

    def val(self):
        """Run one validation pass.

        Returns ``(total_loss, avg_prec, map_, prec_rec_f1)``: the per-batch
        losses followed by the three results of ``_epoch_metrics``.

        Raises
        ------
        ValueError
            If the validation loader yields no batches.
        """
        self.model.eval()

        total_loss = []
        # One list of per-batch arrays for each top-k cut-off; concatenated
        # once at the end instead of after every batch.
        pred_batches = [[] for _ in range(self.TOP_K)]
        targ_batches = [[] for _ in range(self.TOP_K)]

        steps = 0
        trial_tag = self._trial_tag()

        with torch.no_grad():

            for entry in self.val_dl:

                steps += 1

                input_ids = entry['input_ids'].to(self.device)
                attention_mask = entry['attention_mask'].to(self.device)
                targets = entry['target'].to(self.device)

                logits, _ = self.model(input_ids.squeeze(1), attention_mask.squeeze(1))

                loss, topk_preds, topk_targs = self.compute_metrics(logits, targets.squeeze(1), isTrain=False)

                if (steps + 1) % self.LOG_EVERY == 0:
                    wandb.log({'val_loss_batch': loss.item(), 'steps': steps, **trial_tag})

                for k in range(self.TOP_K):
                    pred_batches[k].append(topk_preds[k])
                    targ_batches[k].append(topk_targs[k])

                total_loss.append(loss.item())

            if not total_loss:
                raise ValueError("validation loader produced no batches")

            topk_predictions = [np.concatenate(chunks, axis=0) for chunks in pred_batches]
            topk_targets = [np.concatenate(chunks, axis=0) for chunks in targ_batches]

            avg_prec, map_, prec_rec_f1 = self.compute_metrics(topk_predictions, topk_targets, end_of_epoch=True, isTrain=False)

        return total_loss, avg_prec, map_, prec_rec_f1

    def test(self):
        """Placeholder for test-set evaluation; not implemented."""
        pass

In [None]:
def objective(trial):
    """Optuna objective: train one model and report three scalar objectives.

    Builds a ``BackBone`` on the shared encoder (which samples the dropout
    rate from ``trial``), samples a learning rate, trains for one epoch and
    returns ``(validation loss, mean average precision, FLOPs)``.

    Returns
    -------
    tuple of float
        One value per study direction. ``Trainer.fit`` reports per-batch
        losses and one mAP per top-k cut-off, so both are averaged here to
        give optuna the scalars it needs.
    """
    criterion = nn.BCEWithLogitsLoss()

    model = BackBone(BASE_MODEL, trial)

    wandb.watch(model, criterion, log='all')

    trainer = Trainer(model, TOKENIZER, DATA, criterion, BATCH_SIZE, trial)

    # suggest_float(..., log=True) is the non-deprecated equivalent of
    # suggest_loguniform.
    lr = trial.suggest_float('learning_rate', 2e-5, 5e-5, log=True)

    epochs = 1

    val_loss, map_, flops_count = trainer.fit(epochs, lr)

    # NOTE(review): averaging mAP over the top-k cut-offs is one reasonable
    # reduction; switch to a single cut-off if that is the intended target.
    return float(np.mean(val_loss)), float(np.mean(map_)), float(flops_count)




In [None]:
def main():
    """Start a wandb run, then run the optuna study and return it.

    The study has three objectives, optimised in the order
    minimise / maximise / minimise, and runs ``TRIALS`` trials of
    ``objective``.
    """
    # Credentials are expected to be configured already; wandb.login(key=...)
    # can be called here otherwise.
    wandb.init(
        project="final-paper",
        id='my',
        save_code=True,
        name='temp',
        reinit=True,
        entity="walexi4great",
    )

    # DATA_ROOT, BATCH_SIZE, NUM_WORKERS, MAX_LEN, TRIALS, TOKENIZER,
    # BASE_MODEL and DATA are the module-level settings; override them there
    # when running against local copies of the dataset and checkpoint.

    directions = ['minimize', 'maximize', 'minimize']
    study = optuna.create_study(study_name='Final Paper', directions=directions)
    study.optimize(objective, n_trials=TRIALS)

    return study

#     print("Number of finished trials: {}".format(len(study.trials)))

#     print("Best trial:")
#     trial = study.best_trial

#     print("  Value: {}".format(trial.value))

#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))

#     # Visualize the optimization history.
#     plot_optimization_history(study).show()

#     # Visualize the learning curves of the trials.
#     plot_intermediate_values(study).show()

#     # Visualize high-dimensional parameter relationships.
#     plot_parallel_coordinate(study).show()

#     # Select parameters to visualize.
#     plot_parallel_coordinate(study, params=["lr_init", "n_units_l0"]).show()

#     # Visualize hyperparameter relationships.
#     plot_contour(study).show()

#     # Select parameters to visualize.
#     plot_contour(study, params=["n_units_l0", "n_units_l1"]).show()

#     # Visualize individual hyperparameters.
#     plot_slice(study).show()

#     # Select parameters to visualize.
#     plot_slice(study, params=["n_units_l0", "n_units_l1"]).show()

#     # Visualize parameter importances.
#     plot_param_importances(study).show()



In [None]:
# Run the hyper-parameter search; `res` is the optuna study returned by main().
res = main()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[32m[I 2021-02-24 14:01:23,964][0m A new study created in memory with name: Final Paper[0m
  0%|          | 0/1 [00:00<?, ?it/s]