# Set Benchmarking Parameters

Pick model names based on https://huggingface.co/models. Then just run all cells.

In [1]:
dataset_path = r"data/feedback_prize/train.csv"
batch_size = 2
epochs = 5
learning_rate = 2e-5
weight_decay = 0.01
# model_checkpoints = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base", "distilgpt2"]
model_checkpoints = ["distilgpt2"]

# Imports

In [2]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import PreTrainedModel
from transformers.pipelines.pt_utils import KeyDataset

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import pandas as pd
import numpy as np
import logging
from glob import glob
from os import path

from IPython.display import HTML, display

import torch

In [3]:
# set up gpu device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device)
device

device(type='cuda', index=0)

# Load the Dataset

In [4]:
df = pd.read_csv(dataset_path, header=0, encoding= 'unicode_escape')

df = df.set_index("id")
df = df.rename(columns={"discourse_text": "text"})

df["discourse_type"] = pd.Categorical(df["discourse_type"])
df["label"] = df["discourse_type"].cat.codes

df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,4
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,5
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,3
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,3
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,0
...,...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...,3
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...,3
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838,5
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...,3


In [5]:
# define category_codes to use for labelling in the model id2label field
category_codes = dict(zip(range(df["discourse_type"].cat.categories.size),df["discourse_type"].cat.categories))
category_codes

{0: 'Claim',
 1: 'Concluding Statement',
 2: 'Counterclaim',
 3: 'Evidence',
 4: 'Lead',
 5: 'Position',
 6: 'Rebuttal'}

In [6]:
dataset = Dataset.from_pandas(df[["text", "label"]])

train_test_dataset = dataset.train_test_split(test_size=0.3)
test_validation_dataset = train_test_dataset["test"].train_test_split(test_size=0.333)

train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_validation_dataset['train'],
    'valid': test_validation_dataset['test']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 101005
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 28873
    })
    valid: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 14415
    })
})

# Downstream Retraining Loop

In [7]:
def compute_metrics(eval_preds):
    metric_acc = load_metric("accuracy")
    metric_prec = load_metric("precision")
    metric_recall = load_metric("recall")
    metric_f1 = load_metric("f1")
    
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    
    acc = metric_acc.compute(predictions=predictions, references=labels)
    prec = metric_prec.compute(predictions=predictions, references=labels, average="weighted")
    recall = metric_recall.compute(predictions=predictions, references=labels, average="weighted")
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    kappa = cohen_kappa_score(predictions, labels)

    return {"accuracy": acc['accuracy'], "precision": prec['precision'],
            "recall": recall['recall'], "f1": f1['f1'], "kappa": kappa}

In [8]:
for model_checkpoint in model_checkpoints:
    # instantiate a model and assign it to a gpu/cpu device
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, 
                                                               num_labels=len(category_codes)).to(device)
    # instantiate tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
        
    def preprocess_function(examples):        
        return tokenizer(examples["text"], truncation=True, padding=False)
    
    # encode dataset using tokenizer
    encoded_dataset = train_test_valid_dataset.map(preprocess_function, batched=True)
    columns_to_return = ['input_ids', 'label', 'attention_mask']
    encoded_dataset.set_format(type='torch', columns=columns_to_return)
    
    
    # training arguments
    model_name = model_checkpoint.split("/")[-1]
    args = TrainingArguments(
        f"models_gitignored/{model_name}-finetuned-sentence-classification",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        logging_dir = f'logs/{model_name}-finetuned-sentence-classification/save_metrics' # save directory for save_metrics() files
    )

    # trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    
    # begin training
    torch.cuda.empty_cache()
    train_result = trainer.train()
    
    # log train results (2 different ways just in case)
    metrics = train_result.metrics
    trainer.save_metrics("all", metrics)
    
    with open(f"logs/{model_name}-finetuned-sentence-classification/log_history.txt", "w") as fout:
        for obj in trainer.state.log_history:
            print(obj, file=fout)

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

The following columns in the training set  don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: id, text.
***** Running training *****
  Num examples = 101005
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 252515


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Kappa
1,1.0143,1.013975,0.775153,0.773902,0.775153,0.770148,0.694934
2,0.9232,1.067294,0.787275,0.786434,0.787275,0.784513,0.713879
3,0.7485,1.147496,0.787518,0.786181,0.787518,0.786001,0.715873
4,0.7771,1.211989,0.786894,0.786136,0.786894,0.785319,0.715444
5,0.6563,1.286081,0.78679,0.786927,0.78679,0.785826,0.715671


The following columns in the evaluation set  don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****
  Num examples = 28873
  Batch size = 2
Saving model checkpoint to models_gitignored/distilgpt2-finetuned-sentence-classification/checkpoint-50503
Configuration saved in models_gitignored/distilgpt2-finetuned-sentence-classification/checkpoint-50503/config.json
Model weights saved in models_gitignored/distilgpt2-finetuned-sentence-classification/checkpoint-50503/pytorch_model.bin
tokenizer config file saved in models_gitignored/distilgpt2-finetuned-sentence-classification/checkpoint-50503/tokenizer_config.json
Special tokens file saved in models_gitignored/distilgpt2-finetuned-sentence-classification/checkpoint-50503/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: id, text.
***** 