In [1]:
from typing import Literal

import pyarrow.parquet as pq
import pandas as pd

In [2]:
# Since using AWS requires refreshing SSO credentials every day, skip this for
# now for quicker iteration.
# NOTE: the data-loading cell raises NotImplementedError when this is True.
REFRESH_DATA_FROM_S3: bool = False

# Set to `False` to use whole data set, or set to an integer to specify how 
# many rows to include (the data-loading cell keeps the first SUBSET rows).
SUBSET: Literal[False] | int = 400

In [3]:
# Load data
if REFRESH_DATA_FROM_S3:
    # Pulling a fresh copy from S3 has not been implemented yet.
    raise NotImplementedError

# Read the local parquet extract and expose `replied_to` under the name
# `label`, which is the column name used by the rest of the notebook.
df: pd.DataFrame = (
    pq.read_table('../data/preprocessing_input.parquet')
    .to_pandas()
    .rename(columns={'replied_to': 'label'})
)

if SUBSET is not False:
    # Keep only the leading SUBSET rows for quicker iteration.
    df = df.iloc[:SUBSET]

In [4]:
# Keep only the two columns used for modelling and cast the label to int in
# a single chained expression (no column assignment on the selected frame).
df = df[['label', 'body']].astype({'label': int})
df.head(3)

Unnamed: 0,label,body
183cbf647a8af438,0,Good morning families -\r\n\r\nThe technician ...
183cb6c70075704b,0,[http://images.myhealthyfinances.com/EloquaIma...
183ca91acbdcd4f8,0,\n\n\n\n\nWarning:\r\n Low Battery\r\n \...


In [5]:
df.label.value_counts()

0    387
1     13
Name: label, dtype: int64

In [6]:
from datasets import Dataset, DatasetDict

# Positional split with no shuffling: rows before the midpoint become the
# training set, rows from the midpoint onward become the test set.
split_index = len(df) // 2
ds_train: Dataset = Dataset.from_pandas(df.iloc[:split_index])
ds_test: Dataset = Dataset.from_pandas(df.iloc[split_index:])
ds_test

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['label', 'body', '__index_level_0__'],
    num_rows: 200
})

In [7]:
# Bundle both splits under the 'train' / 'test' keys.
ds = DatasetDict({'train': ds_train, 'test': ds_test})
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'body', '__index_level_0__'],
        num_rows: 200
    })
    test: Dataset({
        features: ['label', 'body', '__index_level_0__'],
        num_rows: 200
    })
})

In [8]:
from transformers import AutoTokenizer
checkpoint = "bert-base-cased"

# Load the tokenizer once, at cell scope. `ds.map(..., batched=True)` calls
# `preprocess_function` once per batch of every split, so loading it inside
# the function repeated the same work on each call. Keeping it at cell scope
# also makes `tokenizer` available to later cells.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def preprocess_function(rows):
    """Tokenize the `body` text of a batch of rows.

    Args:
        rows: A batch passed in by `ds.map(..., batched=True)`; it is indexed
            by column name, and only the `body` column is read.

    Returns:
        The tokenizer output for the batch, produced with
        `padding='max_length'` and `truncation=True`.
    """
    text_field: str = "body"
    return tokenizer(
        rows[text_field],
        padding='max_length',
        truncation=True,
    )


ds_tokenized: DatasetDict = ds.map(preprocess_function, batched=True)
ds_tokenized

100%|██████████| 1/1 [00:00<00:00,  3.42ba/s]
100%|██████████| 1/1 [00:00<00:00,  3.79ba/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'body', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['label', 'body', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [9]:
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Class index <-> display name; both directions are handed to the model.
id2label: dict[int, str] = {0: "No", 1: "Yes"}
label2id: dict[str, int] = dict(zip(id2label.values(), id2label.keys()))

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [11]:
import evaluate
import numpy as np
import torch
from torch.nn import functional as f

# Cache for the ROC AUC metric object. It is filled on the first call to
# `compute_auc`, so the metric is loaded once rather than on every
# evaluation pass.
_roc_auc_metric = None


def compute_auc(eval_pred):
    """Compute ROC AUC from raw model logits.

    Args:
        eval_pred: A `(logits, labels)` pair. `logits` is passed to
            `torch.from_numpy`, so it must be a NumPy array; column 1 of its
            softmax is used as the prediction score.

    Returns:
        The result of the 'roc_auc' metric's `compute` call.
    """
    global _roc_auc_metric
    if _roc_auc_metric is None:
        _roc_auc_metric = evaluate.load('roc_auc')

    logits, labels = eval_pred
    # Turn logits into per-class probabilities along the last axis.
    probabilities = f.softmax(torch.from_numpy(logits), dim=-1)
    # Score each example by its probability for class index 1.
    return _roc_auc_metric.compute(
        prediction_scores=probabilities[:, 1],
        references=labels,
    )

In [12]:
from transformers import TrainingArguments, Trainer

# Only the output directory and the evaluation schedule are set explicitly;
# every other training option is left at the `TrainingArguments` defaults.
training_args = TrainingArguments(
    output_dir="huggingface-output", 
    evaluation_strategy="epoch"
)

# Train on the tokenized 'train' split, evaluate on the tokenized 'test'
# split, and report ROC AUC through `compute_auc` at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tokenized['train'],
    eval_dataset=ds_tokenized['test'],
    compute_metrics=compute_auc,
)
# Left as the last expression so the returned TrainOutput is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, body. If __index_level_0__, body are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 200
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 75
  Number of trainable parameters = 108311810
 33%|███▎      | 25/75 [06:11<11:31, 13.84s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, body. If __index_level_0__, body are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
   

{'eval_loss': 0.12983709573745728, 'eval_roc_auc': 0.38051282051282054, 'eval_runtime': 146.0723, 'eval_samples_per_second': 1.369, 'eval_steps_per_second': 0.171, 'epoch': 1.0}


 67%|██████▋   | 50/75 [14:25<05:49, 13.99s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, body. If __index_level_0__, body are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
                                               
 67%|██████▋   | 50/75 [16:51<05:49, 13.99s/it]

{'eval_loss': 0.12911292910575867, 'eval_roc_auc': 0.6523076923076924, 'eval_runtime': 145.2766, 'eval_samples_per_second': 1.377, 'eval_steps_per_second': 0.172, 'epoch': 2.0}


100%|██████████| 75/75 [22:27<00:00, 13.15s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, body. If __index_level_0__, body are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
                                               
100%|██████████| 75/75 [24:50<00:00, 13.15s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 75/75 [24:50<00:00, 19.87s/it]

{'eval_loss': 0.1332726925611496, 'eval_roc_auc': 0.6553846153846153, 'eval_runtime': 142.7567, 'eval_samples_per_second': 1.401, 'eval_steps_per_second': 0.175, 'epoch': 3.0}
{'train_runtime': 1490.0833, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.05, 'train_loss': 0.19033419291178386, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=0.19033419291178386, metrics={'train_runtime': 1490.0833, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.05, 'train_loss': 0.19033419291178386, 'epoch': 3.0})