In [2]:
import sys
sys.path.append("../src")

In [3]:
import torch
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from labels import LABELS, label2id, id2label
import pandas as pd

In [14]:
df = pd.read_csv('../data/processed/fineprint_clauses_train_ready.csv')
df.head()

Unnamed: 0,clause,label,source_policy,label_id
0,Communicate with you We use information we col...,none,youtube_privacy,0
1,"To have any such information deleted, submit t...",none,hulu_privacy,0
2,Because many of our services won’t function wi...,tracking,paypal_privacy,2
3,"For example, Uber uses identity verification t...",none,uber_driversdelivery_privacy,0
4,Car fleet information including capacities is ...,none,priceline_TOS,0


In [15]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label_id"],
    random_state=42,
)

len(train_df), len(test_df)

(10596, 2650)

## Creating huggingFace dataset

In [16]:
# keeping only clause + label_id
train_dataset = Dataset.from_pandas(train_df[["clause", "label_id"]])
test_dataset = Dataset.from_pandas(test_df[["clause", "label_id"]])

In [17]:
# load tokenizer and model

model_name = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)


In [18]:
# tokenizer function
def tokenize_batch(batch):
    return tokenizer(
        batch["clause"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )


In [19]:
train_dataset = train_dataset.map(tokenize_batch, batched=True)
test_dataset  = test_dataset.map(tokenize_batch, batched=True)

Map: 100%|██████████| 10596/10596 [00:00<00:00, 21080.60 examples/s]
Map: 100%|██████████| 2650/2650 [00:00<00:00, 22316.13 examples/s]


In [20]:
# rename only if label_id exists
if "label_id" in train_dataset.column_names:
    train_dataset = train_dataset.rename_column("label_id", "labels")

if "label_id" in test_dataset.column_names:
    test_dataset = test_dataset.rename_column("label_id", "labels")


In [21]:
# remove hf auto columns
# ensures trainer only sees correct columns

def drop_index_cols(ds):
    idx_cols = [c for c in ds.column_names if c.startswith("__")]
    return ds.remove_columns(idx_cols) if idx_cols else ds

train_dataset = drop_index_cols(train_dataset)
test_dataset  = drop_index_cols(test_dataset)

In [22]:
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [23]:
# load the model 

num_labels = len(LABELS)

model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# metrics function

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# training arguements
training_args = TrainingArguments(
    output_dir="../models/fineprint-distilbert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


