In [2]:
from nocola_to_df import get_df

# Load each split (train / dev / test) into its own dataframe, in that order.
train_df, dev_df, test_df = (
    get_df(split=split_name) for split_name in ("train", "dev", "test")
)

In [3]:
# NOTE: disabled exploratory cell, kept for reference only.
# When enabled it prints, for every label in the training split, the label's
# entry in `error_mapping` followed by 5 randomly sampled sentences.
# The sampling has no fixed random_state, so the output differs between runs.
# NOTE(review): presumably this has to run before the labels are integer-encoded,
# if `error_mapping` is keyed by the raw label strings - confirm.
# from constants import error_mapping

# class_labels = train_df.label.unique()
# # Iterate over each class label
# for label in class_labels:
#     # Filter the dataframe for the current class label and retrieve a sample of 5 sentences
#     class_samples = train_df[train_df.label == label].sample(5).text
#     # Print the class label and the corresponding sentences
#     print(label, "---", error_mapping[label])
#     texts = class_samples.tolist()
#     for i, text in enumerate(texts):
#         print(f"{i+1}. {text}")
#     print()

In [4]:
# Replace the label names with integer ids. The name -> id mapping is learned
# from the training split only and then applied to every split.
# NOTE: the "label" columns are overwritten in place, so re-running this cell
# re-fits the encoder on the already-encoded ids and `le` loses the original names.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(train_df.label)
for split_df in (train_df, dev_df, test_df):
    split_df["label"] = le.transform(split_df.label)

In [5]:
# Identify the rarest (already encoded) label in the training split and its row count.
label_counts = train_df.label.value_counts()
label_name, min_samples = label_counts.idxmin(), label_counts.min()
label_name, min_samples

(1, 472)

In [6]:
# Rows per encoded label in the training split, most frequent first.
train_df["label"].value_counts()

label
2     15231
13    12614
4     10491
6      9955
9      9705
11     8250
5      4752
10     1876
0      1632
3      1356
12     1160
7      1070
8       851
1       472
Name: count, dtype: int64

In [7]:
import pandas as pd

def get_reduced_dataframe(df, min_samples=8, random_state=42):
    """Down-sample ``df`` while keeping the relative label frequencies.

    The rarest label is reduced to ``min_samples`` rows and every other label is
    scaled by the same factor (``min_samples / rows_of_rarest_label``), rounded
    to the nearest integer, so the class imbalance of ``df`` is preserved.

    Args:
        df: DataFrame with a ``label`` column.
        min_samples: Number of rows to keep for the rarest label; must be >= 1.
            If it exceeds the size of the rarest label, each label is capped at
            the rows it actually has instead of failing.
        random_state: Seed passed to ``DataFrame.sample`` so the subset is
            reproducible.

    Returns:
        A new DataFrame with a fresh ``RangeIndex``. Rows are grouped by label,
        most frequent label first. Empty input yields an empty DataFrame.

    Raises:
        ValueError: If ``min_samples`` is smaller than 1.
    """
    if min_samples < 1:
        raise ValueError(f"min_samples must be >= 1, got {min_samples}")

    label_counts = df["label"].value_counts()
    if label_counts.empty:
        # Nothing to sample from; pd.concat would raise on an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    # Target rows per label: the rarest label maps to exactly `min_samples`.
    targets = label_counts / label_counts.min() * min_samples

    subset_dfs = []
    for label, target in targets.items():
        label_df = df[df["label"] == label]
        # Sampling without replacement cannot return more rows than exist.
        n_rows = min(int(round(float(target))), len(label_df))
        subset_dfs.append(label_df.sample(n=n_rows, random_state=random_state))
    return pd.concat(subset_dfs, axis=0).reset_index(drop=True)

# Shrink the training split: the rarest label keeps 20 rows, the other labels
# are scaled down by the same factor.
subset_train = get_reduced_dataframe(
    df=train_df,
    min_samples=20,
)

In [8]:
from datasets import Dataset

# Wrap the dataframes as `datasets.Dataset` objects:
# reduced training subset, full test split, full dev split (used for evaluation).
dataset, dataset_test, dataset_eval = (
    Dataset.from_pandas(frame) for frame in (subset_train, test_df, dev_df)
)

In [9]:
# Shape of the down-sampled training dataset built from `subset_train`.
dataset.shape

(3363, 2)

In [32]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

from models import SBERT_MODEL

# The classification head gets one output per distinct label in the training subset.
num_classes = len(dataset.unique("label"))

# Load the sentence-transformer body and attach a differentiable head of that size.
pretrained_kwargs = {
    "use_differentiable_head": True,
    "head_params": {"out_features": num_classes},
}
sfit_model = SetFitModel.from_pretrained(SBERT_MODEL, **pretrained_kwargs)
# Note: the differentiable SetFitHead uses BCEWithLogitsLoss (sigmoid, then probabilities rounded to 1 or 0)
# only when a `multi_target_strategy` is passed, i.e. for multi-label classification. In that case the
# "one-vs-rest" and "multi-output" strategies are equivalent for the differentiable SetFitHead.
# No `multi_target_strategy` is set here, so this head does single-label classification over
# `num_classes` outputs and each prediction is a single class index.


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [33]:
# Show which PyTorch build the kernel is running.
import torch
torch.__version__

'2.1.0.dev20230412'

In [34]:
# Pick the best available accelerator instead of assuming Apple's MPS backend,
# so the notebook also runs on CUDA machines and on plain CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Move the model to the chosen device.
sfit_model.to(device)

SetFitModel()

In [35]:
# Sanity check: which encoded labels are present in the training subset.
set(dataset.unique("label"))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}

In [36]:
# Trainer configuration: train on the reduced subset, evaluate on the dev split.
trainer_config = dict(
    model=sfit_model,
    train_dataset=dataset,
    eval_dataset=dataset_eval,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20,  # Number of text pairs to generate for contrastive learning
    num_epochs=1,  # Number of epochs to use for contrastive learning
)
trainer = SetFitTrainer(**trainer_config)

# First training stage.
# NOTE(review): freeze() without arguments presumably freezes the classification
# head so that this train() call only updates the body - confirm against the SetFit docs.
trainer.freeze()
trainer.train()

Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 134520
  Num epochs = 1
  Total optimization steps = 8408
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8408 [00:00<?, ?it/s]

In [None]:
# Second training stage: unfreeze again but keep the body frozen.
trainer.unfreeze(keep_body_frozen=True)

head_stage_args = {
    "num_epochs": 25,
    "batch_size": 16,
    "body_learning_rate": 1e-5,  # LR of body
    "learning_rate": 1e-2,  # LR of head
    "l2_weight": 0.0,
}
trainer.train(**head_stage_args)

In [48]:
# Run the trainer's evaluation; the trainer was built with
# eval_dataset=dataset_eval (the dev split) and metric="accuracy".
metrics = trainer.evaluate()

***** Running evaluation *****


In [49]:
# Display the metrics returned by trainer.evaluate().
metrics

{'accuracy': 0.19243119266055045}

In [51]:
# Smoke test: run the model on two ad-hoc sentences and show the predicted label ids.
sample_sentences = [
    "i loved the spiderman movie!",
    "pineapple on pizza is the worst 🤮",
]
preds = sfit_model(sample_sentences)
preds

tensor([2, 6])

In [52]:
# Map the predicted ids back to the original label names.
# scikit-learn expects array-likes on the host, so a torch tensor is detached and
# copied to CPU first; this also works when the predictions live on an accelerator.
pred_ids = preds.detach().cpu().numpy() if torch.is_tensor(preds) else preds
le.inverse_transform(pred_ids)

array(['F', 'ORT'], dtype=object)