In [None]:
import pandas as pd
import numpy as np
import torch

from sklearn.metrics import f1_score,  precision_score, recall_score, hamming_loss

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
import evaluate

  from .autonotebook import tqdm as notebook_tqdm





## **Data Preparation**

In [None]:
# Read the text and label CSV files of the three splits from the dataset
# directory. Every file is semicolon-delimited.
data_path = "../dataset"


def _read_split(stem):
    """Return `<data_path>/<stem>.csv` parsed as a semicolon-delimited DataFrame."""
    return pd.read_csv(f'{data_path}/{stem}.csv', delimiter=';')


df_train_texts, df_val_texts, df_test_texts = (
    _read_split(stem) for stem in ('train_texts', 'val_texts', 'test_texts')
)
df_y_train, df_y_val, df_y_test = (
    _read_split(stem) for stem in ('y_train', 'y_val', 'y_test')
)

# Report the size of the training texts and preview their first rows.
print(df_train_texts.shape)
df_train_texts.head()

(2773, 1)


Unnamed: 0,text
0,Er is een teek op mijn been. Ik ben bang dat d...
1,roodheid
2,schilfering
3,Ik heb gisteren naar het bos geweest en zie nu...
4,Ik voelde iets prikken


In [None]:
# Report the size of the validation labels, then preview the first five
# label columns of the leading rows.
print(df_y_val.shape)
df_y_val.head().iloc[:, :5]

(604, 74)


Unnamed: 0,"Niet lekker voelen, algehele malaise",Beenklachten,Bloedneus,Misselijkheid en overgeven,Brandwond
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


In [None]:
# Convert the frames to NumPy arrays.
# The texts become 1D arrays of strings (one entry per sample); the labels
# become 2D arrays with one row per sample and one column per label.
train_texts = df_train_texts['text'].values
val_texts = df_val_texts['text'].values
test_texts = df_test_texts['text'].values

y_train = df_y_train.values
y_val = df_y_val.values
y_test = df_y_test.values

# Every split needs exactly one label row per text.
assert train_texts.shape[0] == y_train.shape[0], "Mismatch in train data and labels"
assert val_texts.shape[0] == y_val.shape[0], "Mismatch in val data and labels"
assert test_texts.shape[0] == y_test.shape[0], "Mismatch in test data and labels"

# The label columns must be the same, in the same order, in every split:
# the model's output size is taken from y_train, while predictions are later
# mapped back to names through df_y_test.columns.
assert (
    list(df_y_train.columns) == list(df_y_val.columns) == list(df_y_test.columns)
), "Mismatch in label columns between splits"

print(train_texts.shape, y_train.shape, test_texts.shape)
train_texts

(2773,) (2773, 74) (597,)


array(['Er is een teek op mijn been. Ik ben bang dat die er al een tijdje op heeft gezeten',
       'roodheid', 'schilfering', ...,
       'Vannacht met slapen denk ik gekke beweging gemaakt, want mn nek is nu helemaal stijf kan niet meer naar rechts kijken',
       'Heb al langere tijd pijn in mn nek, krijg dan soms tintelingen over mijn arm, heb dan ook minder kracht in mijn arm',
       'Doet zeer als ik mn hoofd beweeg'], dtype=object)

## **Model Building: Transformers**

In [None]:
# Pick the compute device: GPU 0 when CUDA is usable, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tell the reader which of the two cases applies.
if device.type == "cuda":
    print("CUDA is available. You can use GPU.")
else:
    print("CUDA is not available. Check your GPU setup.")

In [None]:
model_name = "GroNLP/bert-base-dutch-cased"

# The classification head is not part of the checkpoint and is initialised
# randomly; seed the RNG first so that initialisation is repeatable.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# One output per label column. The problem type is stated explicitly so the
# multi-label loss does not depend on the dtype of the labels passed in later.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=y_train.shape[1],
    problem_type="multi_label_classification",
)

# Trailing semicolon keeps the notebook from printing the whole module tree.
model.to(device);

tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/437M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize each text individually and aggregate the results
def tokenize_and_aggregate(texts, tok=None):
    """Tokenize texts one at a time and collect the per-text tensors.

    Args:
        texts: Iterable of strings to tokenize.
        tok: Optional tokenizer callable, invoked as
            ``tok(text, padding='max_length', truncation=True, return_tensors='pt')``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        dict mapping 'input_ids' and 'attention_mask' to lists holding one
        tensor per text. 'token_type_ids' is included only when the tokenizer
        produces it, so consumers that index every field per sample never hit
        an empty list.
    """
    if tok is None:
        tok = tokenizer

    tokenized_texts = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
    for text in texts:
        tokenized_text = tok(text, padding='max_length', truncation=True, return_tensors='pt')
        # Each call handles a single text; [0] selects that text's row.
        tokenized_texts['input_ids'].append(tokenized_text['input_ids'][0])
        tokenized_texts['attention_mask'].append(tokenized_text['attention_mask'][0])
        if 'token_type_ids' in tokenized_text:
            tokenized_texts['token_type_ids'].append(tokenized_text['token_type_ids'][0])

    # Drop the field entirely when the tokenizer never returned it.
    if not tokenized_texts['token_type_ids']:
        del tokenized_texts['token_type_ids']
    return tokenized_texts

# Tokenize the training texts (the tokenizer is invoked once per text)
tokenized_train_data = tokenize_and_aggregate(train_texts)

In [None]:
# Dataset wrapper that pairs tokenizer outputs with their label rows
class PrepareDataset(torch.utils.data.Dataset):
    """Serve one tokenized sample together with its label row per index.

    `encodings` maps field names to indexable sequences with one entry per
    sample; `labels` is an indexable collection of label rows. The dataset
    length is the number of label rows.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # Labels are handed out as float tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Pair the tokenized training texts with their label rows
train_dataset = PrepareDataset(tokenized_train_data, y_train)

In [None]:
# Tokenize the validation texts with the same routine used for training
tokenized_val_texts = tokenize_and_aggregate(val_texts)

# Pair the tokenized validation texts with their label rows
val_dataset = PrepareDataset(tokenized_val_texts, y_val)

In [None]:
# Metric function handed to the Trainer

def compute_metrics(eval_pred, threshold=0.5):
    """Compute micro-averaged multi-label metrics from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are array-likes with one
            row per sample and one column per label.
        threshold: Probability above which a label counts as predicted.
            Must lie strictly between 0 and 1. Defaults to 0.5.

    Returns:
        dict with keys 'f1', 'precision', 'recall' (all micro-averaged) and
        'hamming_loss'.

    Raises:
        ValueError: If `threshold` is not strictly between 0 and 1.
    """
    if not 0.0 < threshold < 1.0:
        raise ValueError(f"threshold must be in (0, 1), got {threshold!r}")

    logits, labels = eval_pred
    logits = np.asarray(logits)

    # sigmoid(x) > t  <=>  x > log(t / (1 - t)). Comparing in logit space
    # gives the same decision without evaluating np.exp, which overflows for
    # large negative logits.
    logit_threshold = np.log(threshold / (1.0 - threshold))
    predictions = (logits > logit_threshold).astype(int)

    # Micro averaging pools the decisions of all labels before scoring.
    # zero_division=0 scores an undefined ratio as 0 without emitting a warning.
    f1_micro = f1_score(labels, predictions, average='micro', zero_division=0)
    precision = precision_score(labels, predictions, average='micro', zero_division=0)
    recall = recall_score(labels, predictions, average='micro', zero_division=0)
    hamming_loss_value = hamming_loss(labels, predictions)

    return {
        'f1': f1_micro,
        'precision': precision,
        'recall': recall,
        'hamming_loss': hamming_loss_value,
    }

In [None]:
# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust the keyword if the installed version rejects it.
training_args = TrainingArguments(
    # Explicit checkpoint directory (many transformers versions require it).
    output_dir="../models/trf/checkpoints",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation so the best epoch can be
    # restored afterwards; keep only a couple of checkpoints on disk.
    save_strategy="epoch",
    save_total_limit=2,
    # Validation F1 does not improve monotonically, so finish with the
    # best-scoring epoch rather than whatever the last epoch produced.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,No log,0.042389,0.506912,0.755495,0.381415,0.012151,19.6761,30.24,3.812
2,0.037300,0.038545,0.531646,0.763636,0.407767,0.011765,19.7113,30.186,3.805
3,0.025200,0.036886,0.618506,0.745597,0.528433,0.010675,20.303,29.306,3.694
4,0.025200,0.036064,0.616013,0.749503,0.522885,0.010675,19.6727,30.245,3.812
5,0.018400,0.034126,0.643933,0.751852,0.563107,0.010198,19.679,30.235,3.811
6,0.013300,0.03451,0.65322,0.741197,0.583911,0.010152,19.611,30.34,3.824
7,0.013300,0.033632,0.669237,0.753472,0.601942,0.009743,19.7224,30.169,3.803
8,0.010000,0.033354,0.674942,0.765845,0.603329,0.009516,19.6973,30.207,3.808
9,0.008400,0.033306,0.68393,0.758446,0.622746,0.009425,19.7051,30.195,3.806
10,0.008400,0.033429,0.682515,0.763293,0.617198,0.009403,19.6566,30.27,3.816


TrainOutput(global_step=3470, training_loss=0.017206534391177836, metrics={'train_runtime': 2804.8931, 'train_samples_per_second': 9.89, 'train_steps_per_second': 1.237, 'total_flos': 7303418986045440.0, 'train_loss': 0.017206534391177836, 'epoch': 10.0})

In [None]:
# Tokenize the test texts with the same routine used for training
tokenized_test_texts = tokenize_and_aggregate(test_texts)

# Pair the tokenized test texts with their label rows
test_dataset = PrepareDataset(tokenized_test_texts, y_test)

In [None]:
# Score the model on the test dataset
results = trainer.evaluate(test_dataset)

# Keep only the loss and the four metrics of interest, in the order in which
# they appear in the evaluation output
wanted_keys = ('eval_loss', 'eval_f1', 'eval_precision', 'eval_recall', 'eval_hamming_loss')
filtered_results = {name: value for name, value in results.items() if name in wanted_keys}

# One row per metric, with the scores in a single 'Value' column
results_df = pd.Series(filtered_results, name='Value').to_frame()

results_df

Unnamed: 0,Value
eval_loss,0.031674
eval_f1,0.692654
eval_precision,0.766169
eval_recall,0.632011
eval_hamming_loss,0.009158


In [None]:
# Destination directories for the fine-tuned model and its tokenizer
model_save_path = "../models/trf/model_trf"
tokenizer_save_path = "../models/trf/tokenizer_trf"

# Write both to disk so the pair can be reloaded later
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

## **Inference**

In [None]:
# Lookup table from each label column name to its column position
complaints_to_index = dict(zip(df_y_test.columns, range(len(df_y_test.columns))))
complaints_to_index

In [None]:
# Reload the saved model and tokenizer from disk for inference
model = AutoModelForSequenceClassification.from_pretrained(
    "../models/trf/model_trf"
)
tokenizer = AutoTokenizer.from_pretrained(
    "../models/trf/tokenizer_trf"
)

In [None]:
# Pick one test text and tokenize it for the model
input_text = test_texts[4]
print(input_text)
# Truncate exactly as during training, so an over-long text cannot exceed the
# model's maximum sequence length.
inputs = tokenizer(input_text, truncation=True, return_tensors="pt")

ik heb al een week lang last van mijn oren


In [None]:
model.eval()  # Set the model to evaluation mode

# Run the forward pass on whatever device the model lives on; a CPU/GPU
# mismatch between model and inputs would otherwise raise.
model_inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

with torch.no_grad():
    outputs = model(**model_inputs)
logits = outputs.logits

# Independent sigmoid per label, thresholded at 0.5. The tensor is moved to
# the CPU first because NumPy conversion is only possible from host memory.
predictions = (torch.sigmoid(logits).cpu().numpy() > 0.5).astype(int)
predictions

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Names of the label columns whose flag is set in the first (only) prediction row
predicted_labels = [
    name
    for position, name in enumerate(df_y_test.columns)
    if predictions[0][position]
]

predicted_labels

['Oorklachten']