## Using RoBERTa with synthetic data

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import os
import numpy as np
from seqeval.metrics import recall_score, precision_score, accuracy_score
from functools import partial
from datasets import concatenate_datasets

import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

In [2]:
class CFG:
    """Notebook-wide settings: label vocabulary, checkpoint name, data/model paths and TrainingArguments."""

    # Tag vocabulary: all B-* tags, then the I-* tags in the same order, then 'O'.
    # The position of a tag in this list is its integer id.
    LABELS_LIST = [
        'B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM',
        'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS',
        'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM',
        'I-PHONE_NUM', 'I-URL_PERSONAL', 'I-STREET_ADDRESS',
        'O',
    ]

    # tag -> id, plus one extra entry mapping '[PAD]' to -100
    label2id = dict(zip(LABELS_LIST, range(len(LABELS_LIST))))
    label2id['[PAD]'] = -100
    # Inverse of label2id; note that it therefore also contains -100 -> '[PAD]'
    # (16 entries for 15 real tags).
    id2label = dict(zip(label2id.values(), label2id.keys()))

    seed = 42

    # model checkpoint
    model_name = 'roberta-base'

    # path to the training data (relative to the current working directory)
    train_data_path = os.path.join('..', 'data', 'raw', 'synthetic', 'mixtral.json')

    # directory where the model will be saved: <cwd>/../models/mixtral_roberta
    local_path = os.path.abspath('')
    target_dir = os.path.join(local_path, '..', 'models', 'mixtral_roberta')

    # training arguments: checkpoints go to <target_dir>/trainer, evaluation runs every epoch
    training_args = TrainingArguments(
        evaluation_strategy="epoch",
        output_dir=os.path.join(target_dir, 'trainer'),
    )

In [3]:
# Tokenizer for the CFG.model_name checkpoint, created with add_prefix_space=True.
# NOTE(review): presumably required because preprocess_data tokenizes pre-split words — confirm in src/data/dataloader.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, add_prefix_space=True)

In [4]:
# Load the synthetic dataset from CFG.train_data_path.
data = get_dataset_from_path(CFG.train_data_path)
# Column names passed to preprocess_data as keys_to_flatten; the same list is reused for the Kaggle dataset later.
keys_to_flatten = ['input_ids', 'attention_mask', 'org_word_ids', 'document']
# Run the project preprocessing with the notebook tokenizer and the CFG label mapping.
data = preprocess_data(data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)

encoding the labels...


Map:   0%|          | 0/2355 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/2355 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 5/5 [00:03<00:00,  1.27it/s]


In [5]:
# Sanity check: decode the input_ids of one preprocessed row (index 3) back into text.
tokenizer.decode(data['input_ids'][3])

"<s> Rose - Mai Rodriguez | PIN # 3814374 \n 501 Andrea Highway \n North Tara, MP 38342 | rose-mairodriguez@comcast.net \n\n Introduction - Identifying the Challenge : \n\n As a User Experience Designer at a tech company, I am frequently confronted with complex challenges related to designing user - friendly interfaces that meet the needs and expectations of our diverse user base. One such challenge that I recently encountered was how to redesign our company's mobile app to better accommodate the needs of our aging user population. This challenge was significant and complex because older adults often have unique needs and abilities that are not typically considered in mainstream design practices. Additionally, this population has been historically underserved in the tech industry, making it even more critical to address their needs effectively. \n\n Selection of the Tool or Approach : \n\n To address this challenge, I chose to apply the User - Centered Design ( UCD ) approach, which is

In [12]:
# Split the preprocessed synthetic data into train / validation / test, passing CFG.seed as the split seed.
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

In [13]:
def get_fbeta_score(precision, recall, beta=5.0):
    """Return the F-beta score for the given precision and recall.

    Computes (1 + beta^2) * precision * recall / (beta^2 * precision + recall).
    With beta > 1 recall is weighted more heavily than precision.

    Args:
        precision: precision value.
        recall: recall value.
        beta: weight of recall relative to precision (default 5.0).

    Returns:
        The F-beta score, or 0.0 when the denominator is zero
        (e.g. precision and recall are both 0).
    """
    b2 = beta ** 2
    denominator = b2 * precision + recall
    if denominator == 0:
        # Without this guard the division raises ZeroDivisionError for Python
        # floats (or yields nan for numpy scalars) when there are no true positives.
        return 0.0
    return (1 + b2) * ((precision * recall) / denominator)

def compute_metrics(p, labels_list):
    """Compute recall, precision and F-beta for a token-classification evaluation.

    Args:
        p: pair that unpacks into (predictions, labels); predictions are
            reduced to class ids with an argmax over axis 2.
        labels_list: sequence mapping a class id to its tag string.

    Returns:
        dict with the keys 'recall', 'precision' and 'fbeta_score'.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop positions whose gold label is the ignored index (-100, special tokens)
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([labels_list[pred_id] for pred_id, _ in kept])
        true_labels.append([labels_list[gold_id] for _, gold_id in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': get_fbeta_score(precision, recall),
    }

In [8]:
# Token-classification head on top of the pretrained checkpoint.
# The head is sized from CFG.LABELS_LIST (the real tags only): CFG.id2label / CFG.label2id
# also contain the '[PAD]' <-> -100 entry, and len(CFG.id2label) would add one extra output
# unit that compute_metrics cannot map back to a tag (labels_list[15] raises IndexError).
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.LABELS_LIST),
    id2label=dict(enumerate(CFG.LABELS_LIST)),
    label2id={label: i for i, label in enumerate(CFG.LABELS_LIST)},
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Fine-tune on the synthetic train split and evaluate on the synthetic validation split.
# compute_metrics is bound to the label list with partial, leaving only the predictions argument open.
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/1707 [00:00<?, ?it/s]

Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/mixtral_roberta/trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0318, 'learning_rate': 3.5354422964264796e-05, 'epoch': 0.88}


  0%|          | 0/64 [00:00<?, ?it/s]

{'eval_loss': 0.0008873186889104545, 'eval_recall': 0.9960274108650313, 'eval_precision': 0.997612652939421, 'eval_fbeta_score': 0.9960882885498399, 'eval_runtime': 19.8372, 'eval_samples_per_second': 25.457, 'eval_steps_per_second': 3.226, 'epoch': 1.0}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/mixtral_roberta/trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0006, 'learning_rate': 2.0708845928529586e-05, 'epoch': 1.76}


  0%|          | 0/64 [00:00<?, ?it/s]

{'eval_loss': 0.00047899619676172733, 'eval_recall': 0.9972191876055219, 'eval_precision': 0.9985083532219571, 'eval_fbeta_score': 0.997268709341014, 'eval_runtime': 19.0672, 'eval_samples_per_second': 26.485, 'eval_steps_per_second': 3.357, 'epoch': 2.0}
{'loss': 0.0003, 'learning_rate': 6.0632688927943766e-06, 'epoch': 2.64}


  0%|          | 0/64 [00:00<?, ?it/s]

{'eval_loss': 0.0002759890630841255, 'eval_recall': 0.9990068527162578, 'eval_precision': 0.9987092930897538, 'eval_fbeta_score': 0.9989954048365718, 'eval_runtime': 19.1475, 'eval_samples_per_second': 26.374, 'eval_steps_per_second': 3.342, 'epoch': 3.0}
{'train_runtime': 1889.1764, 'train_samples_per_second': 7.217, 'train_steps_per_second': 0.904, 'train_loss': 0.009629794311893565, 'epoch': 3.0}


TrainOutput(global_step=1707, training_loss=0.009629794311893565, metrics={'train_runtime': 1889.1764, 'train_samples_per_second': 7.217, 'train_steps_per_second': 0.904, 'train_loss': 0.009629794311893565, 'epoch': 3.0})

In [10]:
# The final model is stored under <target_dir>/model, next to the <target_dir>/trainer checkpoint directory.
model_save_path = os.path.join(CFG.target_dir, 'model')

In [12]:
# Write the trainer's model to model_save_path.
trainer.save_model(model_save_path)

In [13]:
# Reload the model from model_save_path; this rebinds `model` to the loaded copy.
model = AutoModelForTokenClassification.from_pretrained(model_save_path)

In [14]:
# Evaluation-only Trainer around the reloaded model: no `args` or `tokenizer` are passed here,
# so the Trainer uses its defaults for both. This rebinds `trainer`.
trainer = Trainer(
    model=model,
    eval_dataset=data_test,
    compute_metrics=partial(compute_metrics, labels_list = CFG.LABELS_LIST),
)

# Evaluate on the eval_dataset given above (the synthetic test split).
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/71 [00:00<?, ?it/s]

{'eval_loss': 0.0007727962802164257,
 'eval_recall': 0.998916756936922,
 'eval_precision': 0.9985839233652645,
 'eval_fbeta_score': 0.9989039515431208,
 'eval_runtime': 23.2797,
 'eval_samples_per_second': 24.141,
 'eval_steps_per_second': 3.05}

## Evaluating on train.json (original dataset)

In [14]:
# Load the original dataset (train.json) and preprocess it exactly like the synthetic data:
# same tokenizer, same label mapping, same keys_to_flatten.
kaggle_data_path = os.path.join('..','data', 'raw', 'train.json')
kaggle_data = get_dataset_from_path(kaggle_data_path)
kaggle_data = preprocess_data(kaggle_data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:13<00:00,  2.18s/it]


In [15]:
# Split the original dataset into train / validation / test with the same seed as the synthetic split.
kaggle_train, kaggle_val, kaggle_test = get_train_val_test_split(kaggle_data, seed=CFG.seed)

Evaluating on the standard test set

In [20]:
# Score the current `trainer` model on the test split of the original dataset.
trainer.evaluate(kaggle_test)

  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.01543649286031723,
 'eval_recall': 0.7372448979591837,
 'eval_precision': 0.25643300798580304,
 'eval_fbeta_score': 0.6876544339708976,
 'eval_runtime': 54.6587,
 'eval_samples_per_second': 25.851,
 'eval_steps_per_second': 3.238}

Evaluating on the whole original dataset

In [21]:
# Score the current `trainer` model on every row of the original dataset (all splits together).
trainer.evaluate(kaggle_data)

  0%|          | 0/1766 [00:00<?, ?it/s]

{'eval_loss': 0.015895720571279526,
 'eval_recall': 0.7844444444444445,
 'eval_precision': 0.31398075523570795,
 'eval_fbeta_score': 0.7417001550137016,
 'eval_runtime': 612.0197,
 'eval_samples_per_second': 23.076,
 'eval_steps_per_second': 2.886}

## Combining both datasets

In [16]:
# Rewrite the 'document' column as str: the values are written to a temporary 'document1' column,
# the old column is dropped, and the new one is renamed back to 'document'.
# NOTE(review): presumably done so the column type matches the synthetic data
# ('document' is a string there) before the datasets are concatenated — confirm.
kaggle_data = kaggle_data.map(lambda examples: {'document1': str(examples['document'])}, remove_columns=['document'])
kaggle_data = kaggle_data.rename_column('document1', 'document')

Map:   0%|          | 0/14123 [00:00<?, ? examples/s]

In [17]:
# Redo the split on the updated kaggle_data so the splits carry the string 'document' column.
kaggle_train, kaggle_val, kaggle_test = get_train_val_test_split(kaggle_data, seed=CFG.seed)

In [12]:
# Inspect the column types of the synthetic train split.
data_train.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'org_word_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'document': Value(dtype='string', id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [18]:
# Merge the synthetic and original splits pairwise (train+train, val+val, test+test).
# Dataset.shuffle() does not shuffle in place: it returns a new dataset, so its result must be
# assigned. Previously the return value was discarded and the combined sets stayed unshuffled.
# CFG.seed makes the shuffled order reproducible.
combined_train_data = concatenate_datasets([data_train, kaggle_train]).shuffle(seed=CFG.seed)
combined_val_data = concatenate_datasets([data_eval, kaggle_val]).shuffle(seed=CFG.seed)
combined_test_data = concatenate_datasets([data_test, kaggle_test]).shuffle(seed=CFG.seed)
# Display the combined test set as the cell output.
combined_test_data

Dataset({
    features: ['input_ids', 'attention_mask', 'org_word_ids', 'document', 'labels'],
    num_rows: 1975
})

Training the model on combined data

In [14]:
#reinitialize the model
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

CFG.training_args.output_dir = os.path.join(CFG.target_dir, 'trainer_combined')

#train the model on the combined dataset
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=combined_train_data,
    eval_dataset=combined_val_data,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer.train()

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/5994 [00:00<?, ?it/s]

{'loss': 0.0232, 'learning_rate': 4.582916249582916e-05, 'epoch': 0.25}
{'loss': 0.0038, 'learning_rate': 4.165832499165833e-05, 'epoch': 0.5}
{'loss': 0.0022, 'learning_rate': 3.7487487487487486e-05, 'epoch': 0.75}


  0%|          | 0/222 [00:00<?, ?it/s]

{'eval_loss': 0.0012952466495335102, 'eval_recall': 0.9850746268656716, 'eval_precision': 0.9734513274336283, 'eval_fbeta_score': 0.9846224466375947, 'eval_runtime': 75.5417, 'eval_samples_per_second': 23.51, 'eval_steps_per_second': 2.939, 'epoch': 1.0}
{'loss': 0.0018, 'learning_rate': 3.331664998331665e-05, 'epoch': 1.0}
{'loss': 0.0012, 'learning_rate': 2.914581247914581e-05, 'epoch': 1.25}
{'loss': 0.0013, 'learning_rate': 2.4974974974974976e-05, 'epoch': 1.5}
{'loss': 0.0009, 'learning_rate': 2.080413747080414e-05, 'epoch': 1.75}


In [19]:
# The combined-data model is stored under <target_dir>/model_combined; this rebinds model_save_path.
model_save_path = os.path.join(CFG.target_dir, 'model_combined')

In [None]:
# Write the trainer's model to model_save_path.
# NOTE(review): this cell has no execution count in the saved notebook — confirm it was run
# before the reload below.
trainer.save_model(model_save_path)

In [20]:
# Reload the model from model_save_path and wrap it in an evaluation-only Trainer.
# No eval_dataset is set here; the dataset is passed to each evaluate() call instead.
model = AutoModelForTokenClassification.from_pretrained(model_save_path)

trainer = Trainer(
    model=model,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Evaluating the model on the combined test set

In [21]:
# Score the current `trainer` model on the combined (synthetic + original) test set.
trainer.evaluate(combined_test_data)

  0%|          | 0/247 [00:00<?, ?it/s]

{'eval_loss': 0.0008544130250811577,
 'eval_recall': 0.9930605987250868,
 'eval_precision': 0.9943443483881393,
 'eval_fbeta_score': 0.993109912415193,
 'eval_runtime': 77.9286,
 'eval_samples_per_second': 25.344,
 'eval_steps_per_second': 3.17}

In [22]:
# Score the same model on the original dataset's test split only, for comparison with the earlier run.
trainer.evaluate(kaggle_test)

  0%|          | 0/177 [00:00<?, ?it/s]

{'eval_loss': 0.0009300351957790554,
 'eval_recall': 0.8112244897959183,
 'eval_precision': 0.8548387096774194,
 'eval_fbeta_score': 0.8128195045222177,
 'eval_runtime': 52.5997,
 'eval_samples_per_second': 26.863,
 'eval_steps_per_second': 3.365}