In [1]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from data_swerick import create_dataset_swerick
from evaluation import evaluation_task
import preprocessing

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
def insert_random_mask(batch,data_collator):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [4]:
model_checkpoint = "KBLab/bert-base-swedish-cased"
model = preprocessing.create_model_MLM(model_checkpoint)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer =preprocessing.create_tokenizer(model_checkpoint)

In [6]:
#datasest
data_files = {"train": "swerick_data_train.pkl", "test": "swerick_data_test.pkl"}
swerick_dataset = load_dataset("pandas",data_files=data_files)
print(swerick_dataset)


DatasetDict({
    train: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 104
    })
    test: Dataset({
        features: ['protocole', 'texte', '__index_level_0__'],
        num_rows: 26
    })
})


In [7]:
sample = swerick_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> protocle: {row['protocole']}'")
    print(f"'>>> Texte: {row['texte']}'")
    print(f"'>>> index: {row['__index_level_0__']}'")


'>>> protocle: 70'
'>>> Texte: RIKSDAGENS1956ANDRAKAMMARENNr717—22februariDebatterm.m.Fredagenden17februariSid.InterpellationavherrStenbergang.möjligheternaattstannasnabbgåendetågihändelseavstoppteckenvidbevakadejärnvägsöver-GÅDDAL1oomsejeraoyjesoda:om116IRSREBIRSRBENAFATSRARR:3Tisdagenden21februariInterpellationerav:fruSjöstrandang.rättfördem,somförtidspensioneratspågrundavsjukdomochinvaliditetattutnyttjabillighetsresornaförfolk-PENSlONÄLEI>.sendsieEdGKKkrMRcsRRsfVRSN6herrHagårdang.denuvarandetidernaförrusdrycksförsäljning..7herrEdströmang.vissamenligaföljderföraktiebolagochekonomiskaföreningar,sompågrundavmissuppfattningkommitatterläggaförlågtskattebeloppisambandmeddenår1955genomfördaskattehöjningen..........s..s..-soseorereretroera8Onsdagenden22februariSvarpåfrågorav:herrJohanssoniStockholmang.enbindandeförsäkrantilllöntagarnaomförhindrandeavallaeventuellalevnadskostnadsstegringarunderdentidrymd,som1956årsavtalsuppgörelseavser..10herrBraconierang.redogörelsefördeåtgärder,somvidtagi

In [8]:
tokenized_datasets =preprocessing.tokenize_dataset(swerick_dataset,tokenizer)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 104
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 26
    })
})

In [9]:
chunk_size = 128

In [10]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 14388'
'>>> Review 1 length: 137372'
'>>> Review 2 length: 104194'


In [11]:
lm_datasets = preprocessing.grouping_dataset(tokenized_datasets,chunk_size)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 57099
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 12421
    })
})

In [12]:
data_collator = preprocessing.data_collector_masking(tokenizer,0.15)

In [13]:
batch_size = 64
num_epochs=100
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
print(len(lm_datasets["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

trainer = preprocessing.create_trainer(model,model_name,batch_size,logging_steps,train_dataset=lm_datasets["train"],eval_dataset=lm_datasets["test"],data_collator=data_collator,tokenizer=tokenizer,num_epochs=100)

892


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
eval_results = trainer.evaluate()
print(eval_results)
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

trial with a manual implementation

In [15]:
print(lm_datasets)

lm_dataset_bis = lm_datasets.remove_columns(["word_ids","token_type_ids"])


eval_dataset = preprocessing.create_deterministic_eval_dataset(lm_dataset_bis["test"],data_collator)

print(eval_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 57099
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 12421
    })
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12421
})


In [16]:
batch_size = 64
train_dataloader = preprocessing.create_dataloader(lm_dataset_bis["train"],batch_size,data_collator)
def to_device(batch):
    return {key: value.to(device) for key, value in batch.items()}


eval_dataloader = preprocessing.create_dataloader(eval_dataset,batch_size,default_data_collator)

print(device)
for batch in train_dataloader:
    batch = to_device(batch)

for batch in eval_dataloader:
    batch = to_device(batch)




print(eval_dataloader.dataset)

cuda
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 12421
})


In [None]:
print(train_dataloader.dataset)
print(eval_dataloader)

for batch in eval_dataloader:
    print(batch["input_ids"].device)
    break

In [None]:
print(len(train_dataloader))

In [None]:
def get_dataloader():
    train =DataLoader(
    lm_dataset_bis["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator)
    train = [inputs.to(device) for inputs in train_dataloader]
    return train


for step,batch in enumerate(get_dataloader()):
    print(
        tokenizer.decode(batch["input_ids"][0]))
    break

for step,batch in enumerate(get_dataloader()):
    print(
        tokenizer.decode(batch["input_ids"][0]))
    break



In [None]:
model_bis = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_bis=model_bis.to(device)

In [None]:
model_bis.eval()

total_loss = 0.0  # Variable to accumulate total loss

for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        outputs = model_bis(**batch)
    loss = outputs.loss
    total_loss += loss.item()   # Accumulate the batch loss

# Calculate the average loss
average_loss = total_loss / len(eval_dataloader)

print(f"Initial Loss: {average_loss}")

In [None]:

optimizer = AdamW(model_bis.parameters(), lr=1.3e-5)
print(optimizer)

In [None]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
progress_bar = tqdm(range(num_training_steps))

losses_train=[]
losses_test=[]
#train_dataloader = get_dataloader()
for epoch in range(num_train_epochs):
    # Training
    model_bis.train()
    print(next(model_bis.parameters()).device)
    print(epoch)
    params_before_optimization = [param.data.clone() for param in model_bis.parameters()]
    total_loss_train = 0.0 
    train_dataloader = get_dataloader()

    for batch in train_dataloader:
        outputs = model_bis(**batch)
        loss = outputs.loss
        total_loss_train += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        params_after_optimization = [param.data for param in model_bis.parameters()]
        parameters_changed = any((param_before != param_after).any() for param_before, param_after in zip(params_before_optimization, params_after_optimization))
        #if parameters_changed==True :
             # print(parameters_changed) 
        progress_bar.update(1)

    losses_train.append(total_loss_train/len(train_dataloader))
    print("losses_train",losses_train)

    # Evaluation
    model_bis.eval()
    losses=[]
    total_loss_eval=0.0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model_bis(**batch)

        loss = outputs.loss
        losses.append(loss.repeat(batch_size))
        total_loss_eval +=loss.item()


    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
       perplexity = float("inf")

    losses_test.append(total_loss_eval/len(eval_dataloader))


    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    print("losses_test",losses_test)

print("epoch",num_train_epochs)
plt.plot(range(num_train_epochs),losses_train,label="train Loss")

plt.plot(range(num_train_epochs),losses_test,label="test Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

            

In [None]:
print(losses_train)
print(losses_test)

In [None]:
file_path = "finetuning_manual"
model_bis.save_pretrained(file_path)
tokenizer.save_pretrained(file_path)

In [None]:
import pickle

file_name = "losses.pkl"

with open(file_name, 'wb') as f:
    pickle.dump({'losses_train': losses_train, 'losses_test': losses_test}, f)


In [None]:
from transformers import pipeline
pipe = pipeline(task="fill-mask", model="./test_model")

In [17]:
model=AutoModelForMaskedLM.from_pretrained("./test_model")
model=model.to(device)

In [None]:
model.get_input_embeddings()

In [None]:
correct_predictions = 0
total_predictions =0
for step,batch in enumerate(train_dataloader):
    print(len(batch["input_ids"][0]))
    print(torch.sum(batch["labels"][0]!=-100))
    indices_tokens_masked = []
    for labels in batch["labels"]:
        indices_tokens_masked.append(torch.nonzero(labels !=-100).flatten().tolist())
    print(len(indices_tokens_masked[0]))
    output=model(**batch)
    predicted_token_ids = torch.argmax(output.logits,dim=-1)
    for i in range(batch_size):
        for j in indices_tokens_masked[i]:
            total_predictions +=1
            if batch["input_ids"][i][j] == predicted_token_ids[i][j]:
                correct_predictions += 1


    break
print("acuraccy :",correct_predictions/total_predictions)
        



In [None]:
for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
    print(batch.keys())
    break

In [18]:
from sklearn.metrics import accuracy_score, classification_report
loss, accuracy = 0.0, []
model.eval()
true_labels, pred_labels, misclassified_examples = [], [], []
for batch in tqdm(eval_dataloader, total=len(eval_dataloader)):
    input_ids = batch["input_ids"].to(device)
    input_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    output = model(input_ids,
                    token_type_ids=None, 
                    attention_mask=input_mask, 
                    labels=labels)
    loss += output.loss.item()
    preds_batch = torch.argmax(output.logits, dim=-1)
    batch_acc = torch.mean((preds_batch == labels).float())
    accuracy.append(batch_acc)
    true_labels.extend(labels.cpu().numpy())
    pred_labels.extend(preds_batch.cpu().numpy())



print("\nAccuracy:", accuracy_score(true_labels, pred_labels))
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=list(None)))

  0%|          | 0/195 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 53.00 MiB is free. Process 7123 has 11.29 GiB memory in use. Including non-PyTorch memory, this process has 11.97 GiB memory in use. Of the allocated memory 11.58 GiB is allocated by PyTorch, and 79.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
correct_predictions = 0
total_predictions = 0

# Boucle sur les données d'entraînement
for step, batch in enumerate(eval_dataloader):
    # Récupération des indices des tokens masqués
    indices_tokens_masked = torch.nonzero(batch["labels"] != -100, as_tuple=False)

    # Calcul des prédictions
    output = model(**batch)
    predicted_token_ids = torch.argmax(output.logits, dim=-1)

    # Calcul du nombre de prédictions correctes
    correct_predictions += torch.sum(
        torch.eq(batch["input_ids"][indices_tokens_masked[:, 0], indices_tokens_masked[:, 1]], 
                 predicted_token_ids[indices_tokens_masked[:, 0], indices_tokens_masked[:, 1]])
    ).item()
    
    # Mise à jour du nombre total de prédictions
    total_predictions += indices_tokens_masked.size(0)
    print(correct_predictions)
    

# Calcul de l'accuracy
accuracy = correct_predictions / total_predictions
print("Accuracy:", accuracy)

In [None]:
for batch in eval_dataloader:
    print(batch["input_ids"].device)
    break

print(model.device)

In [None]:

model_hugging_face = AutoModelForMaskedLM.from_pretrained("./finetuning_trainer")
model_hugging_face=model_hugging_face.to(device)
model_kb=AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model_kb=model_kb.to(device)
evaluation_task(model,eval_dataloader)

In [None]:
logging_steps = 892
training_args = TrainingArguments(
        output_dir=f"{model.config.name_or_path}-finetuned-imdb",
        per_device_eval_batch_size=64,
        logging_steps=logging_steps,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        fp16=True,
        num_train_epochs=100
        )


trainer = Trainer(
    model=model_kb,
    args=training_args,
    eval_dataset=eval_dataloader.dataset
)
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


In [None]:
batch_size = 64
num_epochs=100
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
print(len(lm_datasets["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
    logging_dir='./logs', 
    num_train_epochs=num_epochs
)

trainer = Trainer(
    model=model_kb,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
model.save_pretrained("test_model")
tokenizer.save_pretrained("test_model")

In [None]:
model_bis= model_bis.to(device)

for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model_bis(**batch)
            print(outputs)

        loss = outputs.loss
        break

print

In [None]:
outputs.keys()