In [None]:
import torch
from datasets import load_dataset
import math
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import pickle
import preprocessing

In [None]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Checkpoint identifier handed to the preprocessing helpers (model and tokenizer).
model_checkpoint = "KBLab/bert-base-swedish-cased"

# Build a model object through the local helper
# (presumably a masked-language model, going by the helper's name -- confirm in preprocessing).
model = preprocessing.create_model_MLM(model_checkpoint)

# Rebind `model` to whatever `from_pretrained` returns for the local fine-tuning
# checkpoint directory; the object created just above is discarded.
model = model.from_pretrained(
    "finetuning_hugging-finetuned-imdb/checkpoint-259384"
)

In [None]:
# Create the tokenizer from the same checkpoint identifier used for the model.
tokenizer = preprocessing.create_tokenizer(model_checkpoint)

In [None]:
# Map each split name to its pickle file, then load both splits
# through the "pandas" loader of `load_dataset`.
data_files = dict(
    train="swerick_data_random_train.pkl",
    test="swerick_data_random_test.pkl",
)
swerick_dataset = load_dataset("pandas", data_files=data_files)
print(swerick_dataset)

In [None]:
# Run the loaded dataset through the tokenization helper of the local
# preprocessing module, using the tokenizer created earlier.
tokenized_datasets = preprocessing.tokenize_dataset(swerick_dataset, tokenizer)

# Bare expression: lets the notebook display the result.
tokenized_datasets

In [None]:
# Chunk size handed to preprocessing.grouping_dataset
# (presumably the length, in tokens, of each grouped chunk -- confirm in preprocessing).
chunk_size = 128

In [None]:
# Group the tokenized datasets with the preprocessing helper, using `chunk_size`.
lm_datasets = preprocessing.grouping_dataset(tokenized_datasets, chunk_size)

# Bare expression: lets the notebook display the grouped result.
lm_datasets

In [None]:
# Persist the grouped datasets to "lm_dataset.pkl" so they can be reloaded
# later without repeating the tokenize/group steps.
with open("lm_dataset.pkl", "wb") as cache_file:
    pickle.dump(lm_datasets, cache_file)

In [None]:
# Reload the grouped datasets from the "lm_dataset.pkl" pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load a file you trust.
with open("lm_dataset.pkl", "rb") as cache_file:
    lm_datasets = pickle.load(cache_file)

# Bare expression: lets the notebook display the reloaded object.
lm_datasets

In [None]:
# Sanity check: decode the first entry of the train split's "input_ids" back to text.
first_chunk_ids = lm_datasets["train"]["input_ids"][0]
print(tokenizer.decode(first_chunk_ids))

In [None]:
# Build the data collator via the preprocessing helper.
# NOTE(review): 0.15 is presumably the masking probability -- confirm against
# preprocessing.data_collector_masking.
data_collator = preprocessing.data_collector_masking(tokenizer, 0.15)

In [None]:
# Training configuration.
batch_size = 64
num_epochs = 100

# Show the training loss with (roughly) every epoch: log once every
# `len(train) // batch_size` steps. Clamped to at least 1 so that a train
# split smaller than one batch does not produce logging_steps == 0.
logging_steps = max(1, len(lm_datasets["train"]) // batch_size)
print(logging_steps)
model_name = "finetuning_hugging"

# Build the trainer through the local helper. `num_epochs` is passed from the
# variable above (previously a duplicated literal 100) so there is a single
# place to change the number of epochs.
trainer = preprocessing.create_trainer(
    model,
    model_name,
    batch_size,
    logging_steps,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    num_epochs=num_epochs,
)

In [None]:
# Evaluate with the trainer before training starts and report the perplexity,
# computed as exp(eval_loss).
eval_results = trainer.evaluate()
print(eval_results)
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
# Run training with the trainer built in the configuration cell.
trainer.train()

In [None]:
#file_path = "finetuning_hugging.pth"
#model.save_pretrained("finetuning_trainer_total")
#tokenizer.save_pretrained("finetuning_trainer_total")

In [None]:
# Get the training and validation losses recorded by the trainer.
log_history = trainer.state.log_history
print(type(log_history))
print(len(log_history))

# Select entries by key instead of by position: the previous code assumed a
# strict train/eval alternation and looked up a misspelled "eva_loss" key,
# which raised KeyError. Entries carrying neither key are simply ignored
# (presumably e.g. a final training summary -- confirm against the trainer's logs).
train_losses = [entry["loss"] for entry in log_history if "loss" in entry]
test_losses = [entry["eval_loss"] for entry in log_history if "eval_loss" in entry]

# Plot both curves against their log index.
# NOTE(review): the "Epoch" axis label is only accurate if logging and
# evaluation happen once per epoch -- confirm against the trainer configuration.
plt.plot(train_losses, label="Train Loss")
plt.plot(test_losses, label="Eval Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Save both loss curves to a pickle file so they can be re-plotted without
# re-running training. The dump call was commented out, which left the `with`
# statement with no body (a syntax error as shown).
file_name = "losses_hugging.pkl"
with open(file_name, 'wb') as f:
   pickle.dump({'losses_train': train_losses, 'losses_test': test_losses}, f)