# pip installing libraries

In [2]:
# Install the third-party packages imported further down in this notebook.
# The recorded output of this cell shows transformers resolving to 4.27.4.
!pip install transformers
!pip install datasets
!pip install pynvml
!pip install evaluate
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

In [3]:
# Mount Google Drive at /content/drive; the CSV files read later in the
# notebook are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Utils for GPU

In [4]:
# ===========================================
# ||                                       ||
# ||       Section 1: Importing modules    ||
# ||                                       ||
# ===========================================
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


# ===========================================
# ||                                       ||
# ||  Section 2: utils functions for gpu   ||
# ||             and device                ||
# ||                                       ||
# ===========================================

def check_gpu_availability():
    """Print whether PyTorch reports a usable CUDA device."""
    cuda_ready = torch.cuda.is_available()
    print(f"Cuda is available: {cuda_ready}")


def getting_device(gpu_prefence=True) -> torch.device:
    """
    Choose the torch device to run computations on.

    Args:
        gpu_prefence: when truthy, CUDA is selected if it is available.

    Returns:
        torch.device("cuda") when the GPU is both preferred and available,
        torch.device("cpu") in every other case. The choice is also printed.
    """
    use_cuda = gpu_prefence and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else "cpu")

    # Tell the user which device was picked before handing it back
    print(f"Selected device: {device}")
    return device


# Define a function to print GPU memory utilization
def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, in MB, as reported by NVML."""
    # NVML has to be initialised before any device query
    nvmlInit()
    # Only the first GPU (index 0) is inspected
    first_gpu = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(first_gpu).used
    # Integer division converts bytes to whole megabytes
    used_mb = used_bytes // 1024 ** 2
    print(f"GPU memory occupied: {used_mb} MB.")


# Define a function to print training summary information
def print_summary(result):
    """
    Print a short training report.

    Args:
        result: object exposing a `metrics` mapping that contains the keys
            'train_runtime' and 'train_samples_per_second'.

    Prints the runtime, the throughput, and finally the GPU memory usage.
    """
    report_lines = (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    )
    # Each metric is read and printed in turn, with two decimals
    for label, metric_key in report_lines:
        print(f"{label}: {result.metrics[metric_key]:.2f}")

    # Finish with the current GPU memory footprint
    print_gpu_utilization()

def clean_gpu():
    """
    Release unused GPU memory held by PyTorch and print usage before and after.

    The function prints the allocated and cached (reserved) memory in GB,
    runs Python garbage collection, empties PyTorch's CUDA cache, and prints
    the two figures again. It returns None.

    Only `torch` is required at module level: the function no longer depends
    on the `cuda` / `gc` names that are imported in a later notebook cell.
    """
    # Local import so the function works regardless of cell execution order
    import gc

    def _report(title):
        """Print one labelled snapshot of PyTorch's CUDA memory counters."""
        gib = 1024 ** 3
        print(title)
        print(f"Allocated: {torch.cuda.memory_allocated() / gib:.2f} GB")
        # memory_reserved() replaces the deprecated memory_cached()
        print(f"Cached: {torch.cuda.memory_reserved() / gib:.2f} GB")

    _report("BEFORE CLEANING:")
    print("\n")

    # Collect garbage first so tensors that are no longer referenced are
    # returned to the caching allocator, then hand the cached blocks back.
    gc.collect()
    torch.cuda.empty_cache()

    _report("AFTER CLEANING:")

# Import libraries

In [5]:


import transformers
from datasets import load_dataset, load_metric,  Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import os
import torch.cuda as cuda
import gc
import optuna
#from utils4gpu import *

# Utils for Model

In [6]:

# getting the max length of the tokenized tweet
def getting_max_length(tokenizer, items):
    """
    Return the largest token count found among `items`.

    Each item is encoded with `tokenizer.encode(..., add_special_tokens=True)`,
    so special tokens are included in the count. Returns 0 when `items` is
    empty.
    """
    token_counts = (
        len(tokenizer.encode(text, add_special_tokens=True)) for text in items
    )
    return max(token_counts, default=0)


# Checking GPU, selecting model and device

In [7]:
# HF CHECKPOINT/MODEL NAME USED BY THE TOKENIZER AND THE MODEL
model_nm = "distilbert-base-uncased"

# REPORT WHETHER CUDA IS VISIBLE
check_gpu_availability()

# KEEP THE DEVICE WE ARE WORKING WITH (GPU PREFERRED)
device = getting_device(gpu_prefence=True)

# BASELINE GPU MEMORY BEFORE ANY MODEL IS LOADED
print_gpu_utilization()

Cuda is available: True
Selected device: cuda
GPU memory occupied: 261 MB.


# Defining dataframes

In [8]:
# Locations of the three CSV splits on the mounted Drive
path2test = '/content/drive/MyDrive/GOOD DATA/clean_test_data.csv'
path2val = '/content/drive/MyDrive/GOOD DATA/clean_validation_data.csv'
path2train = '/content/drive/MyDrive/GOOD DATA/augmented_cleaned_train_df.csv'

# Load each split and rename the "target" column to "labels" in one step
test_df = pd.read_csv(path2test).rename(columns={"target": "labels"})
validation_df = pd.read_csv(path2val).rename(columns={"target": "labels"})
train_df = pd.read_csv(path2train).rename(columns={"target": "labels"})

# Convert the pandas frames into datasets.Dataset objects
ds_train, ds_validation, ds_test = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df, test_df)
)

# Tokenization, tensorization and collator

In [9]:
# ===========================================
# ||                                       ||
# ||Section 5: tokenization, tensorization ||
# ||              and collator             ||
# ||                                       ||
# ===========================================

# LOAD THE TOKENIZER THAT BELONGS TO THE CHECKPOINT
tokz = AutoTokenizer.from_pretrained(model_nm)

# LONGEST TOKENIZED TRAINING TEXT; USED AS THE PADDING/TRUNCATION LENGTH
max_len = getting_max_length(tokz, ds_train["text"])


# ONE TOKENIZE FUNCTION SHARED BY ALL THE SPLITS
def tok_func(x):
    """Tokenize the "text" field of `x`, padding and truncating to `max_len`."""
    return tokz(
        x["text"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )


# TOKENIZE EVERY SPLIT AND DROP THE COLUMNS THAT ARE NOT FED TO THE MODEL
tok_ds_train, tok_ds_validation, tok_ds_test = (
    split.map(
        tok_func,
        batched=True,
        remove_columns=['text', 'id', 'keyword', 'location'],
    )
    for split in (ds_train, ds_validation, ds_test)
)

# GROUP THE TOKENIZED SPLITS INTO ONE DATASETDICT TO FEED THE MODEL
ds = DatasetDict(
    {
        "train": tok_ds_train,
        "validation": tok_ds_validation,
        "test": tok_ds_test,
    }
)

# COLLATOR THAT BATCHES THE TOKENIZED EXAMPLES
data_collator = DataCollatorWithPadding(tokenizer=tokz)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

# Metrics

In [10]:
# 1) F1 and ACCURACY

# Define the function the Trainer uses to compute its metrics. We cannot call the metric's compute method directly
# because the model outputs logits, not class predictions.
def compute_metrics(pred):
    """
    Turn a prediction object into accuracy and weighted F1.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (scores whose last axis is reduced with argmax to get classes).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    # The highest score along the last axis is the predicted class
    y_pred = pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# Hyperparameter tuning

In [11]:

# Numeric name given to the next archived output folder (see objective)
num_r = 1

# Define the search space for hyperparameters using Optuna's distributions.
def objective(trial):
    """
    Optuna objective: fine-tune a fresh model with sampled hyperparameters.

    The learning rate, weight decay and number of epochs are sampled from
    `trial`. A new model is loaded from `model_nm`, trained on ds["train"],
    and scored on ds["validation"] with `compute_metrics`.

    Args:
        trial: the optuna trial used to sample the hyperparameters.

    Returns:
        1 - F1 on the validation split, so that a study created with
        direction='minimize' maximizes F1.
    """
    global num_r

    # Move the folder left by the previous trial out of the way.
    # NOTE(review): presumably needed so that from_pretrained(model_nm) does
    # not resolve to this local output folder instead of the hub checkpoint.
    stale_dir = "/content/distilbert-base-uncased"
    if os.path.exists(stale_dir):
        parent_dir = os.path.dirname(stale_dir)
        archive_dir = os.path.join(parent_dir, str(num_r))
        # When this cell is re-run num_r restarts at 1 and the numbered
        # folders of the earlier run still exist; os.rename fails on an
        # existing non-empty directory, so advance to the first free number.
        while os.path.exists(archive_dir):
            num_r += 1
            archive_dir = os.path.join(parent_dir, str(num_r))
        os.rename(stale_dir, archive_dir)
    num_r += 1

    # IMPORTING THE MODEL
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
    # checking if the model is on the gpu
    print_gpu_utilization()

    # Hyperparameters to optimize.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-2, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 1, 5)

    # setting the hyperparameter for the trainer
    training_args = TrainingArguments(
        model_nm,  # output directory, named after the checkpoint
        evaluation_strategy = "epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_steps = 50,
        # Offsets the small per-device batch imposed by GPU memory:
        # 8 samples * 16 accumulation steps = 128 samples per optimizer step.
        gradient_accumulation_steps=16,
        fp16 = True
    )

    # passing in the hyperparameter for the trainer
    trainer = Trainer(
        model = model, # our model
        args = training_args, # hyperparameter defined before
        train_dataset = ds["train"],
        eval_dataset = ds["validation"],
        compute_metrics = compute_metrics, # evaluation function defined before
        data_collator = data_collator,
    )

    # TRAINING LOOP
    print(" ")
    print("START TRAINING")
    print(" ")
    trainer.train()
    print("DONE TRAINING")

    # VALIDATION
    print(" ")
    print("START VALIDATION")
    print(" ")
    predictions = trainer.predict(ds["validation"])
    eval_result = compute_metrics(predictions)
    print(eval_result)
    print("DONE VALIDATION")

    # Return the evaluation metric to be optimized by Optuna.
    return 1 - eval_result['f1']

# Define the Optuna study and run the hyperparameter search.
# Minimize the objective (1 - F1) over 20 trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

best_trial = study.best_trial
print(study.best_params)

# Pull the winning hyperparameters out of the best trial for later use
best_num_train_epochs, best_weight_decay, best_learning_rate = (
    best_trial.params[param_name]
    for param_name in ('num_train_epochs', 'weight_decay', 'learning_rate')
)


[32m[I 2023-03-30 09:07:42,891][0m A new study created in memory with name: no-name-ec2158fb-06f0-4244-9bfb-aadd64a741ad[0m


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 1355 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.616907,0.753695,0.736131
1,0.641300,0.547528,0.778325,0.763674
2,0.532300,0.512546,0.789819,0.777782
3,0.483400,0.508817,0.788998,0.776208


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:10:26,669][0m Trial 0 finished with value: 0.22379200171533065 and parameters: {'learning_rate': 4.104119512330384e-06, 'weight_decay': 1.2875025036361657e-05, 'num_train_epochs': 4}. Best is trial 0 with value: 0.22379200171533065.[0m


{'accuracy': 0.7889983579638752, 'f1': 0.7762079982846694}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2661 MB.
 
START TRAINING
 




Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.389691,0.844007,0.842831
1,0.435400,0.393228,0.840722,0.838462
2,0.284900,0.431007,0.831691,0.830587
3,0.203100,0.455135,0.83087,0.829793


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:12:47,757][0m Trial 1 finished with value: 0.17020720597770256 and parameters: {'learning_rate': 6.317452705573063e-05, 'weight_decay': 4.9024908093050954e-05, 'num_train_epochs': 4}. Best is trial 1 with value: 0.17020720597770256.[0m


{'accuracy': 0.8308702791461412, 'f1': 0.8297927940222974}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2661 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.449046,0.815271,0.814198
1,0.549100,0.409182,0.829228,0.826548
2,0.403700,0.399291,0.831691,0.829967
3,0.366200,0.396728,0.831691,0.829967


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:15:02,660][0m Trial 2 finished with value: 0.17003294663126634 and parameters: {'learning_rate': 1.2235837133550431e-05, 'weight_decay': 0.0023775833999494893, 'num_train_epochs': 4}. Best is trial 2 with value: 0.17003294663126634.[0m


{'accuracy': 0.8316912972085386, 'f1': 0.8299670533687337}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2661 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.6722,0.737274,0.720177


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:15:41,832][0m Trial 3 finished with value: 0.2798234662106649 and parameters: {'learning_rate': 4.195102047207905e-06, 'weight_decay': 5.446965306161919e-05, 'num_train_epochs': 1}. Best is trial 2 with value: 0.17003294663126634.[0m


{'accuracy': 0.7372742200328407, 'f1': 0.7201765337893351}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2661 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.658341,0.630542,0.534911
1,0.666800,0.635604,0.744663,0.718373
2,0.634600,0.612343,0.768473,0.753639
3,0.603900,0.602898,0.77422,0.761606


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:17:54,221][0m Trial 4 finished with value: 0.23839362318061952 and parameters: {'learning_rate': 1.843022609914242e-06, 'weight_decay': 6.4053352532769884e-06, 'num_train_epochs': 4}. Best is trial 2 with value: 0.17003294663126634.[0m


{'accuracy': 0.7742200328407225, 'f1': 0.7616063768193805}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.422671,0.820197,0.819976
1,0.504900,0.395702,0.834975,0.832171
2,0.379300,0.391273,0.83908,0.837397
3,0.334900,0.391106,0.83908,0.837605


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:20:09,387][0m Trial 5 finished with value: 0.16239529853308243 and parameters: {'learning_rate': 1.8486927411044538e-05, 'weight_decay': 2.0923346750670213e-05, 'num_train_epochs': 4}. Best is trial 5 with value: 0.16239529853308243.[0m


{'accuracy': 0.8390804597701149, 'f1': 0.8376047014669176}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.413205,0.819376,0.819076
1,0.468900,0.392522,0.837438,0.835592


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:21:17,316][0m Trial 6 finished with value: 0.16440788352179592 and parameters: {'learning_rate': 3.214622570739394e-05, 'weight_decay': 6.613572797517527e-06, 'num_train_epochs': 2}. Best is trial 5 with value: 0.16239529853308243.[0m


{'accuracy': 0.8374384236453202, 'f1': 0.8355921164782041}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.416596,0.819376,0.818156
1,0.484500,0.390525,0.833333,0.831909
2,0.364200,0.393761,0.843186,0.84151
3,0.311900,0.394396,0.840722,0.839056


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:23:25,633][0m Trial 7 finished with value: 0.16094418197254912 and parameters: {'learning_rate': 2.6675414031743722e-05, 'weight_decay': 2.7516452256746474e-06, 'num_train_epochs': 4}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8407224958949097, 'f1': 0.8390558180274509}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.401381,0.834154,0.832633
1,0.462500,0.387172,0.844828,0.843305
2,0.344000,0.396691,0.842365,0.840852
3,0.282800,0.403416,0.835796,0.834494


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:25:38,881][0m Trial 8 finished with value: 0.16550635939448632 and parameters: {'learning_rate': 3.419184076754031e-05, 'weight_decay': 7.642421124630331e-06, 'num_train_epochs': 4}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8357963875205254, 'f1': 0.8344936406055137}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.419911,0.825123,0.824807
1,0.496700,0.389362,0.834154,0.831962
2,0.371100,0.392226,0.840722,0.839328
3,0.318000,0.395765,0.838259,0.83732
4,0.318000,0.399935,0.838259,0.837136


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:28:20,187][0m Trial 9 finished with value: 0.16286390504752446 and parameters: {'learning_rate': 1.9913975235719384e-05, 'weight_decay': 0.0005451588595776122, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8382594417077176, 'f1': 0.8371360949524755}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.401216,0.829228,0.828624
1,0.452100,0.386654,0.83908,0.836555


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:29:29,667][0m Trial 10 finished with value: 0.16344520165397103 and parameters: {'learning_rate': 5.440115899547728e-05, 'weight_decay': 1.2709396274345127e-06, 'num_train_epochs': 2}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8390804597701149, 'f1': 0.836554798346029}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.449095,0.813629,0.81312
1,0.534900,0.406127,0.830049,0.827338
2,0.399500,0.395737,0.836617,0.834339
3,0.351700,0.393813,0.83908,0.837605
4,0.351700,0.393554,0.839901,0.838191


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:31:52,862][0m Trial 11 finished with value: 0.16180917788899352 and parameters: {'learning_rate': 1.436705391036167e-05, 'weight_decay': 1.5915601343726021e-06, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8399014778325123, 'f1': 0.8381908221110065}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.494987,0.804598,0.802289
1,0.570800,0.42863,0.820197,0.816025
2,0.424500,0.409566,0.83087,0.828553
3,0.382900,0.40332,0.829228,0.827052
4,0.382900,0.401965,0.830049,0.828002


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:33:37,506][0m Trial 12 finished with value: 0.17199769726757264 and parameters: {'learning_rate': 8.917141929384304e-06, 'weight_decay': 1.4868358845437262e-06, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8300492610837439, 'f1': 0.8280023027324274}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.381266,0.844007,0.842164
1,0.438100,0.406632,0.834154,0.832839
2,0.266700,0.432105,0.832512,0.831631
3,0.169700,0.532384,0.834154,0.833779
4,0.169700,0.572082,0.824302,0.823958


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:35:21,720][0m Trial 13 finished with value: 0.17604226590423155 and parameters: {'learning_rate': 9.125878015498559e-05, 'weight_decay': 1.4072689115582125e-06, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8243021346469622, 'f1': 0.8239577340957684}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.50511,0.795567,0.792615
1,0.575500,0.437197,0.816913,0.813506
2,0.438700,0.426331,0.821018,0.818652


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:36:24,933][0m Trial 14 finished with value: 0.18134826649593028 and parameters: {'learning_rate': 9.11997083983931e-06, 'weight_decay': 0.00023620087950740275, 'num_train_epochs': 3}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8210180623973727, 'f1': 0.8186517335040697}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.417138,0.816092,0.815619
1,0.476800,0.386777,0.832512,0.830976
2,0.359800,0.391908,0.836617,0.835483


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:37:27,926][0m Trial 15 finished with value: 0.16451734570790544 and parameters: {'learning_rate': 2.9594651084744832e-05, 'weight_decay': 3.903194250490116e-06, 'num_train_epochs': 3}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8366174055829229, 'f1': 0.8354826542920946}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.426594,0.818555,0.817883
1,0.506500,0.396061,0.828407,0.827281
2,0.381000,0.392175,0.833333,0.832111
3,0.330500,0.393619,0.838259,0.837437
4,0.330500,0.396453,0.838259,0.83732


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:39:12,538][0m Trial 16 finished with value: 0.16268025477318104 and parameters: {'learning_rate': 1.8195061365390425e-05, 'weight_decay': 2.890880088475768e-06, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8382594417077176, 'f1': 0.837319745226819}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.399148,0.83908,0.837179
1,0.454800,0.388438,0.83908,0.838055
2,0.325500,0.408369,0.839901,0.83879
3,0.239100,0.456823,0.82266,0.822312
4,0.239100,0.474951,0.824302,0.82385


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:40:56,245][0m Trial 17 finished with value: 0.17614994578444043 and parameters: {'learning_rate': 4.632396262741163e-05, 'weight_decay': 2.0466128325143572e-05, 'num_train_epochs': 5}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8243021346469622, 'f1': 0.8238500542155596}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.380618,0.842365,0.840715
1,0.436700,0.400692,0.843186,0.84144
2,0.270400,0.437689,0.835796,0.834873


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:41:59,710][0m Trial 18 finished with value: 0.1651274201877454 and parameters: {'learning_rate': 9.690686036425781e-05, 'weight_decay': 1.1353971852861822e-06, 'num_train_epochs': 3}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.8357963875205254, 'f1': 0.8348725798122546}
DONE VALIDATION


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.
 
START TRAINING
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.68332,0.651067,0.649891
1,0.687300,0.677819,0.669951,0.64946


DONE TRAINING
 
START VALIDATION
 


[32m[I 2023-03-30 09:42:44,628][0m Trial 19 finished with value: 0.3505402037304647 and parameters: {'learning_rate': 1.048633392020499e-06, 'weight_decay': 3.3029056905903962e-06, 'num_train_epochs': 2}. Best is trial 7 with value: 0.16094418197254912.[0m


{'accuracy': 0.6699507389162561, 'f1': 0.6494597962695353}
DONE VALIDATION
{'learning_rate': 2.6675414031743722e-05, 'weight_decay': 2.7516452256746474e-06, 'num_train_epochs': 4}


# Model and Hyperparams



In [12]:

# Move a previous Trainer output folder out of the way before loading the model.
# NOTE(review): presumably needed so that from_pretrained(model_nm) does not pick up
# a local folder with the same name as the Hub checkpoint - confirm.
stale_output_dir = "/content/distilbert-base-uncased"
if os.path.exists(stale_output_dir):
    archive_dir = os.path.join(os.path.dirname(stale_output_dir), '1')
    # os.rename raises OSError when the destination is an existing non-empty
    # directory (which is the case when this cell is run a second time), so
    # fall back to a name that is still free instead of crashing.
    while os.path.exists(archive_dir):
        archive_dir += "_old"
    os.rename(stale_output_dir, archive_dir)

# IMPORTING THE MODEL
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
# checking if the model is on the gpu
print_gpu_utilization()

# setting the hyperparameter for the trainer
training_args = TrainingArguments(
    model_nm,  # first positional argument is the output directory of the Trainer
    evaluation_strategy = "epoch",  # run validation at the end of every epoch
    learning_rate=best_learning_rate,
    weight_decay=best_weight_decay,
    num_train_epochs=best_num_train_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps = 50,  # training loss is reported every 50 optimizer steps
    # Gradients are accumulated over 16 forward/backward passes before each
    # optimizer step, so the effective training batch is 8 * 16 = 128 samples
    # per device while only 8 samples have to fit in GPU memory at once.
    gradient_accumulation_steps=16,
    fp16 = True  # mixed-precision training
    )

# passing in the hyperparameter for the trainer
trainer = Trainer(
    model = model, # our model
    args = training_args, # hyperparameter defined before
    train_dataset = ds["train"],
    eval_dataset = ds["validation"],
    compute_metrics = compute_metrics, # evaluation function defined before
    data_collator = data_collator,
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

GPU memory occupied: 2663 MB.


# Training and testing the model

In [13]:
# --- Fine-tuning -----------------------------------------------------------
# Banner lines are emitted through a single print call; the text written to
# stdout is the same as printing each line separately.
print(" ", "START TRAINING ", " ", sep="\n")
trainer.train()
print("DONE TRAINING")

# --- Evaluation on the test split -------------------------------------------
# The banner is wrapped in ANSI escape codes so it shows up in green.
print(" ", "\033[32mSTARTING FINAL TESTING OF THE MODEL\033[0m", " ", sep="\n")
# Run the trained model over the test split and score the raw prediction
# output with the same metric function that is used during training.
predictions = trainer.predict(ds["test"])
eval_result = compute_metrics(predictions)
print(eval_result, "DONE TESTING", sep="\n")



 
START TRAINING 
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.416596,0.819376,0.818156
1,0.484500,0.390525,0.833333,0.831909
2,0.364200,0.393761,0.843186,0.84151
3,0.311900,0.394396,0.840722,0.839056


DONE TRAINING
 
[32mSTARTING FINAL TESTING OF THE MODEL[0m
 


{'accuracy': 0.8305975049244911, 'f1': 0.8295095463285094}
DONE TESTING


# Variance and bias (K-fold cross-validation)

In [14]:
from datasets import concatenate_datasets  # no longer used in this cell; import kept in case other cells rely on the name
from sklearn.model_selection import KFold

# K-fold cross-validation of the tuned configuration (`training_args`):
# a fresh model is trained on every fold and its validation accuracy / F1 are
# collected in `accuracy` and `f1`.
#
# Everything created inside the loop uses fold-specific names so that `ds`,
# `train_df`, `model` and `trainer` defined by earlier cells are NOT rebound;
# any later cell that uses `trainer` therefore keeps working on the model
# trained before this cell rather than on the last fold's model.

n = 5
kf = KFold(n_splits=n, random_state=42, shuffle=True)

accuracy = []
f1 = []

# Non-mutating rename: `train_df` itself is left untouched. Renaming is a
# no-op when the frame has no "target" column.
set1 = train_df.rename(columns={"target": "labels"})

CV_OUTPUT_DIR = "/content/distilbert-base-uncased"
ARCHIVE_NAME_STEP = 231  # archive folders are named 231, 462, ... (one per fold)
RAW_COLUMNS = ['text', 'id', 'keyword', 'location']  # dropped after tokenization

for fold_number, (train_index, val_index) in enumerate(kf.split(set1), start=1):
  # Move the previous Trainer output folder out of the way.
  # NOTE(review): presumably needed so that from_pretrained(model_nm) does not
  # pick up a local folder with the same name as the Hub checkpoint - confirm.
  if os.path.exists(CV_OUTPUT_DIR):
    fold_archive_dir = os.path.join(os.path.dirname(CV_OUTPUT_DIR), str(fold_number * ARCHIVE_NAME_STEP))
    # os.rename raises OSError if the destination is an existing non-empty
    # directory (e.g. when this cell is run again), so pick a free name.
    while os.path.exists(fold_archive_dir):
      fold_archive_dir += "_old"
    os.rename(CV_OUTPUT_DIR, fold_archive_dir)

  # splitting Dataframe (dataset not included)
  fold_train_df = set1.iloc[train_index]
  fold_val_df = set1.iloc[val_index]
  fold_ds = DatasetDict({
      "train": Dataset.from_pandas(fold_train_df).map(tok_func, batched=True, remove_columns=RAW_COLUMNS),
      "validation": Dataset.from_pandas(fold_val_df).map(tok_func, batched=True, remove_columns=RAW_COLUMNS),
  })

  # cleaning gpu and loading the model
  clean_gpu()
  fold_model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
  # setting up the trainer
  fold_trainer = Trainer(
      model = fold_model,
      args = training_args,
      train_dataset = fold_ds["train"],
      eval_dataset = fold_ds["validation"],
      compute_metrics = compute_metrics,
      data_collator = data_collator,
  )
  # train the model
  fold_trainer.train()
  # access the performance: evaluate once and read both metrics from the same
  # result instead of running the whole validation pass twice
  fold_metrics = fold_trainer.evaluate(fold_ds["validation"])
  # append model score
  f1.append(fold_metrics['eval_f1'])
  accuracy.append(fold_metrics['eval_accuracy'])

  # Drop the references to this fold's model/trainer so they do not pile up on
  # top of the objects kept from earlier cells.
  del fold_trainer, fold_model

Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.00 GB
Cached: 1.34 GB






AFTER CLEANING:
Allocated: 1.00 GB
Cached: 1.08 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.426569,0.820513,0.819992
2,0.493900,0.403528,0.822564,0.819992
2,0.493900,0.40775,0.825641,0.823916
3,0.340600,0.410423,0.825641,0.824088


Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.00 GB
Cached: 1.85 GB


AFTER CLEANING:
Allocated: 1.00 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.424306,0.818462,0.818188
2,0.464900,0.415442,0.82359,0.823545
2,0.464900,0.414216,0.835897,0.834087
3,0.337200,0.417762,0.829744,0.828935


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.00 GB
Cached: 1.85 GB






AFTER CLEANING:
Allocated: 1.00 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.444155,0.799795,0.800589
2,0.466300,0.40277,0.821355,0.819487
2,0.466300,0.411615,0.814168,0.814343
3,0.344700,0.413576,0.818275,0.818351


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.01 GB
Cached: 1.38 GB






AFTER CLEANING:
Allocated: 1.01 GB
Cached: 1.10 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.419743,0.810062,0.808543
2,0.483300,0.389233,0.829569,0.828246
2,0.483300,0.38938,0.834702,0.833302
3,0.340100,0.390736,0.835729,0.834217


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.01 GB
Cached: 1.39 GB






AFTER CLEANING:
Allocated: 1.01 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.43554,0.806982,0.806061
2,0.465700,0.418385,0.818275,0.816741
2,0.465700,0.434145,0.816222,0.814952
3,0.334800,0.438728,0.810062,0.809678


# Saving the model

In [15]:
# ===========================================
# ||                                       ||
# ||Section 11: saving the model           ||
# ||                                       ||
# ===========================================

import os

# Folder that receives the exported model and tokenizer files
output_dir = '/content/output/DISTILBERT'

# Make sure the folder is there; nothing happens when it already exists
os.makedirs(output_dir, exist_ok=True)

# Export the model held by `trainer`, then the tokenizer, into that folder.
# The tokenizer call stays the last expression of the cell so that the tuple
# of written file paths it returns is shown as the cell output.
trainer.save_model(output_dir)
tokz.save_pretrained(output_dir)

('/content/output/DISTILBERT/tokenizer_config.json',
 '/content/output/DISTILBERT/special_tokens_map.json',
 '/content/output/DISTILBERT/vocab.txt',
 '/content/output/DISTILBERT/added_tokens.json',
 '/content/output/DISTILBERT/tokenizer.json')