# pip installing libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install pynvml
!pip install evaluate
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, http

# Utils for GPU

In [2]:
# ===========================================
# ||                                       ||
# ||       Section 1: Importing modules    ||
# ||                                       ||
# ===========================================
import torch
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo


# ===========================================
# ||                                       ||
# ||  Section 2: utils functions for gpu   ||
# ||             and device                ||
# ||                                       ||
# ===========================================

def check_gpu_availability():
    # Check if CUDA is available
    print(f"Cuda is available: {torch.cuda.is_available()}")


def getting_device(gpu_prefence=True) -> torch.device:
    """
    This function gets the torch device to be used for computations,
    based on the GPU preference specified by the user.
    """

    # If GPU is preferred and available, set device to CUDA
    if gpu_prefence and torch.cuda.is_available():
        device = torch.device('cuda')
    # If GPU is not preferred or not available, set device to CPU
    else:
        device = torch.device("cpu")

    # Print the selected device
    print(f"Selected device: {device}")

    # Return the device
    return device


# Define a function to print GPU memory utilization
def print_gpu_utilization():
    # Initialize the PyNVML library
    nvmlInit()
    # Get a handle to the first GPU in the system
    handle = nvmlDeviceGetHandleByIndex(0)
    # Get information about the memory usage on the GPU
    info = nvmlDeviceGetMemoryInfo(handle)
    # Print the GPU memory usage in MB
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")


# Define a function to print training summary information
def print_summary(result):
    # Print the total training time in seconds
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    # Print the number of training samples processed per second
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    # Print the GPU memory utilization
    print_gpu_utilization()

def clean_gpu():
    # Get current GPU memory usage
    print("BEFORE CLEANING:")
    print(f"Allocated: {cuda.memory_allocated() / 1024 ** 3:.2f} GB")
    print(f"Cached: {cuda.memory_cached() / 1024 ** 3:.2f} GB")
    print("\n")
    # Free up PyTorch and CUDA memory
    torch.cuda.empty_cache()
    cuda.empty_cache()

    # Run garbage collection to free up other memory
    gc.collect()

    # Get new GPU memory usage
    print("AFTER CLEANING:")
    print(f"Allocated: {cuda.memory_allocated() / 1024 ** 3:.2f} GB")
    print(f"Cached: {cuda.memory_cached() / 1024 ** 3:.2f} GB")

# Import libraries

In [None]:
# ===========================================
# ||                                       ||
# ||Section 1: Importing modules           ||
# ||                                       ||
# ===========================================

import transformers
from datasets import load_dataset, load_metric,  Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import os
import torch.cuda as cuda
import gc
import optuna
#from utils4gpu import *

# Utils for Model

In [None]:
# ===========================================
# ||                                       ||
# ||Section 2: Utlis 4 DistilBert          ||
# ||                                       ||
# ===========================================

# getting the max length of the tokenized tweet
def getting_max_length(tokenizer, items):

  # initialize a list for lengths
  max_len = 0

  # iterate over the list
  for item in items:

    # record lenght of each item
    lenght_of_item = len(tokenizer.encode(item, add_special_tokens = True))

    if lenght_of_item > max_len:

      max_len = lenght_of_item

  return max_len


# Checking GPU, selecting model and device

In [None]:
# ===========================================
# ||                                       ||
# ||Section 3: checking gpu, choosing      ||
# ||             device, and model         ||
# ||                                       ||
# ===========================================

# CHECK IF GPU IS UP
check_gpu_availability()

# SAVE THE DEVICE WE ARE WORKING WITH
device = getting_device(gpu_prefence=True)

# SHOULD BE FEW MB
print_gpu_utilization()

# SETTING HF CHECKPOINT/MODEL
model_nm = "distilbert-base-uncased"

# Defining dataframes

In [None]:
# ===========================================
# ||                                       ||
# ||Section 4: Importing doc and split     ||
# ||                                       ||
# ===========================================

# Read csv files to create pandas dataframes
path2test = '/content/drive/MyDrive/ML_proj/zaazazza/Copia de test_df.csv'
test_df = pd.read_csv(path2test)

path2val = '/content/drive/MyDrive/ML_proj/zaazazza/Copia de validation_df.csv'
validation_df = pd.read_csv(path2val)

path2train = '/content/drive/MyDrive/ML_proj/zaazazza/Copia de train_df.csv'
train_df = pd.read_csv(path2train)

# Renaming columns
train_df.rename(columns = {"target":"labels"}, inplace = True)
validation_df.rename(columns = {"target":"labels"}, inplace = True)
test_df.rename(columns = {"target":"labels"}, inplace = True)

# pandas2dataset
ds_train = Dataset.from_pandas(train_df)
ds_validation = Dataset.from_pandas(validation_df)
ds_test = Dataset.from_pandas(test_df)

# Tokenization, tensorization and collider

In [None]:
# ===========================================
# ||                                       ||
# ||Section 5: tokenization, tensorization ||
# ||              and collider             ||
# ||                                       ||
# ===========================================

# IMPORTING OUR TOKENIZER
tokz = AutoTokenizer.from_pretrained(model_nm)

# GETTING THE LENGHT MAX
max_len = getting_max_length(tokz,ds_train["text"])

# DEFINING A TOKENIZE FUNCTION TO TOKENIZE BOTH THE TWO DATASETS
def tok_func(x): return tokz(x["text"], truncation=True, padding = "max_length", max_length=max_len)

# TOKENIZING THE DS
tok_ds_train = ds_train.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
tok_ds_validation = ds_validation.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
tok_ds_test = ds_test.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])

# CREATE A DATASET TO FEED THE MODEL
ds = DatasetDict({"train":tok_ds_train,
                  "validation":tok_ds_validation,
             "test": tok_ds_test})

# GETTING THE COLLATOR
data_collator = DataCollatorWithPadding(tokenizer=tokz)

# Metrics

In [None]:
# ===========================================
# ||                                       ||
# ||Section 6: metrics                     ||
# ||                                       ||
# ===========================================

# 1) F1 and ACCURACY

# now that we have our callable object, we define a function that the trainer can use to compute its metric => we cannot call directly metrics.compute because the output
# of the model is not a prediction but a logist
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}

# Hyperparameter tuning

In [None]:
# ===========================================
# ||                                       ||
# ||Section 7: hyperparameter search       ||
# ||                                       ||
# ===========================================
num_r = 1

# Define the search space for hyperparameters using Optuna's distributions.
def objective(trial):
    global num_r

    # Rename folder containing the old model and files
    if os.path.exists("/content/distilbert-base-uncased"):
      os.rename("/content/distilbert-base-uncased", os.path.join(os.path.dirname("/content/distilbert-base-uncased"), str(num_r)))
    num_r += 1

    # IMPORTING THE MODEL
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
    # checking if the model is on the gpu
    print_gpu_utilization()

    # Hyperparameters to optimize
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-6, 1e-4)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    num_train_epochs = trial.suggest_int('num_train_epochs', 1, 5)

    # setting the hyperparameter for the trainer
    training_args = TrainingArguments(
        model_nm,
        evaluation_strategy = "epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_steps = 50, # FROM BELOW MEMORY TRICKS
        gradient_accumulation_steps=16, # adding them to offset small batch size due to memory problem => so 2*8 => 16 batch-size traning
        fp16 = True
    )

    # passing in the hyperparameter for the trainer
    trainer = Trainer(
        model = model, # our model
        args = training_args, # hyperparameter defined before
        train_dataset = ds["train"],
        eval_dataset = ds["validation"],
        compute_metrics = compute_metrics, # evaluation function defined before
        data_collator = data_collator,
    )

    # TRAINING LOOP
    print(" ")
    print("START TRAINING")
    print(" ")
    trainer.train()
    print("DONE TRAINING")

    # TESTING
    print(" ")
    print("START VALIDATION")
    print(" ")
    predictions = trainer.predict(ds["validation"])
    eval_result = compute_metrics(predictions)
    print(eval_result)
    print("DONE VALIDATION")

    # Return the evaluation metric to be optimized by Optuna.
    return 1 - eval_result['f1']

# Define the Optuna study and run the hyperparameter search.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=2)
print(study.best_params)
best_trial = study.best_trial

# Train the model with the best hyperparameters found by optuna and evaluate it on the test data.
best_num_train_epochs = best_trial.params['num_train_epochs']
best_weight_decay = best_trial.params['weight_decay']
best_learning_rate = best_trial.params['learning_rate']


# Model

In [None]:
# ===========================================
# ||                                       ||
# ||Section 8: the model and hyperparam    ||
# ||                                       ||
# ===========================================

# Rename folder containing the old model
if os.path.exists("/content/distilbert-base-uncased"):
    os.rename("/content/distilbert-base-uncased", os.path.join(os.path.dirname("/content/distilbert-base-uncased"), '1'))

# IMPORTING THE MODEL
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
# checking if the model is on the gpu
print_gpu_utilization()

# setting the hyperparameter for the trainer
training_args = TrainingArguments(
    model_nm,
    evaluation_strategy = "epoch",
    learning_rate=best_learning_rate,
    weight_decay=best_weight_decay,
    num_train_epochs=best_num_train_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps = 50, # FROM BELOW MEMORY TRICKS
    gradient_accumulation_steps=16, # adding them to offset small batch size due to memory problem => so 2*8 => 16 batch-size traning
    fp16 = True
    )

# passing in the hyperparameter for the trainer
trainer = Trainer(
    model = model, # our model
    args = training_args, # hyperparameter defined before
    train_dataset = ds["train"],
    eval_dataset = ds["validation"],
    compute_metrics = compute_metrics, # evaluation function defined before
    data_collator = data_collator,
)


# Training and testing the model

In [None]:
# ===========================================
# ||                                       ||
# ||Section 9: training and testing        ||
# ||                                       ||
# ===========================================

# TRAINING LOOP
print(" ")
print("START TRAINING ")
print(" ")
trainer.train()
print("DONE TRAINING")

# TESTING
print(" ")
print("\033[32mSTARTING FINAL TESTING OF THE MODEL\033[0m")
print(" ")
predictions = trainer.predict(ds["test"])
eval_result = compute_metrics(predictions)
print(eval_result)
print("DONE TESTING")

# Variation and bias

In [None]:
# ===========================================
# ||                                       ||
# ||Section 10: valriation and bias        ||
# ||                                       ||
# ===========================================

from datasets import concatenate_datasets
from sklearn.model_selection import KFold

ds = concatenate_datasets([tok_ds_train, tok_ds_validation, tok_ds_test])

n=5
kf = KFold(n_splits=n, random_state=42, shuffle=True)

accuracy = []
f1 = []
set1 = train_df
set1.rename(columns = {"target":"labels"}, inplace = True)
i = 0
for train_index, val_index in kf.split(set1):
  i+=231
  if os.path.exists("/content/distilbert-base-uncased"):
     os.rename("/content/distilbert-base-uncased", os.path.join(os.path.dirname("/content/distilbert-base-uncased"), str(i)))
  # splitting Dataframe (dataset not included)
  train_df = set1.iloc[train_index]
  val_df = set1.iloc[val_index]
  ds_train = Dataset.from_pandas(train_df)
  ds_validation = Dataset.from_pandas(val_df)
  tok_ds_train = ds_train.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
  tok_ds_validation = ds_validation.map(tok_func, batched=True, remove_columns=['text','id', 'keyword', 'location'])
  ds = DatasetDict({"train":tok_ds_train, "validation":tok_ds_validation})

  # cleaning gpu and loading the model
  clean_gpu()
  model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels = 2).to(device)
  # setting up the trainer
  trainer = Trainer(model = model, args = training_args, train_dataset = ds["train"], eval_dataset = ds["validation"], compute_metrics = compute_metrics, data_collator = data_collator)
  # train the model
  trainer.train()
  # access the performance
  eval_accuracy = trainer.evaluate(ds["validation"])['eval_accuracy']
  eval_f1 = trainer.evaluate(ds["validation"])['eval_f1']
  # append model score
  f1.append(eval_f1)
  accuracy.append(eval_accuracy)

# Saving the model

In [3]:
# ===========================================
# ||                                       ||
# ||Section 11: saving the model           ||
# ||                                       ||
# ===========================================

import os
# Set the output directory
output_dir = '/content/output/DISTILBERT'

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model and tokenizer to the output directory
trainer.save_model(output_dir)
tokz.save_pretrained(output_dir)

Cuda is available: True
Selected device: cuda
GPU memory occupied: 261 MB.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

[32m[I 2023-03-25 10:05:10,188][0m A new study created in memory with name: no-name-d952e40a-e212-4315-bafc-40255b41d1fd[0m


Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

GPU memory occupied: 1355 MB.
 
START TRAINING 
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.39833,0.829228,0.828893
1,0.432100,0.385551,0.844007,0.841558


DONE TRAINING
 
START TESTING
 


[32m[I 2023-03-25 10:06:11,246][0m Trial 0 finished with value: 0.15599343185550085 and parameters: {'learning_rate': 9.735390759009397e-05, 'weight_decay': 0.000742216939692806, 'num_train_epochs': 2}. Best is trial 0 with value: 0.15599343185550085.[0m


{'accuracy': 0.8440065681444991, 'f1': 0.8415582228864569}
DONE TESTING


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

GPU memory occupied: 2715 MB.
 
START TRAINING 
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.618441,0.77422,0.767392
1,0.651400,0.572521,0.772578,0.770819


DONE TRAINING
 
START TESTING
 


[32m[I 2023-03-25 10:06:54,188][0m Trial 1 finished with value: 0.22742200328407225 and parameters: {'learning_rate': 5.010021915669521e-06, 'weight_decay': 9.50183632963282e-06, 'num_train_epochs': 2}. Best is trial 0 with value: 0.15599343185550085.[0m


{'accuracy': 0.7725779967159278, 'f1': 0.7708193481232505}
DONE TESTING
{'learning_rate': 9.735390759009397e-05, 'weight_decay': 0.000742216939692806, 'num_train_epochs': 2}


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

GPU memory occupied: 2717 MB.
 
START TRAINING 
 


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.383084,0.843186,0.840684
1,0.430300,0.381728,0.844007,0.842235


DONE TRAINING
 
START TESTING
 


{'accuracy': 0.835193696651346, 'f1': 0.8338039863012869}
DONE TESTING


Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.00 GB
Cached: 1.39 GB




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

AFTER CLEANING:
Allocated: 1.00 GB
Cached: 1.11 GB


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.407349,0.82359,0.822716
1,0.421100,0.408972,0.834872,0.832759


Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

Map:   0%|          | 0/975 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.00 GB
Cached: 1.60 GB






AFTER CLEANING:
Allocated: 1.00 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.391088,0.831795,0.830654
1,0.415700,0.39894,0.833846,0.833413


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.01 GB
Cached: 1.89 GB


AFTER CLEANING:
Allocated: 1.01 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.426973,0.808008,0.808749
1,0.420700,0.411723,0.831622,0.831472


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.01 GB
Cached: 1.89 GB


AFTER CLEANING:
Allocated: 1.01 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.401921,0.834702,0.832796
1,0.430200,0.398584,0.831622,0.830469


Map:   0%|          | 0/3898 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

BEFORE CLEANING:
Allocated: 1.01 GB
Cached: 1.89 GB






AFTER CLEANING:
Allocated: 1.01 GB
Cached: 1.09 GB


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.405656,0.824435,0.822158
1,0.420900,0.442823,0.811088,0.811333


('/content/output/DISTILBERT/tokenizer_config.json',
 '/content/output/DISTILBERT/special_tokens_map.json',
 '/content/output/DISTILBERT/vocab.txt',
 '/content/output/DISTILBERT/added_tokens.json',
 '/content/output/DISTILBERT/tokenizer.json')