In [1]:
from datasets import Dataset, DatasetDict, load_dataset
import os
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)



In [2]:
ds = load_dataset('yelp_review_full')

In [3]:
ds['train'][0]

{'label': 4,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}

In [4]:
ds['train'].features


{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [6]:
import pandas as pd
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])


In [7]:
ds_train.head
ds_test.head

<bound method NDFrame.head of        label                                               text
0          0  I got 'new' tires from them and within two wee...
1          0  Don't waste your time.  We had two different p...
2          0  All I can say is the worst! We were the only 2...
3          0  I have been to this restaurant twice and was d...
4          0  Food was NOT GOOD at all! My husband & I ate h...
...      ...                                                ...
49995      0  Just wanted to write a review to chip in with ...
49996      4  Great ambience. Great drinks. Great food. I lo...
49997      3  I have been to the other Monks locations so I ...
49998      1  Don't go here.  I know you might want to try i...
49999      0  Buffet was recently open after renovation so m...

[50000 rows x 2 columns]>

In [29]:
#rename column label to labels
ds_train.rename(columns={'label':'labels'}, inplace=True)

In [30]:


# assign the splits
train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)
# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)
# view the resulting dataset dict object
new_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [31]:
new_ds_path = "../results/new_ds"
print("Saving DatasetDict (new_ds) to disk...")
new_ds.save_to_disk(new_ds_path)

Saving DatasetDict (new_ds) to disk...


Saving the dataset (0/1 shards):   0%|          | 0/650000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

In [32]:
# Define model and tokenizer
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to save the tokenized dataset
tokenized_dataset_path = "../results/tokenized_dataset"

# Check if tokenized dataset exists
if os.path.exists(tokenized_dataset_path):
    print("Loading tokenized dataset from disk...")
    tokenized_ds = DatasetDict.load_from_disk(tokenized_dataset_path)
else:
    print("Tokenizing dataset...")

    # Tokenization function
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

    # Assuming you already have the splits as DataFrames
    train = Dataset.from_pandas(ds_train)
    test = Dataset.from_pandas(ds_test)

    # Reconstruct the datasets into a DatasetDict object
    new_ds = DatasetDict({"train": train, "test": test})

    # Tokenize the dataset
    tokenized_ds = new_ds.map(tokenize_function, batched=True)

    # Remove unnecessary columns and format dataset
    tokenized_ds = tokenized_ds.remove_columns(["text"])
   
    tokenized_ds.set_format("torch")

    # Save tokenized dataset to disk
    tokenized_ds.save_to_disk(tokenized_dataset_path)
    print("Tokenized dataset saved to disk.")

Loading tokenized dataset from disk...


In [10]:
# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",               # Directory to save results
    eval_strategy="epoch",         # Evaluate after every epoch
    save_strategy="epoch",               # Save model after every epoch
    learning_rate=2e-5,                  # Default learning rate (to be tuned)
    per_device_train_batch_size=16,      # Default batch size (to be tuned)
    per_device_eval_batch_size=16,       # Batch size for evaluation
    num_train_epochs=3,                  # Default number of epochs (to be tuned)
    weight_decay=0.01,                   # Regularization
    gradient_accumulation_steps=4,       # Accumulate gradients over 4 steps
    fp16=True,                           # Mixed-precision training
    logging_dir="./logs",                # Log directory
    logging_steps=10,                    # Log every 10 steps
    save_total_limit=2,                  # Limit saved checkpoints to 2
    disable_tqdm=False                   # Enable progress bar
)

In [12]:
# Define model initialization for hyperparameter search
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)


In [13]:
# Define hyperparameter search space
def hyperparameter_search(trial):
    return {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
        "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
    }

In [17]:
# Initialize the Trainer
trainer = Trainer(
    model_init=model_init,  # For hyperparameter search
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    data_collator=data_collator,  # Use the data collator for dynamic padding
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Perform hyperparameter search
print("Starting hyperparameter search...")
best_run = trainer.hyperparameter_search(
    n_trials=5,  # Reduce for quicker results
    direction="minimize",  # Minimize evaluation loss
    compute_objective=lambda metrics: metrics["eval_loss"],
    hp_space=lambda trial: hyperparameter_search(trial),
)

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_run.hyperparameters)

[I 2024-12-18 01:57:32,626] A new study created in memory with name: no-name-68d38268-dbd2-41e3-8e7f-e8d372e28c92
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting hyperparameter search...


  0%|          | 0/40624 [00:00<?, ?it/s]

{'loss': 6.4667, 'grad_norm': 4.195240020751953, 'learning_rate': 2.744639372496024e-05, 'epoch': 0.0}
{'loss': 6.4117, 'grad_norm': 7.178688049316406, 'learning_rate': 2.7439635859759827e-05, 'epoch': 0.0}
{'loss': 6.1921, 'grad_norm': 10.30695915222168, 'learning_rate': 2.7432877994559412e-05, 'epoch': 0.0}
{'loss': 5.7433, 'grad_norm': 16.48085594177246, 'learning_rate': 2.7426120129358997e-05, 'epoch': 0.0}
{'loss': 5.3887, 'grad_norm': 26.727764129638672, 'learning_rate': 2.7420038050678627e-05, 'epoch': 0.0}
{'loss': 4.9027, 'grad_norm': inf, 'learning_rate': 2.7414631758518297e-05, 'epoch': 0.0}
{'loss': 4.9174, 'grad_norm': 35.01726150512695, 'learning_rate': 2.7407873893317882e-05, 'epoch': 0.0}
{'loss': 4.5049, 'grad_norm': 19.07581901550293, 'learning_rate': 2.7401116028117467e-05, 'epoch': 0.0}
{'loss': 4.7778, 'grad_norm': 29.106046676635742, 'learning_rate': 2.7394358162917055e-05, 'epoch': 0.0}
{'loss': 4.4462, 'grad_norm': 22.236589431762695, 'learning_rate': 2.73876002

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.749480128288269, 'eval_runtime': 26.0466, 'eval_samples_per_second': 1919.634, 'eval_steps_per_second': 119.977, 'epoch': 1.0}
{'loss': 2.5557, 'grad_norm': 16.722673416137695, 'learning_rate': 1.372860315464045e-05, 'epoch': 1.0}
{'loss': 2.8236, 'grad_norm': 18.9008731842041, 'learning_rate': 1.3721845289440038e-05, 'epoch': 1.0}
{'loss': 2.9013, 'grad_norm': 23.665725708007812, 'learning_rate': 1.3715087424239624e-05, 'epoch': 1.0}
{'loss': 2.8632, 'grad_norm': 17.052701950073242, 'learning_rate': 1.3708329559039209e-05, 'epoch': 1.0}
{'loss': 2.7015, 'grad_norm': 23.123241424560547, 'learning_rate': 1.3701571693838797e-05, 'epoch': 1.0}
{'loss': 3.0523, 'grad_norm': 21.87013816833496, 'learning_rate': 1.3694813828638384e-05, 'epoch': 1.0}
{'loss': 2.8337, 'grad_norm': 22.068317413330078, 'learning_rate': 1.3688055963437968e-05, 'epoch': 1.0}
{'loss': 2.8529, 'grad_norm': 16.936613082885742, 'learning_rate': 1.3681298098237555e-05, 'epoch': 1.0}
{'loss': 2.8131, 'gra

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7349283695220947, 'eval_runtime': 26.1357, 'eval_samples_per_second': 1913.09, 'eval_steps_per_second': 119.568, 'epoch': 2.0}


[I 2024-12-18 02:49:46,041] Trial 0 finished with value: 0.7349283695220947 and parameters: {'learning_rate': 2.7453151590160654e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.026147972761883034}. Best is trial 0 with value: 0.7349283695220947.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 3133.1722, 'train_samples_per_second': 414.915, 'train_steps_per_second': 12.966, 'train_loss': 2.9958965458847397, 'epoch': 2.0}


  0%|          | 0/15234 [00:00<?, ?it/s]

{'loss': 6.4417, 'grad_norm': 3.226839780807495, 'learning_rate': 2.5780279603459225e-05, 'epoch': 0.0}
{'loss': 6.255, 'grad_norm': 5.888062953948975, 'learning_rate': 2.5763345631044974e-05, 'epoch': 0.0}
{'loss': 5.6512, 'grad_norm': 7.830087184906006, 'learning_rate': 2.574641165863072e-05, 'epoch': 0.01}
{'loss': 5.0467, 'grad_norm': 11.707928657531738, 'learning_rate': 2.573117108345789e-05, 'epoch': 0.01}
{'loss': 4.5929, 'grad_norm': 26.49519920349121, 'learning_rate': 2.5714237111043637e-05, 'epoch': 0.01}
{'loss': 4.4307, 'grad_norm': 14.813034057617188, 'learning_rate': 2.5698996535870808e-05, 'epoch': 0.01}
{'loss': 4.3643, 'grad_norm': 17.464025497436523, 'learning_rate': 2.5682062563456554e-05, 'epoch': 0.01}
{'loss': 4.2865, 'grad_norm': 27.721126556396484, 'learning_rate': 2.5665128591042304e-05, 'epoch': 0.02}
{'loss': 4.1154, 'grad_norm': 19.736249923706055, 'learning_rate': 2.564819461862805e-05, 'epoch': 0.02}
{'loss': 4.0224, 'grad_norm': 11.84207534790039, 'learni

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7529968619346619, 'eval_runtime': 26.0512, 'eval_samples_per_second': 1919.296, 'eval_steps_per_second': 119.956, 'epoch': 1.0}
{'loss': 2.6177, 'grad_norm': 14.150666236877441, 'learning_rate': 1.7201529178398502e-05, 'epoch': 1.0}
{'loss': 2.9368, 'grad_norm': 14.344388961791992, 'learning_rate': 1.7184595205984248e-05, 'epoch': 1.0}
{'loss': 2.9343, 'grad_norm': 16.740163803100586, 'learning_rate': 1.7167661233569994e-05, 'epoch': 1.0}
{'loss': 2.9558, 'grad_norm': 11.252341270446777, 'learning_rate': 1.7150727261155744e-05, 'epoch': 1.01}
{'loss': 2.8392, 'grad_norm': 14.752829551696777, 'learning_rate': 1.713379328874149e-05, 'epoch': 1.01}
{'loss': 2.8961, 'grad_norm': 14.084866523742676, 'learning_rate': 1.7116859316327236e-05, 'epoch': 1.01}
{'loss': 2.9166, 'grad_norm': 12.588260650634766, 'learning_rate': 1.7099925343912985e-05, 'epoch': 1.01}
{'loss': 2.8875, 'grad_norm': 14.811295509338379, 'learning_rate': 1.708299137149873e-05, 'epoch': 1.01}
{'loss': 2.92

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7416584491729736, 'eval_runtime': 26.0467, 'eval_samples_per_second': 1919.632, 'eval_steps_per_second': 119.977, 'epoch': 2.0}
{'loss': 2.4612, 'grad_norm': 11.124466896057129, 'learning_rate': 8.602457986440676e-06, 'epoch': 2.0}
{'loss': 2.6087, 'grad_norm': 10.700343132019043, 'learning_rate': 8.585524014026423e-06, 'epoch': 2.0}
{'loss': 2.5796, 'grad_norm': 11.043747901916504, 'learning_rate': 8.568590041612171e-06, 'epoch': 2.0}
{'loss': 2.6999, 'grad_norm': 12.550935745239258, 'learning_rate': 8.551656069197917e-06, 'epoch': 2.01}
{'loss': 2.6499, 'grad_norm': 11.8192777633667, 'learning_rate': 8.534722096783665e-06, 'epoch': 2.01}
{'loss': 2.6626, 'grad_norm': 13.287158966064453, 'learning_rate': 8.51778812436941e-06, 'epoch': 2.01}
{'loss': 2.6433, 'grad_norm': 14.27581787109375, 'learning_rate': 8.500854151955157e-06, 'epoch': 2.01}
{'loss': 2.5232, 'grad_norm': 12.664811134338379, 'learning_rate': 8.483920179540903e-06, 'epoch': 2.01}
{'loss': 2.7357, 'grad_

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7447477579116821, 'eval_runtime': 26.0435, 'eval_samples_per_second': 1919.869, 'eval_steps_per_second': 119.992, 'epoch': 3.0}


[I 2024-12-18 03:46:18,320] Trial 1 finished with value: 0.7447477579116821 and parameters: {'learning_rate': 2.579721357587348e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.0977724079835197}. Best is trial 0 with value: 0.7349283695220947.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 3392.0254, 'train_samples_per_second': 574.878, 'train_steps_per_second': 4.491, 'train_loss': 2.9306170724110294, 'epoch': 3.0}


  0%|          | 0/15234 [00:00<?, ?it/s]

{'loss': 6.4674, 'grad_norm': 3.333341598510742, 'learning_rate': 1.010303666945315e-05, 'epoch': 0.0}
{'loss': 6.4131, 'grad_norm': 2.9642951488494873, 'learning_rate': 1.0096400413101697e-05, 'epoch': 0.0}
{'loss': 6.3546, 'grad_norm': 4.077013969421387, 'learning_rate': 1.0089764156750245e-05, 'epoch': 0.01}
{'loss': 6.2039, 'grad_norm': 5.476590156555176, 'learning_rate': 1.008312790039879e-05, 'epoch': 0.01}
{'loss': 5.7949, 'grad_norm': 8.045862197875977, 'learning_rate': 1.0076491644047335e-05, 'epoch': 0.01}
{'loss': 5.3318, 'grad_norm': 7.592185974121094, 'learning_rate': 1.0070519013331028e-05, 'epoch': 0.01}
{'loss': 5.0426, 'grad_norm': 9.931798934936523, 'learning_rate': 1.0063882756979575e-05, 'epoch': 0.01}
{'loss': 4.8407, 'grad_norm': 10.724242210388184, 'learning_rate': 1.005724650062812e-05, 'epoch': 0.02}
{'loss': 4.5872, 'grad_norm': 17.197935104370117, 'learning_rate': 1.0050610244276666e-05, 'epoch': 0.02}
{'loss': 4.4559, 'grad_norm': 10.498516082763672, 'learni

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7817343473434448, 'eval_runtime': 26.0484, 'eval_samples_per_second': 1919.5, 'eval_steps_per_second': 119.969, 'epoch': 1.0}
{'loss': 2.7526, 'grad_norm': 17.996774673461914, 'learning_rate': 6.740445576171549e-06, 'epoch': 1.0}
{'loss': 3.1274, 'grad_norm': 16.899744033813477, 'learning_rate': 6.733809319820094e-06, 'epoch': 1.0}
{'loss': 3.1403, 'grad_norm': 19.028156280517578, 'learning_rate': 6.727173063468641e-06, 'epoch': 1.0}
{'loss': 3.1404, 'grad_norm': 13.68153190612793, 'learning_rate': 6.7205368071171875e-06, 'epoch': 1.01}
{'loss': 3.0463, 'grad_norm': 15.240152359008789, 'learning_rate': 6.713900550765734e-06, 'epoch': 1.01}
{'loss': 3.0811, 'grad_norm': 17.49955177307129, 'learning_rate': 6.707264294414279e-06, 'epoch': 1.01}
{'loss': 3.0504, 'grad_norm': 14.551804542541504, 'learning_rate': 6.700628038062826e-06, 'epoch': 1.01}
{'loss': 3.0894, 'grad_norm': 20.04593276977539, 'learning_rate': 6.6939917817113725e-06, 'epoch': 1.01}
{'loss': 3.1063, 'grad

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7630245089530945, 'eval_runtime': 26.0521, 'eval_samples_per_second': 1919.228, 'eval_steps_per_second': 119.952, 'epoch': 2.0}
{'loss': 2.6463, 'grad_norm': 14.92687702178955, 'learning_rate': 3.371218226538492e-06, 'epoch': 2.0}
{'loss': 2.8555, 'grad_norm': 14.415648460388184, 'learning_rate': 3.364581970187039e-06, 'epoch': 2.0}
{'loss': 2.8135, 'grad_norm': 13.619681358337402, 'learning_rate': 3.357945713835585e-06, 'epoch': 2.0}
{'loss': 2.9636, 'grad_norm': 15.508460998535156, 'learning_rate': 3.3513094574841313e-06, 'epoch': 2.01}
{'loss': 2.8882, 'grad_norm': 17.381980895996094, 'learning_rate': 3.3446732011326776e-06, 'epoch': 2.01}
{'loss': 2.874, 'grad_norm': 17.94887924194336, 'learning_rate': 3.338036944781224e-06, 'epoch': 2.01}
{'loss': 2.9589, 'grad_norm': 16.974979400634766, 'learning_rate': 3.33140068842977e-06, 'epoch': 2.01}
{'loss': 2.7794, 'grad_norm': 15.728072166442871, 'learning_rate': 3.3247644320783163e-06, 'epoch': 2.01}
{'loss': 2.9299, 'gr

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7560692429542542, 'eval_runtime': 26.0476, 'eval_samples_per_second': 1919.566, 'eval_steps_per_second': 119.973, 'epoch': 3.0}


[I 2024-12-18 04:43:25,172] Trial 2 finished with value: 0.7560692429542542 and parameters: {'learning_rate': 1.0109672925804605e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.01453663851841797}. Best is trial 0 with value: 0.7349283695220947.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 3426.5529, 'train_samples_per_second': 569.085, 'train_steps_per_second': 4.446, 'train_loss': 3.138066788616183, 'epoch': 3.0}


  0%|          | 0/81248 [00:00<?, ?it/s]

{'loss': 6.4909, 'grad_norm': 4.459045886993408, 'learning_rate': 1.2454585436440294e-05, 'epoch': 0.0}
{'loss': 6.4482, 'grad_norm': 5.776541233062744, 'learning_rate': 1.2453052337959725e-05, 'epoch': 0.0}
{'loss': 6.3633, 'grad_norm': 7.362138271331787, 'learning_rate': 1.2451519239479158e-05, 'epoch': 0.0}
{'loss': 6.3221, 'grad_norm': 5.211766719818115, 'learning_rate': 1.244998614099859e-05, 'epoch': 0.0}
{'loss': 6.2366, 'grad_norm': 6.08654260635376, 'learning_rate': 1.244845304251802e-05, 'epoch': 0.0}
{'loss': 5.9581, 'grad_norm': 13.484685897827148, 'learning_rate': 1.2446919944037452e-05, 'epoch': 0.0}
{'loss': 5.8127, 'grad_norm': 17.4506778717041, 'learning_rate': 1.244554015540494e-05, 'epoch': 0.0}
{'loss': 5.3582, 'grad_norm': 14.792344093322754, 'learning_rate': 1.2444007056924374e-05, 'epoch': 0.0}
{'loss': 5.1247, 'grad_norm': 17.756420135498047, 'learning_rate': 1.2442627268291862e-05, 'epoch': 0.0}
{'loss': 4.8448, 'grad_norm': 17.38307762145996, 'learning_rate': 

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7593052983283997, 'eval_runtime': 26.0587, 'eval_samples_per_second': 1918.748, 'eval_steps_per_second': 119.922, 'epoch': 1.0}
{'loss': 2.6793, 'grad_norm': 23.104854583740234, 'learning_rate': 9.342548830734817e-06, 'epoch': 1.0}
{'loss': 2.9512, 'grad_norm': 24.207077026367188, 'learning_rate': 9.34101573225425e-06, 'epoch': 1.0}
{'loss': 3.004, 'grad_norm': 27.734569549560547, 'learning_rate': 9.339482633773682e-06, 'epoch': 1.0}
{'loss': 3.0472, 'grad_norm': 22.520002365112305, 'learning_rate': 9.337949535293113e-06, 'epoch': 1.0}
{'loss': 2.853, 'grad_norm': 23.700571060180664, 'learning_rate': 9.336416436812544e-06, 'epoch': 1.0}
{'loss': 3.1474, 'grad_norm': 21.05234146118164, 'learning_rate': 9.334883338331976e-06, 'epoch': 1.0}
{'loss': 2.9475, 'grad_norm': 26.004533767700195, 'learning_rate': 9.333350239851409e-06, 'epoch': 1.0}
{'loss': 2.9187, 'grad_norm': 17.669713973999023, 'learning_rate': 9.33181714137084e-06, 'epoch': 1.0}
{'loss': 2.8747, 'grad_norm':

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7411010265350342, 'eval_runtime': 26.0526, 'eval_samples_per_second': 1919.194, 'eval_steps_per_second': 119.95, 'epoch': 2.0}
{'loss': 2.563, 'grad_norm': 26.35799789428711, 'learning_rate': 6.2300522954851704e-06, 'epoch': 2.0}
{'loss': 2.7501, 'grad_norm': 27.56271743774414, 'learning_rate': 6.228519197004603e-06, 'epoch': 2.0}
{'loss': 2.7318, 'grad_norm': 20.456436157226562, 'learning_rate': 6.226986098524033e-06, 'epoch': 2.0}
{'loss': 2.5752, 'grad_norm': 19.6961669921875, 'learning_rate': 6.225453000043465e-06, 'epoch': 2.0}
{'loss': 2.4844, 'grad_norm': 15.68467903137207, 'learning_rate': 6.223919901562897e-06, 'epoch': 2.0}
{'loss': 2.715, 'grad_norm': 19.478559494018555, 'learning_rate': 6.222386803082329e-06, 'epoch': 2.0}
{'loss': 2.664, 'grad_norm': 27.342504501342773, 'learning_rate': 6.220853704601761e-06, 'epoch': 2.0}
{'loss': 2.6375, 'grad_norm': 19.910036087036133, 'learning_rate': 6.219320606121192e-06, 'epoch': 2.0}
{'loss': 2.5622, 'grad_norm': 19

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7510074973106384, 'eval_runtime': 26.0498, 'eval_samples_per_second': 1919.4, 'eval_steps_per_second': 119.963, 'epoch': 3.0}
{'loss': 2.5394, 'grad_norm': 22.779470443725586, 'learning_rate': 3.1175557602355226e-06, 'epoch': 3.0}
{'loss': 2.4077, 'grad_norm': 24.262981414794922, 'learning_rate': 3.1160226617549544e-06, 'epoch': 3.0}
{'loss': 2.5532, 'grad_norm': 24.482200622558594, 'learning_rate': 3.1144895632743857e-06, 'epoch': 3.0}
{'loss': 2.3108, 'grad_norm': 22.50092315673828, 'learning_rate': 3.112956464793818e-06, 'epoch': 3.0}
{'loss': 2.4259, 'grad_norm': 28.293716430664062, 'learning_rate': 3.1114233663132495e-06, 'epoch': 3.0}
{'loss': 2.3157, 'grad_norm': 28.022859573364258, 'learning_rate': 3.1098902678326813e-06, 'epoch': 3.0}
{'loss': 2.5411, 'grad_norm': 23.422000885009766, 'learning_rate': 3.1083571693521134e-06, 'epoch': 3.0}
{'loss': 2.6224, 'grad_norm': 35.036556243896484, 'learning_rate': 3.106824070871545e-06, 'epoch': 3.0}
{'loss': 2.3688, 'gra

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7667703628540039, 'eval_runtime': 26.0424, 'eval_samples_per_second': 1919.947, 'eval_steps_per_second': 119.997, 'epoch': 4.0}


[I 2024-12-18 06:28:04,823] Trial 3 finished with value: 0.7667703628540039 and parameters: {'learning_rate': 1.2456118534920863e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.0793213161030783}. Best is trial 0 with value: 0.7349283695220947.
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 5e-5),
  "weight_decay": trial.suggest_uniform("weight_decay", 0.01, 0.1),
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 6279.3624, 'train_samples_per_second': 414.055, 'train_steps_per_second': 12.939, 'train_loss': 2.822665405700694, 'epoch': 4.0}


  0%|          | 0/81248 [00:00<?, ?it/s]

{'loss': 6.4692, 'grad_norm': 4.148629665374756, 'learning_rate': 2.5428993841916378e-05, 'epoch': 0.0}
{'loss': 6.4183, 'grad_norm': 7.062657356262207, 'learning_rate': 2.542586365729318e-05, 'epoch': 0.0}
{'loss': 6.2226, 'grad_norm': 8.98585319519043, 'learning_rate': 2.5422733472669985e-05, 'epoch': 0.0}
{'loss': 5.8275, 'grad_norm': 13.25234603881836, 'learning_rate': 2.5419603288046792e-05, 'epoch': 0.0}
{'loss': 5.4448, 'grad_norm': 17.301095962524414, 'learning_rate': 2.5416786121885915e-05, 'epoch': 0.0}
{'loss': 4.9427, 'grad_norm': 47.266170501708984, 'learning_rate': 2.541396895572504e-05, 'epoch': 0.0}
{'loss': 5.3825, 'grad_norm': 43.83895492553711, 'learning_rate': 2.5411151789564165e-05, 'epoch': 0.0}
{'loss': 4.9302, 'grad_norm': 39.65786361694336, 'learning_rate': 2.540802160494097e-05, 'epoch': 0.0}
{'loss': 4.7819, 'grad_norm': 30.128358840942383, 'learning_rate': 2.5404891420317772e-05, 'epoch': 0.0}
{'loss': 4.4144, 'grad_norm': 19.970502853393555, 'learning_rate'

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7559086084365845, 'eval_runtime': 26.0645, 'eval_samples_per_second': 1918.321, 'eval_steps_per_second': 119.895, 'epoch': 1.0}
{'loss': 2.5635, 'grad_norm': 17.58305549621582, 'learning_rate': 1.907471905682932e-05, 'epoch': 1.0}
{'loss': 2.9298, 'grad_norm': 21.182662963867188, 'learning_rate': 1.9071588872206125e-05, 'epoch': 1.0}
{'loss': 2.8906, 'grad_norm': 23.627910614013672, 'learning_rate': 1.906845868758293e-05, 'epoch': 1.0}
{'loss': 2.8736, 'grad_norm': 17.781612396240234, 'learning_rate': 1.9065328502959733e-05, 'epoch': 1.0}
{'loss': 2.732, 'grad_norm': 18.779003143310547, 'learning_rate': 1.9062198318336537e-05, 'epoch': 1.0}
{'loss': 3.0514, 'grad_norm': 17.772581100463867, 'learning_rate': 1.905906813371334e-05, 'epoch': 1.0}
{'loss': 2.7641, 'grad_norm': 21.45698356628418, 'learning_rate': 1.9055937949090147e-05, 'epoch': 1.0}
{'loss': 2.8643, 'grad_norm': 16.256107330322266, 'learning_rate': 1.905280776446695e-05, 'epoch': 1.0}
{'loss': 2.8368, 'grad_

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7415239214897156, 'eval_runtime': 26.0548, 'eval_samples_per_second': 1919.035, 'eval_steps_per_second': 119.94, 'epoch': 2.0}
{'loss': 2.397, 'grad_norm': 16.939075469970703, 'learning_rate': 1.2719818234817622e-05, 'epoch': 2.0}
{'loss': 2.537, 'grad_norm': 20.26837730407715, 'learning_rate': 1.2716688050194427e-05, 'epoch': 2.0}
{'loss': 2.4733, 'grad_norm': 18.034738540649414, 'learning_rate': 1.2713557865571231e-05, 'epoch': 2.0}
{'loss': 2.3513, 'grad_norm': 17.258319854736328, 'learning_rate': 1.2710427680948035e-05, 'epoch': 2.0}
{'loss': 2.2827, 'grad_norm': 13.551057815551758, 'learning_rate': 1.270729749632484e-05, 'epoch': 2.0}
{'loss': 2.64, 'grad_norm': 15.477913856506348, 'learning_rate': 1.2704167311701644e-05, 'epoch': 2.0}
{'loss': 2.6529, 'grad_norm': 23.35008430480957, 'learning_rate': 1.2701037127078448e-05, 'epoch': 2.0}
{'loss': 2.5183, 'grad_norm': 16.71102523803711, 'learning_rate': 1.2697906942455253e-05, 'epoch': 2.0}
{'loss': 2.4182, 'grad_no

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7664861679077148, 'eval_runtime': 26.0487, 'eval_samples_per_second': 1919.479, 'eval_steps_per_second': 119.967, 'epoch': 3.0}
{'loss': 2.5167, 'grad_norm': 26.120285034179688, 'learning_rate': 6.3646043943436045e-06, 'epoch': 3.0}
{'loss': 2.0994, 'grad_norm': 24.692306518554688, 'learning_rate': 6.361474209720409e-06, 'epoch': 3.0}
{'loss': 2.2528, 'grad_norm': 24.23455047607422, 'learning_rate': 6.358344025097214e-06, 'epoch': 3.0}
{'loss': 2.2338, 'grad_norm': 23.79096794128418, 'learning_rate': 6.355213840474017e-06, 'epoch': 3.0}
{'loss': 2.1664, 'grad_norm': 25.39236831665039, 'learning_rate': 6.352083655850822e-06, 'epoch': 3.0}
{'loss': 2.0069, 'grad_norm': 23.01955795288086, 'learning_rate': 6.3489534712276265e-06, 'epoch': 3.0}
{'loss': 2.2205, 'grad_norm': 17.304744720458984, 'learning_rate': 6.34582328660443e-06, 'epoch': 3.0}
{'loss': 2.3343, 'grad_norm': 32.650081634521484, 'learning_rate': 6.342693101981236e-06, 'epoch': 3.0}
{'loss': 1.976, 'grad_norm'

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.8196930289268494, 'eval_runtime': 26.0538, 'eval_samples_per_second': 1919.108, 'eval_steps_per_second': 119.944, 'epoch': 4.0}


[I 2024-12-18 08:12:59,935] Trial 4 finished with value: 0.8196930289268494 and parameters: {'learning_rate': 2.5432124026539574e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.05829680099089378}. Best is trial 0 with value: 0.7349283695220947.


{'train_runtime': 6294.9242, 'train_samples_per_second': 413.031, 'train_steps_per_second': 12.907, 'train_loss': 2.6820999547786344, 'epoch': 4.0}
Best Hyperparameters:
{'learning_rate': 2.7453151590160654e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.026147972761883034}
