In [5]:
from datasets import Dataset
import pandas as pd

# Load CSV files into pandas DataFrames

train_df = pd.read_csv('tokenized_train.csv')
test_df = pd.read_csv('tokenized_test.csv')

# Convert DataFrames to datasets

tokenized_train_dataset = Dataset.from_pandas(train_df)
tokenized_test_dataset = Dataset.from_pandas(test_df)

# Print sample tokenized data

print("Sample tokenized text:", tokenized_train_dataset[0])

Sample tokenized text: {'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.', 'labels': 7, 'label_text': 'rec.autos', 'cleaned_text': 'I was wondering if anyone out there could enlighten me on this car I saw the other day It was a door sports car looked to be from the late s early s It was called a Bricklin The doors were really small In addition the front bumper was separate from the rest of the body This is all I know If anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky

In [22]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import datasets

# Reload tokenized datasets from %store

%store -r tokenized_train_dataset
%store -r tokenized_test_dataset

# Define the model

model_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=20)  # Adjust num_labels as per your dataset

# Training arguments (default)

training_args_default = TrainingArguments(
    output_dir='./results_default',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs_default',
)

# Define Trainer

trainer_default = Trainer(
    model=model,
    args=training_args_default,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Start training

trainer_default.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2124 [00:00<?, ?it/s]

{'loss': 1.8406, 'grad_norm': 9.553842544555664, 'learning_rate': 1.529190207156309e-05, 'epoch': 0.71}


  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 1.199527621269226, 'eval_runtime': 22.3077, 'eval_samples_per_second': 337.641, 'eval_steps_per_second': 21.114, 'epoch': 1.0}
{'loss': 1.0603, 'grad_norm': 7.716678142547607, 'learning_rate': 1.0583804143126177e-05, 'epoch': 1.41}


  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 1.105939269065857, 'eval_runtime': 26.6128, 'eval_samples_per_second': 283.021, 'eval_steps_per_second': 17.698, 'epoch': 2.0}
{'loss': 0.8698, 'grad_norm': 9.898701667785645, 'learning_rate': 5.8757062146892665e-06, 'epoch': 2.12}
{'loss': 0.7128, 'grad_norm': 8.077910423278809, 'learning_rate': 1.167608286252354e-06, 'epoch': 2.82}


  0%|          | 0/471 [00:00<?, ?it/s]

{'eval_loss': 1.078869104385376, 'eval_runtime': 26.6869, 'eval_samples_per_second': 282.236, 'eval_steps_per_second': 17.649, 'epoch': 3.0}
{'train_runtime': 427.4488, 'train_samples_per_second': 79.406, 'train_steps_per_second': 4.969, 'train_loss': 1.096146456041354, 'epoch': 3.0}


TrainOutput(global_step=2124, training_loss=1.096146456041354, metrics={'train_runtime': 427.4488, 'train_samples_per_second': 79.406, 'train_steps_per_second': 4.969, 'total_flos': 1124412937021440.0, 'train_loss': 1.096146456041354, 'epoch': 3.0})

In [23]:
# Optimized training arguments

training_args_optimized = TrainingArguments(
    output_dir='./results_optimized',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs_optimized',
)

# Define Trainer for optimized training

trainer_optimized = Trainer(
    model=model,
    args=training_args_optimized,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Start optimized training

trainer_optimized.train()




  0%|          | 0/1770 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

{'eval_loss': 1.1034184694290161, 'eval_runtime': 26.345, 'eval_samples_per_second': 285.899, 'eval_steps_per_second': 8.958, 'epoch': 1.0}
{'loss': 0.6438, 'grad_norm': 7.300913333892822, 'learning_rate': 3.587570621468927e-05, 'epoch': 1.41}


  0%|          | 0/236 [00:00<?, ?it/s]

{'eval_loss': 1.1710067987442017, 'eval_runtime': 31.0844, 'eval_samples_per_second': 242.308, 'eval_steps_per_second': 7.592, 'epoch': 2.0}
{'loss': 0.3667, 'grad_norm': 8.049430847167969, 'learning_rate': 2.175141242937853e-05, 'epoch': 2.82}


  0%|          | 0/236 [00:00<?, ?it/s]

{'eval_loss': 1.294228434562683, 'eval_runtime': 33.5971, 'eval_samples_per_second': 224.186, 'eval_steps_per_second': 7.024, 'epoch': 3.0}


  0%|          | 0/236 [00:00<?, ?it/s]

{'eval_loss': 1.3508108854293823, 'eval_runtime': 33.3228, 'eval_samples_per_second': 226.031, 'eval_steps_per_second': 7.082, 'epoch': 4.0}
{'loss': 0.1874, 'grad_norm': 6.068109512329102, 'learning_rate': 7.627118644067798e-06, 'epoch': 4.24}


  0%|          | 0/236 [00:00<?, ?it/s]

{'eval_loss': 1.410793423652649, 'eval_runtime': 33.6701, 'eval_samples_per_second': 223.7, 'eval_steps_per_second': 7.009, 'epoch': 5.0}
{'train_runtime': 790.442, 'train_samples_per_second': 71.568, 'train_steps_per_second': 2.239, 'train_loss': 0.35757455556406137, 'epoch': 5.0}


TrainOutput(global_step=1770, training_loss=0.35757455556406137, metrics={'train_runtime': 790.442, 'train_samples_per_second': 71.568, 'train_steps_per_second': 2.239, 'total_flos': 1874021561702400.0, 'train_loss': 0.35757455556406137, 'epoch': 5.0})

My defualt training did decent, but the optimized training did better as seen in the metrics such as training loss which was lower. Although the speed was faster for default, the less loss for optimized training alone makes up for and surpasses the default! I used the specific metrics i used because for one, accuarcy is a straightforward indicator of overall preformance. Two, f1 score is good for real world situations where there may be some imbalance with the datasets.