In [13]:
import pandas as pd

# Locations of the labelled splits (relative paths, resolved against the
# working directory the notebook is run from).
TRAIN_CSV = 'data/training_data.csv'
TEST_CSV = 'data/test_data.csv'

# Read each split into its own DataFrame.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [16]:
from datasets import Dataset
from transformers import AutoTokenizer

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Passed to ``Dataset.map(..., batched=True)`` in this cell. Relies on the
    module-level ``tokenizer`` being defined before the function is called.

    Args:
        examples: Mapping-like object exposing a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Training split: cast labels to int, wrap the frame as a Dataset, then
# tokenize it in batches.
train_df['label'] = train_df['label'].astype(int)
train_dataset = Dataset.from_pandas(train_df)
tokenized_train = train_dataset.map(tokenize_function, batched=True)

# Test split: exactly the same three steps.
test_df['label'] = test_df['label'].astype(int)
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorWithPadding

# Collator later passed to the Trainer as `data_collator`. tokenize_function
# does not request padding, so padding is left to this collator
# (per the transformers docs it pads each batch when the batch is assembled).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the same numeric result as the default but avoids
    # an UndefinedMetricWarning when a class receives no predictions, which is
    # easy to hit on a very small evaluation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
# Class index -> display name, in index order.
id2label = dict(enumerate(("World", "SPORTS", "BUSINESS", "SCI/TECH")))
# Inverse lookup, derived from id2label so the two tables always agree.
label2id = {name: idx for idx, name in id2label.items()}

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained DistilBERT checkpoint with a sequence-classification head.
# num_labels is taken from id2label so the head size cannot drift from the
# label map if classes are added or removed.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best "f1" when training finishes.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # `eval_strategy` is the current argument name; `evaluation_strategy` is
    # the deprecated spelling and is slated for removal.
    eval_strategy="epoch",
    # Kept equal to eval_strategy, which load_best_model_at_end requires.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Refers to the "f1" key returned by compute_metrics.
    metric_for_best_model="f1",
)

# Wire the model, tokenized datasets, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,  # tokenized splits, not the raw Datasets
    eval_dataset=tokenized_test,
    # `processing_class` is the current argument name; passing `tokenizer=`
    # raises a deprecation FutureWarning at this call in recent releases.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


  0%|          | 0/15000 [00:00<?, ?it/s]

{'loss': 0.4237, 'grad_norm': 2.833117961883545, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.07}
{'loss': 0.2688, 'grad_norm': 3.2332396507263184, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.13}
{'loss': 0.2388, 'grad_norm': 23.668598175048828, 'learning_rate': 1.8e-05, 'epoch': 0.2}
{'loss': 0.2265, 'grad_norm': 6.600277423858643, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.27}
{'loss': 0.2309, 'grad_norm': 0.37740713357925415, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.33}
{'loss': 0.2384, 'grad_norm': 9.076156616210938, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'loss': 0.2075, 'grad_norm': 7.358345985412598, 'learning_rate': 1.5333333333333334e-05, 'epoch': 0.47}
{'loss': 0.2162, 'grad_norm': 8.738639831542969, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.53}
{'loss': 0.2081, 'grad_norm': 13.335493087768555, 'learning_rate': 1.4e-05, 'epoch': 0.6}
{'loss': 0.1935, 'grad_norm': 7.335050106048584, 'learning_rate': 1.3333333333333333

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.2775806486606598, 'eval_accuracy': 0.85, 'eval_f1': 0.8494949494949495, 'eval_precision': 0.8583333333333334, 'eval_recall': 0.85, 'eval_runtime': 0.5731, 'eval_samples_per_second': 34.896, 'eval_steps_per_second': 3.49, 'epoch': 1.0}
{'loss': 0.1446, 'grad_norm': 5.626241207122803, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.07}
{'loss': 0.145, 'grad_norm': 6.456864833831787, 'learning_rate': 8.666666666666668e-06, 'epoch': 1.13}
{'loss': 0.1349, 'grad_norm': 10.182358741760254, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.2}
{'loss': 0.1394, 'grad_norm': 0.04425851255655289, 'learning_rate': 7.333333333333333e-06, 'epoch': 1.27}
{'loss': 0.1347, 'grad_norm': 2.49580979347229, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.33}
{'loss': 0.1375, 'grad_norm': 4.438701629638672, 'learning_rate': 6e-06, 'epoch': 1.4}
{'loss': 0.1453, 'grad_norm': 5.244363784790039, 'learning_rate': 5.333333333333334e-06, 'epoch': 1.47}
{'loss': 0.1234, 'grad_norm': 6.197744

  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.7610310912132263, 'eval_accuracy': 0.75, 'eval_f1': 0.7443181818181819, 'eval_precision': 0.7827380952380952, 'eval_recall': 0.75, 'eval_runtime': 0.1849, 'eval_samples_per_second': 108.157, 'eval_steps_per_second': 10.816, 'epoch': 2.0}
{'train_runtime': 4771.0294, 'train_samples_per_second': 50.304, 'train_steps_per_second': 3.144, 'train_loss': 0.1823235850016276, 'epoch': 2.0}


TrainOutput(global_step=15000, training_loss=0.1823235850016276, metrics={'train_runtime': 4771.0294, 'train_samples_per_second': 50.304, 'train_steps_per_second': 3.144, 'total_flos': 7948327403520000.0, 'train_loss': 0.1823235850016276, 'epoch': 2.0})

In [None]:
# Metrics copied from the training run above, kept for reference.
# The pasted text was not valid Python (a bare `(global_step=..., ...)` is a
# SyntaxError), so the same values are stored as plain dicts instead.

# Evaluation metrics logged at the end of epoch 2.
recorded_eval_metrics = {'eval_loss': 0.7610310912132263, 'eval_accuracy': 0.75, 'eval_f1': 0.7443181818181819, 'eval_precision': 0.7827380952380952, 'eval_recall': 0.75, 'eval_runtime': 0.1849, 'eval_samples_per_second': 108.157, 'eval_steps_per_second': 10.816, 'epoch': 2.0}

# Training summary logged when the run finished.
recorded_train_metrics = {'train_runtime': 4771.0294, 'train_samples_per_second': 50.304, 'train_steps_per_second': 3.144, 'train_loss': 0.1823235850016276, 'epoch': 2.0}

# Fields of the TrainOutput returned by trainer.train().
recorded_train_output = dict(global_step=15000, training_loss=0.1823235850016276, metrics={'train_runtime': 4771.0294, 'train_samples_per_second': 50.304, 'train_steps_per_second': 3.144, 'total_flos': 7948327403520000.0, 'train_loss': 0.1823235850016276, 'epoch': 2.0})
recorded_train_output