In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

df = pd.read_csv("data.csv")

full_ds = Dataset.from_pandas(df)
ds = full_ds.train_test_split(test_size=0.2, seed=42)

def normalize_label(example):
    val = example["label"]
    val = "" if val is None else str(val)
    example["label"] = val.strip().lower()
    return example

ds = ds.map(normalize_label)

all_labels = set(ds["train"]["label"]) | set(ds["test"]["label"])
label_names = sorted(all_labels) 
label2id = {name: i for i, name in enumerate(label_names)}
id2label = {i: name for name, i in label2id.items()}

def encode_labels(example):
    lbl = example["label"]
    if lbl not in label2id:
        raise ValueError(f"Unknown label after normalization: {lbl}")
    example["label"] = label2id[lbl]
    return example

ds = ds.map(encode_labels)

from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

# 🔹 FIXED: use `ds` instead of undefined `dataset`
ds = ds.map(tokenize, batched=True)
ds = ds.remove_columns(["text"]) 
ds.set_format("torch")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 112/112 [00:00<00:00, 11374.66 examples/s]
Map: 100%|██████████| 29/29 [00:00<00:00, 2801.94 examples/s]
Map: 100%|██████████| 112/112 [00:00<00:00, 36899.07 examples/s]
Map: 100%|██████████| 29/29 [00:00<00:00, 2566.03 examples/s]
Map: 100%|██████████| 112/112 [00:00<00:00, 5513.32 examples/s]
Map: 100%|██████████| 29/29 [00:00<00:00, 2654.68 examples/s]


In [2]:
from transformers import AutoModelForSequenceClassification

num_labels = len(label_names) 
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }


training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],   
    eval_dataset=ds["test"],     
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss




TrainOutput(global_step=42, training_loss=0.6024882452828544, metrics={'train_runtime': 429.8819, 'train_samples_per_second': 0.782, 'train_steps_per_second': 0.098, 'total_flos': 44509045948416.0, 'train_loss': 0.6024882452828544, 'epoch': 3.0})

In [4]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model,
tokenizer=tokenizer)
print(classifier("This product is amazing!"))
print(classifier("I will never buy this again."))

Device set to use cpu


[{'label': 'positive', 'score': 0.6681368350982666}]
[{'label': 'negative', 'score': 0.5593797564506531}]
