In [22]:
!pip uninstall -y torch torchvision torchaudio transformers datasets accelerate -q
!pip install --no-cache-dir \
    torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 \
    --index-url https://download.pytorch.org/whl/cpu -q
!pip install --no-cache-dir \
    transformers==4.40.2 datasets==2.19.0 accelerate==0.30.1 \
    scikit-learn pandas joblib -q
print("Installation done – **RESTART KERNEL NOW**")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Installation done – **RESTART KERNEL NOW**


In [23]:
import torch
# Use the Metal (MPS) backend when this PyTorch build reports it as available,
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"PyTorch {torch.__version__} – Using {device}")

PyTorch 2.3.0 – Using mps


In [24]:
import pandas as pd
# Load the labelled SMS corpus and normalise the two column names that the
# rest of the notebook relies on ('label' and 'text').
df = pd.read_csv("all_messages_clean.csv").rename(
    columns={'Label': 'label', 'Text': 'text'}
)

# Lower-case the messages, then strip every character that is not a-z, 0-9,
# whitespace, '%' or '₹'.
lowered_text = df['text'].str.lower()
df['text'] = lowered_text.str.replace(r'[^a-z0-9\s%₹]', '', regex=True)

# Report corpus size and the class distribution.
n_messages = len(df)
label_counts = df['label'].value_counts()
print(f"Loaded {n_messages} SMS")
print(label_counts)

Loaded 470 SMS
label
Telecom          152
Marketing        114
Banking           70
Shopping          55
Informational     38
Real Estate       29
Spam              12
Name: count, dtype: int64


In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages, stratified on the label column so each class
# keeps its share in both parts; the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, stratify=df['label'], random_state=42)
train_df, test_df = train_test_split(df, **split_options)
print(f"Train: {len(train_df)} | Test: {len(test_df)}")

Train: 376 | Test: 94


In [26]:
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` entries of a batch for DistilBERT.

    Sequences longer than 128 tokens are truncated. ``padding=True`` pads to
    the longest sequence of the batch handed to this call, i.e. per ``map``
    batch rather than to one global length.

    Args:
        batch: Mapping with a ``"text"`` entry holding the messages.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=128)
    return encoded


# Convert both splits to Hugging Face datasets and tokenize them in batches.
train_ds, test_ds = (
    Dataset.from_pandas(frame).map(tokenize, batched=True)
    for frame in (train_df, test_df)
)

Map: 100%|██████████| 376/376 [00:00<00:00, 9185.55 examples/s]
Map: 100%|██████████| 94/94 [00:00<00:00, 13371.70 examples/s]


In [27]:
# Build a label <-> integer id mapping from the alphabetically sorted class
# names found in the full DataFrame.
labels = sorted(df['label'].unique())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}


def _encode_label(row):
    """Return the integer class id of one dataset row under the ``labels`` key."""
    return {"labels": label2id[row["label"]]}


# Add the integer ``labels`` column, then drop the original string column.
train_ds = train_ds.map(_encode_label).remove_columns(["label"])
test_ds = test_ds.map(_encode_label).remove_columns(["label"])
print("Labels → integers, string column removed")

Map: 100%|██████████| 376/376 [00:00<00:00, 19048.67 examples/s]
Map: 100%|██████████| 94/94 [00:00<00:00, 18939.55 examples/s]

Labels → integers, string column removed





In [28]:
from transformers import AutoModelForSequenceClassification

# Load pretrained DistilBERT with a sequence-classification head sized to the
# label set, store the label mappings in its config, and move it to `device`.
classifier_config = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", **classifier_config
)
model = model.to(device)

print("Model loaded on", device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on mps


In [44]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Fine-tuning configuration.
# Evaluation and checkpointing both run once per epoch so that
# load_best_model_at_end can restore the checkpoint with the best eval F1.
args = TrainingArguments(
    output_dir="bert_mps",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Cap the checkpoints kept on disk; without a limit every one of the 20
    # per-epoch checkpoints is retained. The best checkpoint is always kept
    # when load_best_model_at_end is enabled.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    push_to_hub=False,
    report_to=[],  # no external experiment trackers
    logging_steps=5,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 samples per optimizer step
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores; the
            argmax over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference class ids).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = eval_pred.label_ids
    y_pred = eval_pred.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Wire model, data and metrics into a Trainer.
# NOTE(review): eval_dataset is the held-out test split, so the "best"
# checkpoint restored by load_best_model_at_end is selected on test data;
# consider a separate validation split for model selection.
# NOTE(review): `model` is trained in place. Re-running this cell after a
# training run wraps the already fine-tuned weights; re-run the model-loading
# cell first when a fresh run is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # Passing the tokenizer makes the Trainer pad each batch dynamically
    # (so batching no longer depends on every row being pre-padded to the same
    # length) and saves the tokenizer alongside each checkpoint.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [46]:
# Report the device the Trainer is configured to run on instead of assuming
# MPS (device selection earlier in the notebook can fall back to CPU), then
# fine-tune. The TrainOutput is kept in a variable and displayed as the cell result.
print(f"Training on {trainer.args.device}...")
train_output = trainer.train()
train_output

Training on MPS GPU...


 82%|████████▏ | 376/460 [14:21<03:12,  2.29s/it]

[A                                            

{'loss': 0.0133, 'grad_norm': 1.8950341939926147, 'learning_rate': 1.9782608695652176e-05, 'epoch': 0.21}



[A                                             

{'loss': 0.1053, 'grad_norm': 0.02738572657108307, 'learning_rate': 1.956521739130435e-05, 'epoch': 0.43}



[A                                             

{'loss': 0.054, 'grad_norm': 0.037173833698034286, 'learning_rate': 1.9347826086956523e-05, 'epoch': 0.64}



[A                                             

{'loss': 0.0465, 'grad_norm': 4.199322700500488, 'learning_rate': 1.9130434782608697e-05, 'epoch': 0.85}


 92%|█████████▏| 11/12 [00:00<00:00, 17.77it/s]
                                                

{'eval_loss': 1.2084136009216309, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7900319538617411, 'eval_runtime': 0.7207, 'eval_samples_per_second': 130.429, 'eval_steps_per_second': 16.651, 'epoch': 0.98}



[A                                             

{'loss': 0.0327, 'grad_norm': 0.09200073778629303, 'learning_rate': 1.891304347826087e-05, 'epoch': 1.06}



[A                                             

{'loss': 0.0268, 'grad_norm': 0.07568559050559998, 'learning_rate': 1.8695652173913045e-05, 'epoch': 1.28}



[A                                             

{'loss': 0.0223, 'grad_norm': 0.1139318197965622, 'learning_rate': 1.847826086956522e-05, 'epoch': 1.49}



[A                                             

{'loss': 0.0424, 'grad_norm': 0.4181343913078308, 'learning_rate': 1.8260869565217393e-05, 'epoch': 1.7}



[A                                             

{'loss': 0.127, 'grad_norm': 0.6220680475234985, 'learning_rate': 1.8043478260869567e-05, 'epoch': 1.91}


 92%|█████████▏| 11/12 [00:00<00:00, 18.20it/s]
                                                

{'eval_loss': 1.1838349103927612, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8020979020979021, 'eval_runtime': 0.721, 'eval_samples_per_second': 130.383, 'eval_steps_per_second': 16.645, 'epoch': 2.0}



[A                                             

{'loss': 0.0082, 'grad_norm': 0.1383727639913559, 'learning_rate': 1.782608695652174e-05, 'epoch': 2.13}



[A                                             

{'loss': 0.0246, 'grad_norm': 0.06182905659079552, 'learning_rate': 1.7608695652173915e-05, 'epoch': 2.34}



[A                                             

{'loss': 0.0871, 'grad_norm': 2.752577543258667, 'learning_rate': 1.739130434782609e-05, 'epoch': 2.55}



[A                                             

{'loss': 0.0192, 'grad_norm': 2.144390821456909, 'learning_rate': 1.7173913043478263e-05, 'epoch': 2.77}



[A                                             

{'loss': 0.0923, 'grad_norm': 5.041346073150635, 'learning_rate': 1.6956521739130437e-05, 'epoch': 2.98}


 92%|█████████▏| 11/12 [00:00<00:00, 18.38it/s]
                                                

{'eval_loss': 1.2417266368865967, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8020979020979021, 'eval_runtime': 0.6935, 'eval_samples_per_second': 135.549, 'eval_steps_per_second': 17.304, 'epoch': 2.98}



[A                                             

{'loss': 0.0765, 'grad_norm': 1.9515950679779053, 'learning_rate': 1.673913043478261e-05, 'epoch': 3.19}



[A                                             

{'loss': 0.0169, 'grad_norm': 1.673852562904358, 'learning_rate': 1.6521739130434785e-05, 'epoch': 3.4}



[A                                             

{'loss': 0.1481, 'grad_norm': 2.2543225288391113, 'learning_rate': 1.630434782608696e-05, 'epoch': 3.62}



[A                                             

{'loss': 0.038, 'grad_norm': 0.10696201771497726, 'learning_rate': 1.6086956521739132e-05, 'epoch': 3.83}


 92%|█████████▏| 11/12 [00:00<00:00, 17.88it/s]
                                                

{'eval_loss': 1.211171269416809, 'eval_accuracy': 0.776595744680851, 'eval_f1': 0.7758938042054034, 'eval_runtime': 0.7258, 'eval_samples_per_second': 129.513, 'eval_steps_per_second': 16.534, 'epoch': 4.0}



[A                                             

{'loss': 0.0131, 'grad_norm': 0.3956656754016876, 'learning_rate': 1.5869565217391306e-05, 'epoch': 4.04}



[A                                              

{'loss': 0.038, 'grad_norm': 0.06290310621261597, 'learning_rate': 1.565217391304348e-05, 'epoch': 4.26}



[A                                              

{'loss': 0.0906, 'grad_norm': 0.21180850267410278, 'learning_rate': 1.5434782608695654e-05, 'epoch': 4.47}



[A                                              

{'loss': 0.0633, 'grad_norm': 1.0785249471664429, 'learning_rate': 1.5217391304347828e-05, 'epoch': 4.68}



[A                                              

{'loss': 0.0161, 'grad_norm': 0.933840811252594, 'learning_rate': 1.5000000000000002e-05, 'epoch': 4.89}


 92%|█████████▏| 11/12 [00:00<00:00, 18.19it/s]
                                                 

{'eval_loss': 1.2621867656707764, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7900319538617411, 'eval_runtime': 0.6874, 'eval_samples_per_second': 136.754, 'eval_steps_per_second': 17.458, 'epoch': 4.98}



[A                                              

{'loss': 0.0218, 'grad_norm': 0.014811613596975803, 'learning_rate': 1.4782608695652174e-05, 'epoch': 5.11}



[A                                              

{'loss': 0.0354, 'grad_norm': 1.7470996379852295, 'learning_rate': 1.456521739130435e-05, 'epoch': 5.32}



[A                                              

{'loss': 0.0803, 'grad_norm': 1.9631949663162231, 'learning_rate': 1.4347826086956522e-05, 'epoch': 5.53}



[A                                              

{'loss': 0.0548, 'grad_norm': 3.667917490005493, 'learning_rate': 1.4130434782608698e-05, 'epoch': 5.74}



[A                                              

{'loss': 0.0295, 'grad_norm': 3.7607266902923584, 'learning_rate': 1.391304347826087e-05, 'epoch': 5.96}


 92%|█████████▏| 11/12 [00:00<00:00, 18.01it/s]
                                                 

{'eval_loss': 1.1803265810012817, 'eval_accuracy': 0.776595744680851, 'eval_f1': 0.7797694532593419, 'eval_runtime': 0.7272, 'eval_samples_per_second': 129.258, 'eval_steps_per_second': 16.501, 'epoch': 6.0}



[A                                              

{'loss': 0.0342, 'grad_norm': 0.49480462074279785, 'learning_rate': 1.3695652173913046e-05, 'epoch': 6.17}



[A                                              

{'loss': 0.0116, 'grad_norm': 0.6878849864006042, 'learning_rate': 1.3478260869565218e-05, 'epoch': 6.38}



[A                                              

{'loss': 0.0319, 'grad_norm': 1.824607253074646, 'learning_rate': 1.3260869565217392e-05, 'epoch': 6.6}



[A                                              

{'loss': 0.0467, 'grad_norm': 0.07373607903718948, 'learning_rate': 1.3043478260869566e-05, 'epoch': 6.81}


 92%|█████████▏| 11/12 [00:00<00:00, 18.35it/s]
                                                 

{'eval_loss': 1.1941026449203491, 'eval_accuracy': 0.776595744680851, 'eval_f1': 0.7771628184320775, 'eval_runtime': 0.6843, 'eval_samples_per_second': 137.375, 'eval_steps_per_second': 17.537, 'epoch': 6.98}



[A                                              

{'loss': 0.109, 'grad_norm': 3.3807883262634277, 'learning_rate': 1.282608695652174e-05, 'epoch': 7.02}



[A                                              

{'loss': 0.0887, 'grad_norm': 2.8297476768493652, 'learning_rate': 1.2608695652173915e-05, 'epoch': 7.23}



[A                                              

{'loss': 0.0136, 'grad_norm': 0.035588592290878296, 'learning_rate': 1.2391304347826088e-05, 'epoch': 7.45}



[A                                              

{'loss': 0.0118, 'grad_norm': 0.020507603883743286, 'learning_rate': 1.2173913043478263e-05, 'epoch': 7.66}



[A                                              

{'loss': 0.0688, 'grad_norm': 1.7588704824447632, 'learning_rate': 1.1956521739130435e-05, 'epoch': 7.87}


 92%|█████████▏| 11/12 [00:00<00:00, 18.39it/s]
                                                 

{'eval_loss': 1.2189505100250244, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7915386046778736, 'eval_runtime': 0.7091, 'eval_samples_per_second': 132.56, 'eval_steps_per_second': 16.923, 'epoch': 8.0}



[A                                              

{'loss': 0.0096, 'grad_norm': 0.44269129633903503, 'learning_rate': 1.1739130434782611e-05, 'epoch': 8.09}



[A                                              

{'loss': 0.1099, 'grad_norm': 2.5349271297454834, 'learning_rate': 1.1521739130434783e-05, 'epoch': 8.3}



[A                                              

{'loss': 0.0121, 'grad_norm': 0.5304360389709473, 'learning_rate': 1.1304347826086957e-05, 'epoch': 8.51}



[A                                              

{'loss': 0.0122, 'grad_norm': 0.02957838401198387, 'learning_rate': 1.1086956521739131e-05, 'epoch': 8.72}



[A                                              

{'loss': 0.057, 'grad_norm': 0.5521748065948486, 'learning_rate': 1.0869565217391305e-05, 'epoch': 8.94}


 92%|█████████▏| 11/12 [00:00<00:00, 18.28it/s]
                                                 

{'eval_loss': 1.2915974855422974, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 0.6941, 'eval_samples_per_second': 135.419, 'eval_steps_per_second': 17.288, 'epoch': 8.98}



[A                                              

{'loss': 0.0143, 'grad_norm': 0.01923825778067112, 'learning_rate': 1.0652173913043479e-05, 'epoch': 9.15}



[A                                              

{'loss': 0.0211, 'grad_norm': 0.01819126307964325, 'learning_rate': 1.0434782608695653e-05, 'epoch': 9.36}



[A                                              

{'loss': 0.0082, 'grad_norm': 0.2501521110534668, 'learning_rate': 1.0217391304347829e-05, 'epoch': 9.57}



[A                                              

{'loss': 0.0317, 'grad_norm': 0.23620647192001343, 'learning_rate': 1e-05, 'epoch': 9.79}



[A                                              

{'loss': 0.1255, 'grad_norm': 0.5519219636917114, 'learning_rate': 9.782608695652175e-06, 'epoch': 10.0}


 92%|█████████▏| 11/12 [00:00<00:00, 18.12it/s]
                                                 

{'eval_loss': 1.243825912475586, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7894343121106839, 'eval_runtime': 0.6933, 'eval_samples_per_second': 135.583, 'eval_steps_per_second': 17.308, 'epoch': 10.0}



[A                                              

{'loss': 0.06, 'grad_norm': 0.025052735581994057, 'learning_rate': 9.565217391304349e-06, 'epoch': 10.21}



[A                                              

{'loss': 0.0426, 'grad_norm': 0.14808537065982819, 'learning_rate': 9.347826086956523e-06, 'epoch': 10.43}



[A                                              

{'loss': 0.0172, 'grad_norm': 1.3231894969940186, 'learning_rate': 9.130434782608697e-06, 'epoch': 10.64}



[A                                              

{'loss': 0.053, 'grad_norm': 0.03873186558485031, 'learning_rate': 8.91304347826087e-06, 'epoch': 10.85}


 92%|█████████▏| 11/12 [00:00<00:00, 18.33it/s]
                                                 

{'eval_loss': 1.3192940950393677, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7900319538617411, 'eval_runtime': 0.687, 'eval_samples_per_second': 136.834, 'eval_steps_per_second': 17.468, 'epoch': 10.98}



[A                                              

{'loss': 0.0073, 'grad_norm': 0.43264681100845337, 'learning_rate': 8.695652173913044e-06, 'epoch': 11.06}



[A                                              

{'loss': 0.0389, 'grad_norm': 1.6371560096740723, 'learning_rate': 8.478260869565218e-06, 'epoch': 11.28}



[A                                              

{'loss': 0.0295, 'grad_norm': 0.06298811733722687, 'learning_rate': 8.260869565217392e-06, 'epoch': 11.49}



[A                                              

{'loss': 0.0359, 'grad_norm': 0.021874254569411278, 'learning_rate': 8.043478260869566e-06, 'epoch': 11.7}



[A                                              

{'loss': 0.0642, 'grad_norm': 0.5781641006469727, 'learning_rate': 7.82608695652174e-06, 'epoch': 11.91}


 92%|█████████▏| 11/12 [00:00<00:00, 18.19it/s]
                                                 

{'eval_loss': 1.3407490253448486, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 0.7178, 'eval_samples_per_second': 130.95, 'eval_steps_per_second': 16.717, 'epoch': 12.0}



[A                                              

{'loss': 0.0334, 'grad_norm': 0.010663563385605812, 'learning_rate': 7.608695652173914e-06, 'epoch': 12.13}



[A                                              

{'loss': 0.0297, 'grad_norm': 2.2364916801452637, 'learning_rate': 7.391304347826087e-06, 'epoch': 12.34}



[A                                              

{'loss': 0.0368, 'grad_norm': 1.4042214155197144, 'learning_rate': 7.173913043478261e-06, 'epoch': 12.55}



[A                                              

{'loss': 0.0569, 'grad_norm': 0.541593074798584, 'learning_rate': 6.956521739130435e-06, 'epoch': 12.77}



[A                                              

{'loss': 0.0099, 'grad_norm': 0.5575778484344482, 'learning_rate': 6.739130434782609e-06, 'epoch': 12.98}


 92%|█████████▏| 11/12 [00:00<00:00, 18.36it/s]
                                                 

{'eval_loss': 1.3140313625335693, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 0.6871, 'eval_samples_per_second': 136.801, 'eval_steps_per_second': 17.464, 'epoch': 12.98}



[A                                              

{'loss': 0.0255, 'grad_norm': 0.03404785692691803, 'learning_rate': 6.521739130434783e-06, 'epoch': 13.19}



[A                                              

{'loss': 0.0508, 'grad_norm': 0.16661624610424042, 'learning_rate': 6.304347826086958e-06, 'epoch': 13.4}



[A                                              

{'loss': 0.0402, 'grad_norm': 0.029498446732759476, 'learning_rate': 6.086956521739132e-06, 'epoch': 13.62}



[A                                              

{'loss': 0.0065, 'grad_norm': 0.02218419872224331, 'learning_rate': 5.8695652173913055e-06, 'epoch': 13.83}


 92%|█████████▏| 11/12 [00:00<00:00, 17.76it/s]
                                                 

{'eval_loss': 1.2927534580230713, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7894343121106839, 'eval_runtime': 0.7253, 'eval_samples_per_second': 129.602, 'eval_steps_per_second': 16.545, 'epoch': 14.0}



[A                                              

{'loss': 0.0715, 'grad_norm': 1.6817858219146729, 'learning_rate': 5.652173913043479e-06, 'epoch': 14.04}



[A                                              

{'loss': 0.0873, 'grad_norm': 1.9300912618637085, 'learning_rate': 5.4347826086956525e-06, 'epoch': 14.26}



[A                                              

{'loss': 0.0052, 'grad_norm': 0.020677460357546806, 'learning_rate': 5.2173913043478265e-06, 'epoch': 14.47}



[A                                              

{'loss': 0.0339, 'grad_norm': 2.4068572521209717, 'learning_rate': 5e-06, 'epoch': 14.68}



[A                                              

{'loss': 0.0009, 'grad_norm': 0.01919175684452057, 'learning_rate': 4.782608695652174e-06, 'epoch': 14.89}


 92%|█████████▏| 11/12 [00:00<00:00, 18.02it/s]
                                                 

{'eval_loss': 1.314920425415039, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7907496572110233, 'eval_runtime': 0.703, 'eval_samples_per_second': 133.721, 'eval_steps_per_second': 17.071, 'epoch': 14.98}



[A                                              

{'loss': 0.0565, 'grad_norm': 0.01361989788711071, 'learning_rate': 4.565217391304348e-06, 'epoch': 15.11}



[A                                              

{'loss': 0.0512, 'grad_norm': 0.02231662906706333, 'learning_rate': 4.347826086956522e-06, 'epoch': 15.32}



[A                                              

{'loss': 0.0161, 'grad_norm': 0.03313300386071205, 'learning_rate': 4.130434782608696e-06, 'epoch': 15.53}



[A                                              

{'loss': 0.0302, 'grad_norm': 2.2497024536132812, 'learning_rate': 3.91304347826087e-06, 'epoch': 15.74}



[A                                              

{'loss': 0.0333, 'grad_norm': 0.009058643132448196, 'learning_rate': 3.6956521739130436e-06, 'epoch': 15.96}


 92%|█████████▏| 11/12 [00:00<00:00, 18.35it/s]
                                                 

{'eval_loss': 1.3349461555480957, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 0.7135, 'eval_samples_per_second': 131.736, 'eval_steps_per_second': 16.817, 'epoch': 16.0}



[A                                              

{'loss': 0.012, 'grad_norm': 0.02182244136929512, 'learning_rate': 3.4782608695652175e-06, 'epoch': 16.17}



[A                                              

{'loss': 0.0028, 'grad_norm': 0.016515417024493217, 'learning_rate': 3.2608695652173914e-06, 'epoch': 16.38}



[A                                              

{'loss': 0.0105, 'grad_norm': 0.18952324986457825, 'learning_rate': 3.043478260869566e-06, 'epoch': 16.6}



[A                                              

{'loss': 0.0736, 'grad_norm': 1.892579197883606, 'learning_rate': 2.8260869565217393e-06, 'epoch': 16.81}


100%|██████████| 12/12 [00:00<00:00, 14.32it/s]
                                                 

{'eval_loss': 1.3593395948410034, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 0.8937, 'eval_samples_per_second': 105.178, 'eval_steps_per_second': 13.427, 'epoch': 16.98}



[A                                              

{'loss': 0.0729, 'grad_norm': 0.47243815660476685, 'learning_rate': 2.6086956521739132e-06, 'epoch': 17.02}



[A                                              

{'loss': 0.0266, 'grad_norm': 1.5533534288406372, 'learning_rate': 2.391304347826087e-06, 'epoch': 17.23}



[A                                              

{'loss': 0.0361, 'grad_norm': 0.22680264711380005, 'learning_rate': 2.173913043478261e-06, 'epoch': 17.45}



[A                                              

{'loss': 0.0422, 'grad_norm': 2.6843502521514893, 'learning_rate': 1.956521739130435e-06, 'epoch': 17.66}



[A                                              

{'loss': 0.0409, 'grad_norm': 0.021669480949640274, 'learning_rate': 1.7391304347826088e-06, 'epoch': 17.87}


 92%|█████████▏| 11/12 [00:00<00:00, 10.52it/s]
                                                 

{'eval_loss': 1.3492560386657715, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 1.1954, 'eval_samples_per_second': 78.633, 'eval_steps_per_second': 10.038, 'epoch': 18.0}



[A                                              

{'loss': 0.0161, 'grad_norm': 0.016092265024781227, 'learning_rate': 1.521739130434783e-06, 'epoch': 18.09}



[A                                              

{'loss': 0.0404, 'grad_norm': 0.29064345359802246, 'learning_rate': 1.3043478260869566e-06, 'epoch': 18.3}



[A                                              

{'loss': 0.0038, 'grad_norm': 0.017893295735120773, 'learning_rate': 1.0869565217391306e-06, 'epoch': 18.51}



[A                                              

{'loss': 0.1078, 'grad_norm': 0.013097778894007206, 'learning_rate': 8.695652173913044e-07, 'epoch': 18.72}



[A                                              

{'loss': 0.0133, 'grad_norm': 0.5807345509529114, 'learning_rate': 6.521739130434783e-07, 'epoch': 18.94}


100%|██████████| 12/12 [00:01<00:00,  8.32it/s]
                                                 

{'eval_loss': 1.3528090715408325, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 1.7761, 'eval_samples_per_second': 52.925, 'eval_steps_per_second': 6.756, 'epoch': 18.98}



[A                                              

{'loss': 0.0141, 'grad_norm': 0.009959698654711246, 'learning_rate': 4.347826086956522e-07, 'epoch': 19.15}



[A                                              

{'loss': 0.0387, 'grad_norm': 1.775541067123413, 'learning_rate': 2.173913043478261e-07, 'epoch': 19.36}



[A                                              

{'loss': 0.0345, 'grad_norm': 0.19378423690795898, 'learning_rate': 0.0, 'epoch': 19.57}


100%|██████████| 12/12 [00:01<00:00,  8.65it/s]
                                                 

{'eval_loss': 1.3519965410232544, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8028156054471844, 'eval_runtime': 1.512, 'eval_samples_per_second': 62.171, 'eval_steps_per_second': 7.937, 'epoch': 19.57}



100%|██████████| 460/460 [04:48<00:00,  1.60it/s]

{'train_runtime': 288.3658, 'train_samples_per_second': 26.078, 'train_steps_per_second': 1.595, 'train_loss': 0.04188261352297243, 'epoch': 19.57}





TrainOutput(global_step=460, training_loss=0.04188261352297243, metrics={'train_runtime': 288.3658, 'train_samples_per_second': 26.078, 'train_steps_per_second': 1.595, 'total_flos': 243761747312640.0, 'train_loss': 0.04188261352297243, 'epoch': 19.574468085106382})

In [31]:
# Duplicate of the earlier training cell. Calling trainer.train() again on the
# same Trainer would keep updating the model it has already fine-tuned, so
# only train when this Trainer has not taken any optimizer step yet.
if trainer.state.global_step == 0:
    print(f"Training on {trainer.args.device}...")
    trainer.train()
else:
    print(f"Trainer already ran {trainer.state.global_step} steps – skipping duplicate training cell")

Training on MPS GPU...


  2%|▏         | 5/230 [00:03<01:53,  1.97it/s]

{'loss': 0.4097, 'grad_norm': 3.2584807872772217, 'learning_rate': 1.956521739130435e-05, 'epoch': 0.21}


  4%|▍         | 10/230 [00:05<01:37,  2.25it/s]

{'loss': 0.4492, 'grad_norm': 4.471360683441162, 'learning_rate': 1.9130434782608697e-05, 'epoch': 0.43}


  7%|▋         | 15/230 [00:07<01:32,  2.32it/s]

{'loss': 0.3889, 'grad_norm': 3.6879026889801025, 'learning_rate': 1.8695652173913045e-05, 'epoch': 0.64}


  9%|▊         | 20/230 [00:09<01:29,  2.35it/s]

{'loss': 0.3246, 'grad_norm': 5.04911470413208, 'learning_rate': 1.8260869565217393e-05, 'epoch': 0.85}


 10%|█         | 23/230 [00:10<01:27,  2.38it/s]
 10%|█         | 23/230 [00:11<01:27,  2.38it/s]

{'eval_loss': 0.7580357789993286, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7814488598080465, 'eval_runtime': 0.688, 'eval_samples_per_second': 136.631, 'eval_steps_per_second': 17.442, 'epoch': 0.98}


 11%|█         | 25/230 [00:15<03:57,  1.16s/it]

{'loss': 0.4069, 'grad_norm': 4.834653854370117, 'learning_rate': 1.782608695652174e-05, 'epoch': 1.06}


 13%|█▎        | 30/230 [00:17<01:48,  1.84it/s]

{'loss': 0.3306, 'grad_norm': 1.5567257404327393, 'learning_rate': 1.739130434782609e-05, 'epoch': 1.28}


 15%|█▌        | 35/230 [00:19<01:36,  2.01it/s]

{'loss': 0.2316, 'grad_norm': 4.704488754272461, 'learning_rate': 1.6956521739130437e-05, 'epoch': 1.49}


 17%|█▋        | 40/230 [00:21<01:24,  2.25it/s]

{'loss': 0.253, 'grad_norm': 1.8871983289718628, 'learning_rate': 1.6521739130434785e-05, 'epoch': 1.7}


 20%|█▉        | 45/230 [00:23<01:20,  2.31it/s]

{'loss': 0.3275, 'grad_norm': 6.004298686981201, 'learning_rate': 1.6086956521739132e-05, 'epoch': 1.91}


 20%|██        | 47/230 [00:24<01:19,  2.31it/s]
 20%|██        | 47/230 [00:25<01:19,  2.31it/s]

{'eval_loss': 0.7523419260978699, 'eval_accuracy': 0.7659574468085106, 'eval_f1': 0.7616408768536428, 'eval_runtime': 0.7136, 'eval_samples_per_second': 131.723, 'eval_steps_per_second': 16.816, 'epoch': 2.0}


 22%|██▏       | 50/230 [00:30<03:24,  1.14s/it]

{'loss': 0.2446, 'grad_norm': 4.366175174713135, 'learning_rate': 1.565217391304348e-05, 'epoch': 2.13}


 24%|██▍       | 55/230 [00:32<01:35,  1.83it/s]

{'loss': 0.168, 'grad_norm': 3.51638126373291, 'learning_rate': 1.5217391304347828e-05, 'epoch': 2.34}


 26%|██▌       | 60/230 [00:35<01:17,  2.20it/s]

{'loss': 0.1949, 'grad_norm': 3.108386516571045, 'learning_rate': 1.4782608695652174e-05, 'epoch': 2.55}


 28%|██▊       | 65/230 [00:37<01:12,  2.28it/s]

{'loss': 0.2068, 'grad_norm': 5.5532026290893555, 'learning_rate': 1.4347826086956522e-05, 'epoch': 2.77}


 30%|███       | 70/230 [00:39<01:08,  2.32it/s]

{'loss': 0.2019, 'grad_norm': 5.648699760437012, 'learning_rate': 1.391304347826087e-05, 'epoch': 2.98}



 30%|███       | 70/230 [00:40<01:08,  2.32it/s]

{'eval_loss': 0.7929103970527649, 'eval_accuracy': 0.7446808510638298, 'eval_f1': 0.7439987504745871, 'eval_runtime': 0.6903, 'eval_samples_per_second': 136.181, 'eval_steps_per_second': 17.385, 'epoch': 2.98}


 33%|███▎      | 75/230 [00:44<01:37,  1.59it/s]

{'loss': 0.2319, 'grad_norm': 2.8755900859832764, 'learning_rate': 1.3478260869565218e-05, 'epoch': 3.19}


 35%|███▍      | 80/230 [00:46<01:09,  2.16it/s]

{'loss': 0.1212, 'grad_norm': 2.1999032497406006, 'learning_rate': 1.3043478260869566e-05, 'epoch': 3.4}


 37%|███▋      | 85/230 [00:48<01:02,  2.30it/s]

{'loss': 0.2727, 'grad_norm': 4.17965841293335, 'learning_rate': 1.2608695652173915e-05, 'epoch': 3.62}


 39%|███▉      | 90/230 [00:50<00:59,  2.34it/s]

{'loss': 0.1842, 'grad_norm': 9.584864616394043, 'learning_rate': 1.2173913043478263e-05, 'epoch': 3.83}


 41%|████      | 94/230 [00:52<00:58,  2.34it/s]
 41%|████      | 94/230 [00:53<00:58,  2.34it/s]

{'eval_loss': 0.791930079460144, 'eval_accuracy': 0.7659574468085106, 'eval_f1': 0.7626619784995173, 'eval_runtime': 0.7127, 'eval_samples_per_second': 131.889, 'eval_steps_per_second': 16.837, 'epoch': 4.0}


 41%|████▏     | 95/230 [00:56<03:26,  1.53s/it]

{'loss': 0.1187, 'grad_norm': 1.608659029006958, 'learning_rate': 1.1739130434782611e-05, 'epoch': 4.04}


 43%|████▎     | 100/230 [00:58<01:19,  1.63it/s]

{'loss': 0.1479, 'grad_norm': 1.2835084199905396, 'learning_rate': 1.1304347826086957e-05, 'epoch': 4.26}


 46%|████▌     | 105/230 [01:00<00:58,  2.15it/s]

{'loss': 0.2156, 'grad_norm': 0.9907624125480652, 'learning_rate': 1.0869565217391305e-05, 'epoch': 4.47}


 48%|████▊     | 110/230 [01:03<00:51,  2.31it/s]

{'loss': 0.1161, 'grad_norm': 2.3753373622894287, 'learning_rate': 1.0434782608695653e-05, 'epoch': 4.68}


 50%|█████     | 115/230 [01:05<00:49,  2.34it/s]

{'loss': 0.0838, 'grad_norm': 1.0835294723510742, 'learning_rate': 1e-05, 'epoch': 4.89}


 51%|█████     | 117/230 [01:05<00:48,  2.35it/s]
 51%|█████     | 117/230 [01:06<00:48,  2.35it/s]

{'eval_loss': 0.7780247926712036, 'eval_accuracy': 0.7446808510638298, 'eval_f1': 0.7418631451777028, 'eval_runtime': 0.69, 'eval_samples_per_second': 136.234, 'eval_steps_per_second': 17.392, 'epoch': 4.98}


 52%|█████▏    | 120/230 [01:11<01:54,  1.04s/it]

{'loss': 0.0903, 'grad_norm': 3.119603395462036, 'learning_rate': 9.565217391304349e-06, 'epoch': 5.11}


 54%|█████▍    | 125/230 [01:13<00:55,  1.89it/s]

{'loss': 0.098, 'grad_norm': 4.876749038696289, 'learning_rate': 9.130434782608697e-06, 'epoch': 5.32}


 57%|█████▋    | 130/230 [01:15<00:44,  2.26it/s]

{'loss': 0.1561, 'grad_norm': 6.565861701965332, 'learning_rate': 8.695652173913044e-06, 'epoch': 5.53}


 59%|█████▊    | 135/230 [01:17<00:40,  2.34it/s]

{'loss': 0.1199, 'grad_norm': 9.018391609191895, 'learning_rate': 8.260869565217392e-06, 'epoch': 5.74}


 61%|██████    | 140/230 [01:19<00:38,  2.35it/s]

{'loss': 0.112, 'grad_norm': 7.800971984863281, 'learning_rate': 7.82608695652174e-06, 'epoch': 5.96}


 61%|██████▏   | 141/230 [01:20<00:37,  2.37it/s]
 61%|██████▏   | 141/230 [01:21<00:37,  2.37it/s]

{'eval_loss': 0.8578843474388123, 'eval_accuracy': 0.776595744680851, 'eval_f1': 0.7835119351993892, 'eval_runtime': 0.7305, 'eval_samples_per_second': 128.687, 'eval_steps_per_second': 16.428, 'epoch': 6.0}


 63%|██████▎   | 145/230 [01:26<01:15,  1.12it/s]

{'loss': 0.0798, 'grad_norm': 1.7496470212936401, 'learning_rate': 7.391304347826087e-06, 'epoch': 6.17}


 65%|██████▌   | 150/230 [01:28<00:40,  1.98it/s]

{'loss': 0.0759, 'grad_norm': 0.87253338098526, 'learning_rate': 6.956521739130435e-06, 'epoch': 6.38}


 67%|██████▋   | 155/230 [01:30<00:33,  2.25it/s]

{'loss': 0.1239, 'grad_norm': 4.830835819244385, 'learning_rate': 6.521739130434783e-06, 'epoch': 6.6}


 70%|██████▉   | 160/230 [01:33<00:30,  2.32it/s]

{'loss': 0.1071, 'grad_norm': 0.5738077163696289, 'learning_rate': 6.086956521739132e-06, 'epoch': 6.81}


 71%|███████▏  | 164/230 [01:34<00:28,  2.32it/s]
 71%|███████▏  | 164/230 [01:35<00:28,  2.32it/s]

{'eval_loss': 0.8291082382202148, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.8032598302330628, 'eval_runtime': 0.6887, 'eval_samples_per_second': 136.497, 'eval_steps_per_second': 17.425, 'epoch': 6.98}


 72%|███████▏  | 165/230 [01:40<02:03,  1.90s/it]

{'loss': 0.1535, 'grad_norm': 6.359257221221924, 'learning_rate': 5.652173913043479e-06, 'epoch': 7.02}


 74%|███████▍  | 170/230 [01:42<00:40,  1.48it/s]

{'loss': 0.1562, 'grad_norm': 6.644145965576172, 'learning_rate': 5.2173913043478265e-06, 'epoch': 7.23}


 76%|███████▌  | 175/230 [01:44<00:25,  2.15it/s]

{'loss': 0.0771, 'grad_norm': 0.38424232602119446, 'learning_rate': 4.782608695652174e-06, 'epoch': 7.45}


 78%|███████▊  | 180/230 [01:46<00:21,  2.31it/s]

{'loss': 0.0636, 'grad_norm': 2.35212779045105, 'learning_rate': 4.347826086956522e-06, 'epoch': 7.66}


 80%|████████  | 185/230 [01:48<00:19,  2.35it/s]

{'loss': 0.1002, 'grad_norm': 5.269256591796875, 'learning_rate': 3.91304347826087e-06, 'epoch': 7.87}


 82%|████████▏ | 188/230 [01:49<00:17,  2.35it/s]
 82%|████████▏ | 188/230 [01:50<00:17,  2.35it/s]

{'eval_loss': 0.8061166405677795, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7910917076727787, 'eval_runtime': 0.7037, 'eval_samples_per_second': 133.582, 'eval_steps_per_second': 17.053, 'epoch': 8.0}


 83%|████████▎ | 190/230 [01:55<00:53,  1.34s/it]

{'loss': 0.0575, 'grad_norm': 4.231607913970947, 'learning_rate': 3.4782608695652175e-06, 'epoch': 8.09}


 85%|████████▍ | 195/230 [01:57<00:20,  1.73it/s]

{'loss': 0.1514, 'grad_norm': 7.606069087982178, 'learning_rate': 3.043478260869566e-06, 'epoch': 8.3}


 87%|████████▋ | 200/230 [01:59<00:13,  2.20it/s]

{'loss': 0.0449, 'grad_norm': 1.087263822555542, 'learning_rate': 2.6086956521739132e-06, 'epoch': 8.51}


 89%|████████▉ | 205/230 [02:01<00:10,  2.32it/s]

{'loss': 0.0428, 'grad_norm': 0.4139881730079651, 'learning_rate': 2.173913043478261e-06, 'epoch': 8.72}


 91%|█████████▏| 210/230 [02:03<00:08,  2.33it/s]

{'loss': 0.0828, 'grad_norm': 0.44327273964881897, 'learning_rate': 1.7391304347826088e-06, 'epoch': 8.94}


 92%|█████████▏| 211/230 [02:04<00:08,  2.34it/s]
 92%|█████████▏| 211/230 [02:04<00:08,  2.34it/s]

{'eval_loss': 0.8237783312797546, 'eval_accuracy': 0.776595744680851, 'eval_f1': 0.7822372662798195, 'eval_runtime': 0.6903, 'eval_samples_per_second': 136.175, 'eval_steps_per_second': 17.384, 'epoch': 8.98}


 93%|█████████▎| 215/230 [02:12<00:16,  1.10s/it]

{'loss': 0.0554, 'grad_norm': 0.3317776024341583, 'learning_rate': 1.3043478260869566e-06, 'epoch': 9.15}


 96%|█████████▌| 220/230 [02:14<00:05,  1.86it/s]

{'loss': 0.0519, 'grad_norm': 0.4174441695213318, 'learning_rate': 8.695652173913044e-07, 'epoch': 9.36}


 98%|█████████▊| 225/230 [02:16<00:02,  2.25it/s]

{'loss': 0.0681, 'grad_norm': 0.9304336905479431, 'learning_rate': 4.347826086956522e-07, 'epoch': 9.57}


100%|██████████| 230/230 [02:18<00:00,  2.34it/s]

{'loss': 0.0789, 'grad_norm': 0.959475576877594, 'learning_rate': 0.0, 'epoch': 9.79}



100%|██████████| 230/230 [02:19<00:00,  2.34it/s]

{'eval_loss': 0.8268794417381287, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7935861705594033, 'eval_runtime': 0.6782, 'eval_samples_per_second': 138.6, 'eval_steps_per_second': 17.694, 'epoch': 9.79}


100%|██████████| 230/230 [02:23<00:00,  1.60it/s]

{'train_runtime': 143.8856, 'train_samples_per_second': 26.132, 'train_steps_per_second': 1.598, 'train_loss': 0.1690822545600974, 'epoch': 9.79}





TrainOutput(global_step=230, training_loss=0.1690822545600974, metrics={'train_runtime': 143.8856, 'train_samples_per_second': 26.132, 'train_steps_per_second': 1.598, 'total_flos': 121880873656320.0, 'train_loss': 0.1690822545600974, 'epoch': 9.787234042553191})

In [47]:
from IPython.display import display

# Report the accuracy recorded at every evaluation pass. The Trainer's
# log_history mixes training-loss entries with evaluation entries; only the
# latter carry an 'eval_accuracy' key, so filter on that.
print("EVALUATION RESULTS PER EPOCH:")
eval_entries = (entry for entry in trainer.state.log_history if 'eval_accuracy' in entry)
for entry in eval_entries:
    # 'epoch' is looked up defensively; the accuracy key is guaranteed by the filter.
    print(f"Epoch {entry.get('epoch', 'N/A')}: Accuracy = {entry['eval_accuracy']:.1%}")

EVALUATION RESULTS PER EPOCH:
Epoch 0.9787234042553191: Accuracy = 78.7%
Epoch 2.0: Accuracy = 79.8%
Epoch 2.978723404255319: Accuracy = 79.8%
Epoch 4.0: Accuracy = 77.7%
Epoch 4.9787234042553195: Accuracy = 78.7%
Epoch 6.0: Accuracy = 77.7%
Epoch 6.9787234042553195: Accuracy = 77.7%
Epoch 8.0: Accuracy = 78.7%
Epoch 8.97872340425532: Accuracy = 79.8%
Epoch 10.0: Accuracy = 78.7%
Epoch 10.97872340425532: Accuracy = 78.7%
Epoch 12.0: Accuracy = 79.8%
Epoch 12.97872340425532: Accuracy = 79.8%
Epoch 14.0: Accuracy = 78.7%
Epoch 14.97872340425532: Accuracy = 78.7%
Epoch 16.0: Accuracy = 79.8%
Epoch 16.97872340425532: Accuracy = 79.8%
Epoch 18.0: Accuracy = 79.8%
Epoch 18.97872340425532: Accuracy = 79.8%
Epoch 19.574468085106382: Accuracy = 79.8%


In [48]:
# Run one more evaluation pass with the trained model and print the headline
# accuracy from the returned metrics dict.
# NOTE(review): evaluate() is called without a dataset, so it scores whatever
# eval dataset the Trainer was built with — confirm that this really is a
# held-out test split before quoting the number as "TEST" accuracy.
final_metrics = trainer.evaluate()
final_accuracy = final_metrics['eval_accuracy']
print(f"\nFINAL TEST ACCURACY: {final_accuracy:.1%}")

100%|██████████| 12/12 [00:00<00:00, 18.36it/s]


FINAL TEST ACCURACY: 79.8%





In [49]:
from sklearn.metrics import classification_report
import numpy as np

# Score the test split and reduce the per-class logits to predicted class ids.
preds = trainer.predict(test_ds)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = preds.label_ids

# Pass the complete list of class ids explicitly. Without `labels=`,
# scikit-learn infers the classes from the ids that actually occur in
# y_true / y_pred and raises "Number of classes ... does not match size of
# target_names" as soon as a rare class is missing from both.
# Assumes labels[i] is the display name of class id i, which the original
# target_names=labels call already relied on.
class_ids = np.arange(len(labels))
print(classification_report(true_labels, pred_labels, labels=class_ids, target_names=labels))

100%|██████████| 12/12 [00:00<00:00, 18.51it/s]

               precision    recall  f1-score   support

      Banking       0.71      0.71      0.71        14
Informational       0.64      0.88      0.74         8
    Marketing       0.81      0.74      0.77        23
  Real Estate       1.00      1.00      1.00         6
     Shopping       0.60      0.82      0.69        11
         Spam       1.00      1.00      1.00         2
      Telecom       0.96      0.80      0.87        30

     accuracy                           0.80        94
    macro avg       0.82      0.85      0.83        94
 weighted avg       0.82      0.80      0.80        94






In [50]:
def predict(text):
    """Classify a single SMS and return its category name.

    The message is normalised the same way the training CSV was when it was
    loaded at the top of this notebook (lower-cased, then every character
    other than ``a-z``, ``0-9``, whitespace, ``%`` and ``₹`` removed), so the
    model is queried with the same kind of text it was fine-tuned on.
    NOTE(review): assumes the model was trained on that cleaned ``text``
    column — confirm against the dataset-building cells.

    Args:
        text: The raw SMS body as a string.

    Returns:
        The entry of ``id2label`` for the highest-scoring class.
    """
    import re  # local import: keeps this cell independent of earlier import cells

    if isinstance(text, str):  # anything else is handed to the tokenizer untouched
        text = re.sub(r'[^a-z0-9\s%₹]', '', text.lower())

    model.eval()  # inference must not depend on the mode an earlier cell left the model in
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        pred = torch.argmax(model(**inputs).logits, dim=1).item()
    return id2label[pred]

# Quick smoke test on hand-written messages. The trailing comment on each
# sample is the category the author intended, not a verified model output.
sample_messages = (
    "Get 50% off on recharge!",   # intended: Telecom
    "WIN IPHONE FREE!",           # intended: Marketing
    "2 BHK flat in Pune",         # intended: Real Estate
    "EMI due tomorrow",           # intended: Banking
    "Buy shoes at 40% off",       # intended: Shopping
)
for message in sample_messages:
    print(predict(message))

Telecom
Marketing
Real Estate
Marketing
Shopping


In [51]:
import joblib

# Write the fine-tuned model and its tokenizer into the same directory.
# NOTE(review): with the transformers==4.40.2 pinned in the install cell the
# weights are expected to land in model.safetensors rather than
# pytorch_model.bin — confirm before pointing other tooling at a file name.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bert_sms_final")

# Store the id -> label-name mapping alongside them. This stays the last
# expression in the cell so the list of written file names is displayed.
joblib.dump(id2label, "bert_sms_final/id2label.pkl")

['bert_sms_final/id2label.pkl']