In [1]:
# Installations
!pip install -q datasets transformers
!pip install git+https://github.com/huggingface/accelerate
!pip install evaluate
!pip install peft

Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-ttbvt6q2
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-ttbvt6q2
  Resolved https://github.com/huggingface/accelerate to commit 07ce74868cf0197a43dfa7aaf120384ec5a4afd8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate==1.9.0.dev0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate==1.9.0.dev0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate==1.9.0.de

In [38]:
# import libraries
import pandas as pd
import ast
import os
from sklearn.metrics import classification_report
import evaluate
import numpy as np
import torch
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig

**Dataset**

In [2]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Data
!unzip "/content/drive/MyDrive/Projects/smsFinance/Data-20250705T071104Z-1-001.zip" -d "/content/annotations_data"

Archive:  /content/drive/MyDrive/Projects/smsFinance/KaggleData-20250705T071104Z-1-001.zip
  inflating: /content/annotations_data/KaggleData/clean_txn_ner_kaggle_v1.csv  
  inflating: /content/annotations_data/KaggleData/process_annotations.ipynb  
  inflating: /content/annotations_data/KaggleData/ner_txn_kaggle_v2.csv  
  inflating: /content/annotations_data/KaggleData/RawData/ner_3txn_combine_v0.csv  
  inflating: /content/annotations_data/KaggleData/RawData/train_sms_mine.csv  
  inflating: /content/annotations_data/KaggleData/RawData/SMS-Data.csv  
  inflating: /content/annotations_data/KaggleData/ner_txn_kaggle_v0.csv  


In [7]:
# load data
data_path = "/content/annotations_data/RawData/sms_data.csv" # change file path
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Label,Message
0,Delivery,"Dear Guest, Thanks for choosing The Edison. Or..."
1,Hotel,Welcome to Southern University at Shreveport R...
2,Payment,Bill dated 2013-01-14 for Rs 626 has been gene...
3,Payment,Bill dated 2014-11-30 for Rs 323 has been gene...
4,Appointment,"DEAR Zachary, THANK YOU FOR YOUR BOOKING (ORDE..."


In [8]:
# number of samples per class
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
info,13375
ham,10000
spam,6625
Appointment,1000
Delivery,1000
Hotel,1000
Payment,1000
Flight,1000
PickUp,1000
Train,1000


In [9]:
# label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

label_column_name = "Label"
le.fit(df[label_column_name].tolist())
df['label_encoding'] = le.transform(df[label_column_name].tolist())

In [12]:
df.head()

Unnamed: 0,Label,Message,label_encoding
0,Delivery,"Dear Guest, Thanks for choosing The Edison. Or...",3
1,Hotel,Welcome to Southern University at Shreveport R...,6
2,Payment,Bill dated 2013-01-14 for Rs 626 has been gene...,8
3,Payment,Bill dated 2014-11-30 for Rs 323 has been gene...,8
4,Appointment,"DEAR Zachary, THANK YOU FOR YOUR BOOKING (ORDE...",0


In [13]:
# for mapping label encoding id and label
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
info,13375
ham,10000
spam,6625
Appointment,1000
Delivery,1000
Hotel,1000
Payment,1000
Flight,1000
PickUp,1000
Train,1000


In [14]:
df['label_encoding'].value_counts()

Unnamed: 0_level_0,count
label_encoding,Unnamed: 1_level_1
13,13375
12,10000
14,6625
0,1000
3,1000
6,1000
8,1000
5,1000
9,1000
11,1000


In [15]:
# mapping id and label
label2id = {'Appointment': 0, 'Bus': 1, 'Cab': 2, 'Delivery': 3, 'Expiry': 4, 'Flight': 5, 'Hotel': 6, 'Movie': 7, 'Payment': 8, 'PickUp': 9, 'Reservation': 10, 'Train': 11, 'ham': 12, 'info': 13, 'spam': 14 }
id2label = { 0: 'Appointment', 1: 'Bus', 2: 'Cab', 3: 'Delivery', 4: 'Expiry', 5: 'Flight', 6: 'Hotel', 7: 'Movie', 8: 'Payment', 9: 'PickUp', 10: 'Reservation', 11: 'Train', 12: 'ham', 13: 'info', 14: 'spam' }

In [16]:
# shuffle
dataframe = df.sample(frac=1).reset_index(drop=True)
dataframe.head()

Unnamed: 0,Label,Message,label_encoding
0,spam,"Greetings, Vodafone wishes you a Happy Gandhi ...",14
1,ham,john will meet me on this sat and next saturday,12
2,info,Appointment with Dr Collin in CHANNARAYAPATNA ...,13
3,ham,rohini's party at 6:30 tomorrow,12
4,spam,"Dear JET Airways Customer, flight 1C384 from S...",14


In [17]:
# drop duplicates
dataset = dataframe.drop_duplicates(keep='first', inplace=False)
dataset.shape #

(41921, 3)

In [18]:
# split data in train and test set
from sklearn.model_selection import train_test_split
df_train,df_test = train_test_split(dataset,test_size=0.2,random_state=42)

In [20]:
# convert all data samples to string type
df_train['Message'] = df_train['Message'].astype(str)
df_test['Message'] = df_test['Message'].astype(str)

In [21]:
# convert data samples into Dataset format to speed up training process
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [24]:
# sample training data format
train_dataset[5]

{'Label': 'Delivery',
 'Message': 'Dear Guest, Thanks for choosing Marrybrown. Order ID 7631. Delivery by 2014-07-14 21:24. Enjoy 27% discount on your next purchase. Code TWI9XS. T&C.',
 'label_encoding': 3,
 '__index_level_0__': 21630}

**Model**

In [25]:
# load tokenizer
#model_name = "distilbert-base-uncased" # original Model -> ~1GB
model_name = "huawei-noah/TinyBERT_General_4L_312D" # relatively gives same performance -> ~130MB
model_name = 'distilbert-base-uncased' # Smaller model, you can alternatively use above models
tokenizer = AutoTokenizer.from_pretrained(model_name, padding="max_length",
                                          truncation=True, max_length=128, model_max_length=128,
                                          return_tensors="pt", add_prefix_space=True) # max_length = 128 tokens

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

**Pre-process data**

In [45]:
# map training samples to tokens
def preprocess_function(examples):
  tokens = tokenizer(examples["Message"], padding="max_length", truncation=True)
  tokens["labels"] = examples["label_encoding"]   # label encoder
  return tokens

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/33536 [00:00<?, ? examples/s]

Map:   0%|          | 0/8385 [00:00<?, ? examples/s]

In [46]:
# check sequence length
len(tokenized_train[9]['input_ids'])

128

In [47]:
# check maximum sequence length
print(tokenizer.model_max_length)

128


In [55]:
# load model - mention number of classes
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=15,ignore_mismatched_sizes=True,
                                                           id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [57]:
# accuracy metrics function to pass into trainer
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

**Apply untrained model to text**

In [51]:
test_untrained = df_test['Message'][5:10].tolist()
test_untrained

['OTP is 667778 for the txn of INR 3721.00 at True Value on your AXIS bank CREDIT card ending with 5474. Valid till 6:38:29. Do not share the OTP with anyone for security reasons',
 'Your Tkt Cancelled. PNR, 12136726454, Amt 521 will be refunded in your account.',
 '* KSRTC m-Ticket *  from: KOLHAPUR to: THRISSUR PsngrName: Clayton TripCode: 1446KOLTHR PNR No.: J59643129 JnyDate: 2015-03-05 DepTime: 14:46  SeatNo.: 16, 19, 50, 51, 21  Class: AC chair  BoardingPt: KOLHAPUR Residency Road Txn Password: 8274 . Please carry your photo ID during journey. T&C apply. Visit  www.ksrtc.in',
 'Next Thursday at 9 pm',
 'PNR:7214716784,TRAIN:5676,DOJ:2015-08-19,FC,BF-BMC,Dep:21:42, Trenton,F2 59 , Fare:2005,SC:10.0+PG+INS']

In [52]:
# define list of examples
text_list = test_untrained

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
OTP is 667778 for the txn of INR 3721.00 at True Value on your AXIS bank CREDIT card ending with 5474. Valid till 6:38:29. Do not share the OTP with anyone for security reasons - info
Your Tkt Cancelled. PNR, 12136726454, Amt 521 will be refunded in your account. - info
* KSRTC m-Ticket *  from: KOLHAPUR to: THRISSUR PsngrName: Clayton TripCode: 1446KOLTHR PNR No.: J59643129 JnyDate: 2015-03-05 DepTime: 14:46  SeatNo.: 16, 19, 50, 51, 21  Class: AC chair  BoardingPt: KOLHAPUR Residency Road Txn Password: 8274 . Please carry your photo ID during journey. T&C apply. Visit  www.ksrtc.in - info
Next Thursday at 9 pm - info
PNR:7214716784,TRAIN:5676,DOJ:2015-08-19,FC,BF-BMC,Dep:21:42, Trenton,F2 59 , Fare:2005,SC:10.0+PG+INS - info


In [41]:
# model checkpoints path
checkpoints_path = '/content/annotations_data/checkpoints/lora-text-classification' # Change path, Saving for each 500, delete few checkpoints from drive to save storage

# if not exists, create directory
if not os.path.exists(checkpoints_path):
  os.makedirs(checkpoints_path)

**Fine-tuning with LoRA**

PEFT involves augmenting a base model with a relatively small number of trainable parameters.

In [56]:
# peft config
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic rank of trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.01, # probablity of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer only
# New version of our model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 638,991 || all params: 67,603,998 || trainable%: 0.9452


In [58]:
# Training argumnets - hyper parameters
# training - batch size-> 32 low data, else 16 => number of examples processed per optimziation step
# epochs-40 for large data else 100-120 => number of times model runs through training data
# lr -> 2e-5  size of optimization step

training_args = TrainingArguments(
    output_dir=checkpoints_path,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy = "epoch",
    logging_strategy="epoch")

# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)

# training
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9207,0.457318,0.704949
2,0.4586,0.429389,0.711986
3,0.4385,0.420192,0.708289
4,0.4324,0.416884,0.712701
5,0.4288,0.415249,0.710793


TrainOutput(global_step=10480, training_loss=0.5358155112230141, metrics={'train_runtime': 1449.7703, 'train_samples_per_second': 115.66, 'train_steps_per_second': 7.229, 'total_flos': 5636608884080640.0, 'train_loss': 0.5358155112230141, 'epoch': 5.0})

**Generate prediction**

In [63]:
model.to("cpu")
print("Trained model predictions:")
print("--------------------------")
for text in test_untrained:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
OTP is 667778 for the txn of INR 3721.00 at True Value on your AXIS bank CREDIT card ending with 5474. Valid till 6:38:29. Do not share the OTP with anyone for security reasons - spam
Your Tkt Cancelled. PNR, 12136726454, Amt 521 will be refunded in your account. - spam
* KSRTC m-Ticket *  from: KOLHAPUR to: THRISSUR PsngrName: Clayton TripCode: 1446KOLTHR PNR No.: J59643129 JnyDate: 2015-03-05 DepTime: 14:46  SeatNo.: 16, 19, 50, 51, 21  Class: AC chair  BoardingPt: KOLHAPUR Residency Road Txn Password: 8274 . Please carry your photo ID during journey. T&C apply. Visit  www.ksrtc.in - Bus
Next Thursday at 9 pm - ham
PNR:7214716784,TRAIN:5676,DOJ:2015-08-19,FC,BF-BMC,Dep:21:42, Trenton,F2 59 , Fare:2005,SC:10.0+PG+INS - Train


**Load Peft Model**

In [67]:
# how to load peft model from hub for inference
model_id = "/content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480"  # checkpoints_path
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=15, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
model.to("cpu")
print("Trained model predictions:")
print("--------------------------")
for text in test_untrained:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
OTP is 667778 for the txn of INR 3721.00 at True Value on your AXIS bank CREDIT card ending with 5474. Valid till 6:38:29. Do not share the OTP with anyone for security reasons - spam
Your Tkt Cancelled. PNR, 12136726454, Amt 521 will be refunded in your account. - spam
* KSRTC m-Ticket *  from: KOLHAPUR to: THRISSUR PsngrName: Clayton TripCode: 1446KOLTHR PNR No.: J59643129 JnyDate: 2015-03-05 DepTime: 14:46  SeatNo.: 16, 19, 50, 51, 21  Class: AC chair  BoardingPt: KOLHAPUR Residency Road Txn Password: 8274 . Please carry your photo ID during journey. T&C apply. Visit  www.ksrtc.in - Bus
Next Thursday at 9 pm - ham
PNR:7214716784,TRAIN:5676,DOJ:2015-08-19,FC,BF-BMC,Dep:21:42, Trenton,F2 59 , Fare:2005,SC:10.0+PG+INS - Train


In [66]:
!zip -r "file.zip" "/content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480"

  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/ (stored 0%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/adapter_config.json (deflated 55%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/rng_state.pth (deflated 25%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/README.md (deflated 66%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/special_tokens_map.json (deflated 42%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/vocab.txt (deflated 53%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/trainer_state.json (deflated 69%)
  adding: content/annotations_data/checkpoints/lora-text-classification/checkpoint-10480/adapter_model.safetensors (deflated 7%)
  adding: content/annotations_data/checkpoints/l