# SMS Classification



In [1]:
import pandas as pd
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from nlp import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

In [2]:
sms = pd.read_csv("../corpus/sg_sms_corpus_en.csv")

In [3]:
sms.shape

(48092, 2)

In [4]:
sms.head()

Unnamed: 0,user_id,text
0,0,Bugis oso near wat...
1,0,"Go until jurong point, crazy.. Available only ..."
2,0,I dunno until when... Lets go learn pilates...
3,0,Den only weekdays got special price... Haiz......
4,0,Meet after lunch la...


In [5]:
# unique user_id
sms.user_id.nunique()

199

In [6]:
sms.user_id.value_counts()

129    4706
9      4683
105    3501
102    2548
126    1936
       ... 
152      10
140      10
172      10
183      10
191      10
Name: user_id, Length: 199, dtype: int64

In [7]:
# filter for the top 3 users for classification
sms = sms[sms.user_id.isin([9, 105, 129])].reset_index(drop=True)

In [9]:
# sample 5 rows for each user, can you spot the difference?
for user_id in sms.user_id.unique():
    print(f">>> User id : {user_id}")
    print("\n".join(sms.loc[sms.user_id == user_id, "text"].sample(5, random_state=0).tolist()))
    print()

>>> User id : 9
Look c... Buying but not so soon...
m always like tis...pain u see..worst den monthly cramps!haha...
What are you doing now ?
Hey you on your way?
u r more naggin thn my mum... ok, but i still hav to do my proj rite. how i wish i can go home now.. thn u goin alone ah? Ur bil1 not goin wf u?

>>> User id : 105
See can find people anot first eh.
Haha sorryi just saw yourmessage.
Okay  <#> pm my house.
Hahaha still havent reach home.
No carrot cake. Bought satay beehoon for you.

>>> User id : 129
Ohh okay!:P hahaha yeah the bride quite chio! Yes omg I watch untilthere liao yucks. Disgusting! That guy tasted it!!!! Yucks!!!!!!  Ohhyeah I kena taupok by shahid before, forgot who else! It was err veryrandom ah, during council chalet! Chris they all also taupok-ed me!Hahaha ooo! Tuna??
Haha yup^^ sian I just got your msg! Haiz rest well my dear! (: yeapcan't wait!!!
It's nice!!
He's a very very weird guy, abit embarrassing:/ hahaha yeah youdefinitely won't recognise any of th

It seems like the 3rd user is more distinctive with longer sentence and words, and more usage of exclamation mark.

In [10]:
# remap user_id
sms.user_id.replace([9, 105, 129], [0, 1, 2], inplace=True)

In [11]:
# stratified sampling for train-test split
sms_train = pd.DataFrame()
for user_id in sms.user_id.unique():
    sms_train = pd.concat([sms_train, sms[sms.user_id == user_id].sample(frac=0.6)])

In [12]:
sms_train.shape

(7735, 2)

In [13]:
sms_test = sms.drop(sms_train.index)

In [14]:
sms_train.reset_index(drop=True, inplace=True)
sms_test.reset_index(drop=True, inplace=True)

In [15]:
sms_test.shape

(5155, 2)

In [16]:
# make sure no overlap of sms in train-test
sms_train.text.isin(sms_test.text).sum()

0

In [17]:
train_dataset = Dataset.from_pandas(sms_train)
test_dataset = Dataset.from_pandas(sms_test)

In [18]:
train_dataset = train_dataset.map(lambda examples: {"label": examples["user_id"]}, batched=True)
test_dataset = test_dataset.map(lambda examples: {"label": examples["user_id"]}, batched=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [19]:
# allow up to 10 mins to download the model when running for the first time
tokenizer = BertTokenizer.from_pretrained("zanelim/singbert-large-sg")
model = BertForSequenceClassification.from_pretrained("zanelim/singbert-large-sg", 
                                                      num_labels=sms_train.user_id.nunique())

Some weights of the model checkpoint at zanelim/singbert-large-sg were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [20]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

In [21]:
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [22]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [23]:
# freeze weights of pre-trained model
for param in model.base_model.parameters():
    param.requires_grad = False

In [24]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [29]:
training_args = TrainingArguments(
    output_dir="../data/sms_classification",
    num_train_epochs=15,
    per_device_train_batch_size=200,
    per_device_eval_batch_size=64,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir="../data/sms_classification",
    save_steps=1000,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [30]:
# this will take around 30 mins on 2 GPUs
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=15.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…





TrainOutput(global_step=300, training_loss=1.0349915971358616)

In [31]:
trainer.evaluate()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=41.0, style=ProgressStyle(description_wi…






{'eval_loss': 0.911719897898232,
 'eval_accuracy': 0.6259941804073715,
 'eval_f1': 0.6153501909755232,
 'eval_precision': 0.6131202312817052,
 'eval_recall': 0.6259941804073715,
 'epoch': 15.0}

In [33]:
trainer.save_model()

In [32]:
preds = trainer.predict(test_dataset)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=41.0, style=ProgressStyle(description_wi…






In [34]:
print(classification_report(preds.label_ids, preds.predictions.argmax(-1)))

              precision    recall  f1-score   support

           0       0.67      0.73      0.70      1873
           1       0.49      0.35      0.41      1400
           2       0.65      0.72      0.68      1882

    accuracy                           0.63      5155
   macro avg       0.60      0.60      0.60      5155
weighted avg       0.61      0.63      0.62      5155



In [40]:
pd.DataFrame(confusion_matrix(preds.label_ids, preds.predictions.argmax(-1)), 
             index=range(3), columns=range(3))

Unnamed: 0,0,1,2
0,1374,217,282
1,447,493,460
2,221,301,1360


As expected, the model does a much better job in classifying the 1st and 3rd user due to their distinctiveness

In [43]:
y_test_true = preds.predictions.argmax(-1)
y_test_pred = sms_test.user_id.values
y_test = pd.DataFrame({"true": y_test_true, "pred": y_test_pred})

In [49]:
# let's look at the examples where the model is correct for the 1st user
sms_test.loc[(y_test.true == 0) & (y_test.pred == 0), "text"].sample(10).tolist()

['Wah...News really spread fast...Haha tonite dun nid formal rite?',
 'then, come back more often to c grandma when u r free, take care...',
 'HI LADY PLSE',
 'Oh... Lk tt ah... Wat kind of jobs u wan... Waitress or office, i help u look out..',
 'L CALL U',
 'We meet at 1115 instead can? Btw, ü wanna go for my hall bash in embassy on e 18?',
 'Yup... Centre pt all small ones leh... 300 pieces.',
 'Really... Thgt u oredi booked tt lesson liao... Haha, gd...',
 'I thk tmr still got one more lecture... I got no more tut so thk i wun b going to school...',
 'Wah u so gd... Only 6 pts...']

In [50]:
# let's look at the examples where the model is incorrect for the 1st user
sms_test.loc[(y_test.true != 0) & (y_test.pred == 0), "text"].sample(10).tolist()

['ü giving tuition rite? Ya applied for local ü',
 'No lah. C u tom then ü',
 'm so sweet rite.. i bought choc cake for xin, yummy..',
 'Hi! How was your weekend? Did you have a good time?  :)',
 'U still outside w friends?',
 'Haha cant help much.',
 'he said it is quite a good chance to practice before the real presentation lor, you might want to think about it.',
 'Haha... So is he handsomer?',
 'Erm wat break....? I think yr jc guy frens in army had a break last week rite?Hmmm, i think a nite job is quite suitable for ü.... Hee hee.',
 'm on my way liao...']

In [51]:
# let's look at the examples where the model is correct for the 2nd user
sms_test.loc[(y_test.true == 1) & (y_test.pred == 1), "text"].sample(10).tolist()

['Hahahaha nvm la.',
 ':-( neh mind',
 '<#> pm eh. Help me tell xt eh.',
 'Outside. Why?',
 'Nah I will leave yours out.',
 'Any bubble tea for you?',
 'Yo you looking for me?',
 "Yeah, quite troublesome, but it's a one time thing mah.",
 "Oh ya. Can come down  <#> '",
 'Yup im still in school.']

In [52]:
# let's look at the examples where the model is incorrect for the 2nd user
sms_test.loc[(y_test.true != 1) & (y_test.pred == 1), "text"].sample(10).tolist()

["Haha when's your exam?",
 'Haha eating with who?',
 'Yes, burden off my shoulders! Haha',
 'Mahjong tonight? Cally not free, see if you can find players.',
 'Wah so good lol. Did he like use that to bao your red packet?! Lol',
 'Lol what order is that?! Lol sugar level?',
 "Yo ladies, if the movie changed to wednesday night, who's on?",
 "Hhahaha or probably you have not talk to me for the past few days,that's why the fever came. Nvm now im here already hahahaha",
 'Oh im at clubhouse. Later meeting at lt <#> a. I will be there by 6pm.',
 'Yup to replenish your sodium. Should probably try some chivas too. XD']

In [53]:
# let's look at the examples where the model is correct for the 3rd user
sms_test.loc[(y_test.true == 2) & (y_test.pred == 2), "text"].sample(10).tolist()

['Leaving amk! About half an hour bah! If you want can go cineleisure first!',
 'Boo! Haha thanks for the chat:P have a safe trip home!',
 'Haha ohh no super unglam!!!! ><next time dun laugh in front of youle! Hey you want to take 2 star on  <DATE>  oct? Jiayi Spencer JensSophie going most probably, can ask Lewis Nicky yuhan they all too!',
 'Are you still here? Hehe I have sth to pass you:P',
 'LOL AWESOME WHY HE SO WULIAO HAHAHA OKAY OUCH THAT IS PAINFUL I HOPETHERE IS NO HOPE IN YOUR FOOT RIGHT NOW',
 'Anyway go sch with me tmr morning!! :D I can take  <#>  go meet youthen we take  <#> !',
 "It's not!! I'm back!",
 "Haha uhh okay ah! he's a senior! His hairstyle needs to change though LOL.",
 "Haha ok ok ah I think but still won't do very well bah! Not sure leh!I felt like I could answer but then dunno if I missed anything outhaha:/ going to bathe now!",
 'Aw man haha fine:P see you later then!! Enjoy yourself ((:']

In [54]:
# let's look at the examples where the model is incorrect for the 3rd user
sms_test.loc[(y_test.true != 2) & (y_test.pred == 2), "text"].sample(10).tolist()

['Ohh x.x haha okay! Thanks:D do you think I can put my bag there?',
 'Oh crap:x!!!!!7',
 "Hihi ppl wanna volunteer to help out for a swimming event? On 4 & 5Nov, 7 am to 9 pm with 5 hours break in between on both days plus acompulsory training session on  <#>  Oct  <#>  -  <#> . Can get twoevent tshirts &  <#>  bucks! Reply with your nric, house address,emergency contact (name, number, relationship) plus your allergies!Let's volunteer tgt, so we sign up for uhh f&b and VIP room (1stchoice) or ushers (2nd) coz hehe I think these two move ard less bahand not so stress! Please reply if you're interested and jio anyoneelse! I need replies by tonight! I sent this to abbie, vivien, sophie,friend, Ivan, huikang and wenbo, I dun have Jimmy's number!",
 'Okay, I go down now?:P',
 'Oh I dunno maybe he was attempting a raghav:P',
 'You pig!! Ok!!! Want do um on Wednesday???',
 'No! You!',
 "Hahaha confusing! And no i'm not!!!",
 'LOL I ALSO DUN KNOW HAHAHA',
 "Hahaha what I didn't see!!! What ha