# SMS Classification



In [1]:
import pandas as pd
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer
from nlp import load_dataset, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

In [2]:
sms = pd.read_csv("../corpus/sg_sms_corpus_en.csv")

In [3]:
sms.shape

(48092, 2)

In [4]:
sms.head()

Unnamed: 0,user_id,text
0,0,Bugis oso near wat...
1,0,"Go until jurong point, crazy.. Available only ..."
2,0,I dunno until when... Lets go learn pilates...
3,0,Den only weekdays got special price... Haiz......
4,0,Meet after lunch la...


In [5]:
# unique user_id
sms.user_id.nunique()

199

In [6]:
sms.user_id.value_counts()

129    4706
9      4683
105    3501
102    2548
126    1936
       ... 
152      10
140      10
172      10
183      10
191      10
Name: user_id, Length: 199, dtype: int64

In [7]:
# filter for the top 3 users for classification
sms = sms[sms.user_id.isin([9, 105, 129])].reset_index(drop=True)

In [8]:
# sample 5 rows for each user, can you spot the difference?
for user_id in sms.user_id.unique():
    print("\n".join(sms.loc[sms.user_id == user_id, "text"].sample(5, random_state=0).tolist()))
    print()

Look c... Buying but not so soon...
m always like tis...pain u see..worst den monthly cramps!haha...
What are you doing now ?
Hey you on your way?
u r more naggin thn my mum... ok, but i still hav to do my proj rite. how i wish i can go home now.. thn u goin alone ah? Ur bil1 not goin wf u?

See can find people anot first eh.
Haha sorryi just saw yourmessage.
Okay  <#> pm my house.
Hahaha still havent reach home.
No carrot cake. Bought satay beehoon for you.

Ohh okay!:P hahaha yeah the bride quite chio! Yes omg I watch untilthere liao yucks. Disgusting! That guy tasted it!!!! Yucks!!!!!!  Ohhyeah I kena taupok by shahid before, forgot who else! It was err veryrandom ah, during council chalet! Chris they all also taupok-ed me!Hahaha ooo! Tuna??
Haha yup^^ sian I just got your msg! Haiz rest well my dear! (: yeapcan't wait!!!
It's nice!!
He's a very very weird guy, abit embarrassing:/ hahaha yeah youdefinitely won't recognise any of them le! Haha okay have fun and takecare): tonight sle

it does seem like the 3rd user is more distinctive with more lengthy sentence and words

In [9]:
# remap user_id
sms.user_id.replace([9, 105, 129], [0, 1, 2], inplace=True)

In [10]:
# stratified sampling for train-test split
sms_train = pd.DataFrame()
for user_id in sms.user_id.unique():
    sms_train = pd.concat([sms_train, sms[sms.user_id == user_id].sample(frac=0.6)])

In [11]:
sms_train.shape

(7735, 2)

In [12]:
sms_test = sms.drop(sms_train.index)

In [13]:
sms_train.reset_index(drop=True, inplace=True)
sms_test.reset_index(drop=True, inplace=True)

In [14]:
sms_test.shape

(5155, 2)

In [15]:
# make sure no overlap of sms in train-test
sms_train.text.isin(sms_test.text).sum()

0

In [16]:
train_dataset = Dataset.from_pandas(sms_train)
test_dataset = Dataset.from_pandas(sms_test)

In [17]:
train_dataset = train_dataset.map(lambda examples: {"label": examples["user_id"]}, batched=True)
test_dataset = test_dataset.map(lambda examples: {"label": examples["user_id"]}, batched=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [18]:
# allow up to 10 mins to download the model when running for the first time
tokenizer = BertTokenizer.from_pretrained("zanelim/singbert-large-sg")
model = BertForSequenceClassification.from_pretrained("zanelim/singbert-large-sg", 
                                                      num_labels=sms_train.user_id.nunique())

Some weights of the model checkpoint at zanelim/singbert-large-sg were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [19]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

In [20]:
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [21]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [22]:
# freeze weights of pre-trained model
for param in model.base_model.parameters():
    param.requires_grad = False

In [23]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [24]:
training_args = TrainingArguments(
    output_dir="../data/sms_classification",
    num_train_epochs=10,
    per_device_train_batch_size=200,
    per_device_eval_batch_size=64,
    warmup_steps=300,
    weight_decay=0.01,
    logging_dir="../data/sms_classification",
    save_steps=5000,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [25]:
# this will take around 20 mins on 2 GPUs
trainer.train()

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…

  return function(data_struct)





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=20.0, style=ProgressStyle(description_wid…





TrainOutput(global_step=200, training_loss=1.0657428365945816)

In [26]:
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=41.0, style=ProgressStyle(description_wi…




{'eval_loss': 1.0017812339270986,
 'eval_accuracy': 0.5177497575169738,
 'eval_f1': 0.48164601384413686,
 'eval_precision': 0.4995844077405502,
 'eval_recall': 0.5177497575169738,
 'epoch': 10.0}

In [27]:
preds = trainer.predict(test_dataset)

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=41.0, style=ProgressStyle(description_wi…






In [30]:
print(classification_report(preds.label_ids, preds.predictions.argmax(-1)))

              precision    recall  f1-score   support

           0       0.52      0.65      0.58      1873
           1       0.43      0.13      0.20      1400
           2       0.53      0.67      0.59      1882

    accuracy                           0.52      5155
   macro avg       0.49      0.49      0.46      5155
weighted avg       0.50      0.52      0.48      5155



In [32]:
print(confusion_matrix(preds.label_ids, preds.predictions.argmax(-1)))

[[1219  104  550]
 [ 628  185  587]
 [ 475  142 1265]]


As expected, the model does a much better job in classifying the 1st and 3rd user due to their distinctiveness