# BERT Base 
Uncased Embeddings

Using bert uncased embeddings to find a representative word embedding of trivial and non-trivial messages (from the training sentences). Classify each new sentence based on the Euclidean distance from the representative embeddings.

In [1]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf

from scipy.spatial.distance import cosine

2023-04-30 10:39:26.191775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
spam_df = pd.read_excel('../../src/data/spam.xlsx')
checkmate_df = pd.read_csv('../../src/data/checkmate_Table.csv')

In [13]:
checkmate_df['is_trivial'] = (checkmate_df['taggedCategory']=='Trivial')
# Not all sentences are imported as strings
checkmate_df['text'] = checkmate_df['text'].astype('str')

# Remove empty rows
spam_df = spam_df.iloc[:, 1:3]
# Rename columns
spam_df.columns = ['text', 'is_trivial']
# Not all sentences are imported as strings
spam_df['text'] = spam_df['text'].astype('str')

In [16]:
spam_df['text'] = (spam_df['text'].apply(lambda x: "[CLS] " + x + " [SEP]"))
# print(spam_df)
spam_df['token_ids'] = spam_df['text'].apply(lambda x: tokenizer.encode(x, truncation=True))
# print(spam_df)

checkmate_df['text'] = (checkmate_df['text'].apply(lambda x: "[CLS] " + x + " [SEP]"))
# print(checkmate_df)
checkmate_df['token_ids'] = checkmate_df['text'].apply(lambda x: tokenizer.encode(x, truncation=True))
# print(checkmate_df)

In [18]:
for i in spam_df['token_ids']:
    if (i) > 512:
        print('gth longer than 512')
for i in checkmate_df['token_ids']:
    if len(i) > 512:
        print('Length longer than 512')

In [20]:
# Creating segment ids for the sentence - since all belong to the same sentence it is all 1
checkmate_df['segment_ids'] = (checkmate_df['token_ids'].apply(lambda x: [1] * len(x)))
spam_df['segment_ids'] = (spam_df['token_ids'].apply(lambda x: [1] * len(x)))

In [24]:
temp = checkmate_df['segment_ids']
print(type(temp[0]))
print(temp[0])

<class 'list'>
[1, 1, 1, 1, 1]


In [25]:
# Turn all the list of token ids/segment ids into tensors
spam_df['tensor_token'] = spam_df.apply(lambda x: torch.tensor([x['token_ids']]), axis=1)
spam_df['tensor_segment'] = spam_df.apply(lambda x: torch.tensor([x['segment_ids']]), axis=1)

checkmate_df['tensor_token'] = checkmate_df.apply(lambda x: torch.tensor([x['token_ids']]), axis=1)
checkmate_df['tensor_segment'] = checkmate_df.apply(lambda x: torch.tensor([x['segment_ids']]), axis=1)

In [28]:
spam_df_train, spam_df_test = train_test_split(spam_df, test_size=0.8, stratify=spam_df['is_trivial'])
print(len(spam_df_train))
print(len(spam_df_test))

1114
4457


In [29]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [43]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    spam_df_train['hidden_states'] = spam_df_train.apply(lambda x: model(x['tensor_token'],x['tensor_segment'])[2], axis=1)
    spam_df_test['hidden_states'] = spam_df_test.apply(lambda x: model(x['tensor_token'],x['tensor_segment'])[2], axis=1)
    checkmate_df['hidden_states'] = checkmate_df.apply(lambda x: model(x['tensor_token'],x['tensor_segment'])[2], axis=1)

In [31]:
# `hidden_states` has shape [13 x 1 x number of tokens x 768]
# `token_vecs` is a tensor with shape [number of tokens x 768]
# [# layers, # batches, # tokens, # features]
spam_df_train['hidden_states'].head(10).apply(lambda x: print(x[-2][0].shape))

torch.Size([22, 768])
torch.Size([18, 768])
torch.Size([46, 768])
torch.Size([43, 768])
torch.Size([17, 768])
torch.Size([36, 768])
torch.Size([30, 768])
torch.Size([119, 768])
torch.Size([19, 768])
torch.Size([13, 768])


4624    None
3844    None
5175    None
4985    None
2730    None
4451    None
4278    None
2600    None
2459    None
4593    None
Name: hidden_states, dtype: object

In [44]:
# Get sentence embedding by taking the mean of feature tensor of all the tokens in the second last layer
spam_df_train['sentence_embedding'] = spam_df_train['hidden_states'].apply(lambda x: torch.mean(x[-2][0], dim=0))
spam_df_test['sentence_embedding'] = spam_df_test['hidden_states'].apply(lambda x: torch.mean(x[-2][0], dim=0))
checkmate_df['sentence_embedding'] = checkmate_df['hidden_states'].apply(lambda x: torch.mean(x[-2][0], dim=0))

In [45]:
# separate the sentence embeddings of non trivial and trivial messages
spam_df_train_not_trivial = spam_df_train[spam_df_train['is_trivial'] == False]
spam_df_train_trivial = spam_df_train[spam_df_train['is_trivial'] == True]

In [46]:
# create tensors from the series spam_df_train['sentence_embedding]
tensor_train_not_trivial = torch.stack([t for t in spam_df_train_not_trivial['sentence_embedding']])
tensor_train_trivial = torch.stack([t for t in spam_df_train_trivial['sentence_embedding']])

In [47]:
# create a representative tensor of size [1,768] for non-trivial and trivial
rep_not_trivial = torch.mean(tensor_train_not_trivial, dim=0)
rep_trivial = torch.mean(tensor_train_trivial, dim=0)

In [48]:
def classify_trivial(v):
    distance_from_not_trivial = cosine(rep_not_trivial, v)
    distance_from_trivial = cosine(rep_trivial, v)
    if distance_from_trivial > distance_from_not_trivial:
        return False
    else: 
        return True

### Evaluate
Get results of using the cosine similarity method on both spam_df_test and checkmate_df

In [51]:
spam_df_test['prediction'] = spam_df_test['sentence_embedding'].apply(lambda x: classify_trivial(x))
checkmate_df['prediction'] = checkmate_df['sentence_embedding'].apply(lambda x: classify_trivial(x))

#### Results for spam_df_test

In [52]:
spam_predicted_vs_output = pd.concat([spam_df_test['text'], spam_df_test['is_trivial'], spam_df_test['prediction']], axis=1)
# predicted_vs_output = predicted_vs_output.rename(columns={'is_trivial':'target', 'text':'prediction'})
spam_predicted_vs_output.columns = ['text', 'target','prediction']

print(spam_predicted_vs_output)

                                                   text  target  prediction
4397  [CLS] Good morning, im suffering from fever an...       1        True
2758    [CLS] Is ur paper today in e morn or aft? [SEP]       1        True
262   [CLS] Where are the garage keys? They aren't o...       1        True
5061  [CLS] URGENT! We are trying to contact U. Toda...       0       False
2617            [CLS] Aight, lemme know what's up [SEP]       1        True
...                                                 ...     ...         ...
3944                [CLS] Think + da. You wil do. [SEP]       1        True
3156  [CLS] Hey babe, my friend had to cancel, still...       1        True
3457             [CLS] I am on the way to ur home [SEP]       1        True
1937  [CLS] I am getting threats from your sales exe...       1        True
5139      [CLS] 88066 FROM 88066 LOST 3POUND HELP [SEP]       0       False

[4457 rows x 3 columns]


In [53]:
# Calculate the recall of this methodology
true_trivial = len(spam_predicted_vs_output[(spam_predicted_vs_output['target']==True) & (spam_predicted_vs_output['prediction']==True)])
true_non_trivial = len(spam_predicted_vs_output[(spam_predicted_vs_output['target']==False) & (spam_predicted_vs_output['prediction']==False)])

false_non_trivial = len(spam_predicted_vs_output[(spam_predicted_vs_output['target']==True) & (spam_predicted_vs_output['prediction']==False)])
false_trivial = len(spam_predicted_vs_output[(spam_predicted_vs_output['target']==False) & (spam_predicted_vs_output['prediction']==True)])

recall = true_non_trivial/(true_non_trivial+false_trivial)
accuracy = (true_trivial+true_non_trivial)/(true_trivial+true_non_trivial+false_trivial+false_non_trivial)

In [58]:
print(f'Recall is: {recall*100}%')
print(f'Accuracy is: {accuracy*100}%')

print(f'true_trivial is: {true_trivial}')
print(f'true_non_trivial is: {true_non_trivial}')
print(f'false_non_trivial is: {false_non_trivial}')
print(f'false_trivial is: {false_trivial}')

Recall is: 95.04950495049505%
Accuracy is: 97.24029616333857%
true_trivial is: 3758
true_non_trivial is: 576
false_non_trivial is: 93
false_trivial is: 30


#### Results for checkmate_df

In [59]:
checkmate_predicted_vs_output = pd.concat([checkmate_df['text'], checkmate_df['is_trivial'], checkmate_df['prediction'], checkmate_df['taggedCategory']], axis=1)
# predicted_vs_output = predicted_vs_output.rename(columns={'is_trivial':'target', 'text':'prediction'})
checkmate_predicted_vs_output.columns = ['text', 'target','prediction','category']

print(checkmate_predicted_vs_output)

                                                 text  target  prediction  \
0                                     [CLS] nan [SEP]   False        True   
1   [CLS] https://www.mas.gov.sg/news/media-releas...   False       False   
2   [CLS] TN 95546718362782 is out for del. Allow ...   False       False   
3   [CLS] 🚩🚩🚩 *"You flag, we check"* 🔍🔍🔍\n\nNot su...   False       False   
4   [CLS] https://form.gov.sg/63f594b42413ea001183...   False       False   
..                                                ...     ...         ...   
81  [CLS] [SHIN MIN CONTEST] Happycall Jumbo 双面锅等你...   False       False   
82  [CLS] Hello, sorry to bother you, I'm Nico fro...   False       False   
83  [CLS] Hello, my name is Sarah, from CME Group....   False        True   
84  [CLS] LTA: Notice As no valid E-tag detected i...   False       False   
85  [CLS] Excuse me, this is Stella, have you arra...   False        True   

             category  
0   Info/News/Opinion  
1   Info/News/Opinion  
2  

In [64]:
# Calculate the recall of this methodology
true_trivial = (checkmate_predicted_vs_output[(checkmate_predicted_vs_output['target']==True) & (checkmate_predicted_vs_output['prediction']==True)])
true_non_trivial = (checkmate_predicted_vs_output[(checkmate_predicted_vs_output['target']==False) & (checkmate_predicted_vs_output['prediction']==False)])

false_non_trivial = (checkmate_predicted_vs_output[(checkmate_predicted_vs_output['target']==True) & (checkmate_predicted_vs_output['prediction']==False)])
false_trivial = (checkmate_predicted_vs_output[(checkmate_predicted_vs_output['target']==False) & (checkmate_predicted_vs_output['prediction']==True)])

recall = len(true_non_trivial)/(len(true_non_trivial)+len(false_trivial))
accuracy = (len(true_trivial)+len(true_non_trivial))/(len(true_trivial)+len(true_non_trivial)+len(false_trivial)+len(false_non_trivial))

In [65]:
print(f'Recall is: {recall*100}%')
print(f'Accuracy is: {accuracy*100}%')

print(f'true_trivial is: {len(true_trivial)}')
print(f'true_non_trivial is: {len(true_non_trivial)}')
print(f'false_non_trivial is: {len(false_non_trivial)}')
print(f'false_trivial is: {len(false_trivial)}')

Recall is: 76.19047619047619%
Accuracy is: 81.3953488372093%
true_trivial is: 22
true_non_trivial is: 48
false_non_trivial is: 1
false_trivial is: 15


In [66]:
print(false_non_trivial)

                                                 text  target  prediction  \
72  [CLS] If you receive a scam message like this ...    True       False   

   category  
72  Trivial  


In [70]:
print(false_trivial)

                                                 text  target  prediction  \
0                                     [CLS] nan [SEP]   False        True   
8                                     [CLS] nan [SEP]   False        True   
12  [CLS] Hey Jolyn! We haven't seen you in the st...   False        True   
19  [CLS] Hi, last year one of my colleague met yo...   False        True   
22  [CLS] Hi! I pray this msg find you in good hea...   False        True   
33  [CLS] Ionizing radiation can affect the atoms ...   False        True   
37   [CLS] I can withdraw my CPF only at age 65 [SEP]   False        True   
38                 [CLS] 9/11 attack was a scam [SEP]   False        True   
41  [CLS] Hello \nsorry to bother you, is this Kev...   False        True   
48                                    [CLS] nan [SEP]   False        True   
51  [CLS] Hi, I'm. Felicia.  from *SG, Employment ...   False        True   
66  [CLS] Hi there, lovely Evening, we are current...   False        True   

## Learnings/Room for improvement

Out of the 15 wrongly classified messages:
2 - nan values
    - One is an image
    - One is an empty nan messsage

Solution: Automatically classify nan of images as non-trivial


Types of these messages
6 - unsure values
4 - scam
4 - info 
1 - legitimate

Too strict criteria of deciding non-trivial - most of the non-trivial predicitions are correct. Need to be more likely to predict messages as non-trivial instead
However, a higher percentage of the trivial predicitions were wrong.
Based on the requirements of the model, it is better to be strict on deciding criteria on whether something is deemed trivial

Next steps:

- Test different methods of determining sentence embedding
- Classificiation model layer on top of the sentence embedding