In [216]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### **Libraries**

In [217]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

import math
import random

import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

import sklearn
from sklearn.metrics import roc_auc_score

import transformers
from transformers import BertTokenizer, AdamW

### **Data**

In [218]:
class SMSdataset(Dataset):
    def __init__(self, messages, label, tokenizer, max_len):
        self.messages = messages
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, idx):
        sms = str(self.messages[idx])
        label = self.label[idx]

        encoding = self.tokenizer.encode_plus(
            sms,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'sms': sms,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [219]:
df1 = pd.read_csv('https://raw.githubusercontent.com/animesharma3/SPAM-SMS-Detection/master/spam_sms_collection.csv')[['msg', 'spam']]
df1 = df1.rename(columns={'msg': 'text', 'spam': 'label'})

df2 = pd.read_csv('/content/train.csv').reset_index()
df2 = df2.drop(columns={'index', 'id'})

df3 = pd.read_csv('/content/test.csv').reset_index()
df3 = df3.drop(columns={'index', 'id'})

df = pd.concat((df1, df2, df3))
df = df.fillna(-1)
df = df.reset_index().drop(columns={'index'})
# df = df.sample(frac=1).reset_index().drop(columns={'index'})

In [220]:
df.head()

Unnamed: 0,text,label
0,go jurong point crazy available bugis n great ...,0.0
1,ok lar joking wif u oni,0.0
2,free entry wkly comp win fa cup final tkts st ...,1.0
3,u dun say early hor u c already say,0.0
4,nah think go usf life around though,0.0


In [221]:
max_len = 120
data = SMSdataset(df['text'].values,
                  df['label'].values.astype(np.compat.long),
                  tokenizer,
                  max_len=max_len
                  )

In [222]:
data.tokenizer.vocab_size

28996

### **BERT configuration**

In [223]:
class Config():
    def __init__(self, tok_size=None, 
                 d_model=256,
                 n_layers=12,
                 n_heads=8,
                 n_segments=2,
                 p_drop_hidden=0.3,
                 p_drop_attn=0.3,
                 max_len=256,
                 device=None):
        
        self.tok_size = tok_size
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_segments = n_segments
        self.p_drop_hidden = p_drop_hidden
        self.p_drop_attn = p_drop_attn
        self.max_len = max_len
        self.device = device
    
    def show_params(self):
        print('----------------------')
        print('token size (tok_size):', self.tok_size)
        print('hidden dimension (d_model):', self.d_model)
        print('number layers (n_layers):', self.n_layers)
        print('number heads (n_heads):', self.n_heads)
        print('number segments (n_segments):', self.n_segments)
        print('dropout prob hidden (p_drop_hidden):', self.p_drop_hidden)
        print('dropout prob attention (p_drop_attn):', self.p_drop_attn)
        print('max lenght (max_len):', self.max_len)
        print('device (device):', self.device)
        print('----------------------')

cfg = Config(tok_size=data.tokenizer.vocab_size, 
             d_model=300,
             n_layers=12,
             n_heads=6,
             n_segments=2,
             p_drop_hidden=0.1,
             p_drop_attn=0.1,
             max_len=max_len,
             device=torch.device('cuda' if torch.cuda.is_available() 
                                                 else 'cpu'))

cfg.show_params()

----------------------
token size (tok_size): 28996
hidden dimension (d_model): 300
number layers (n_layers): 12
number heads (n_heads): 6
number segments (n_segments): 2
dropout prob hidden (p_drop_hidden): 0.1
dropout prob attention (p_drop_attn): 0.1
max lenght (max_len): 120
device (device): cuda
----------------------


In [224]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()

        self.tok_embed = nn.Embedding(cfg.tok_size, cfg.d_model)
        self.pos_embed = nn.Embedding(cfg.max_len, cfg.d_model)

        self.positions = torch.ones(cfg.max_len, dtype=torch.long).to(cfg.device)

        self.layer_norm = nn.LayerNorm(cfg.d_model)

    def forward(self, tokens):
        
        tokens_embedding = self.tok_embed(tokens)
        positions_embedding = self.pos_embed(self.positions)

        return self.layer_norm(tokens_embedding + positions_embedding)

In [225]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = torch.nn.functional.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [226]:
class MultiheadAttention(nn.Module):

    def __init__(self):
        super().__init__()
        assert cfg.d_model % cfg.n_heads == 0

        self.embed_dim = cfg.d_model
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.d_model // cfg.n_heads

        self.qkv_proj = nn.Linear(cfg.d_model, 3*cfg.d_model)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model)

        self._reset_parameters()

    def _reset_parameters(self):
  
        nn.init.xavier_uniform_(self.qkv_proj.weight)
        self.qkv_proj.bias.data.fill_(0)
        nn.init.xavier_uniform_(self.o_proj.weight)
        self.o_proj.bias.data.fill_(0)

    def forward(self, x, mask=None, return_attention=False):
        batch_size, seq_length, _ = x.size()
        qkv = self.qkv_proj(x)

        qkv = qkv.reshape(batch_size, seq_length, self.n_heads, 3*self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)

        values, attention = scaled_dot_product(q, k, v, mask=mask)
        values = values.permute(0, 2, 1, 3)
        values = values.reshape(batch_size, seq_length, self.embed_dim)
        o = self.o_proj(values)

        if return_attention:
            return o, attention
        else:
            return o

In [227]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        
        self.attention = MultiheadAttention()

    def forward(self, embedding, attn_mask):
        
        output = self.attention.forward(embedding, attn_mask)

        return output

In [228]:
def gelu(x):
   return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [229]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()

        self.layers = nn.ModuleList([Encoder() for _ in range(cfg.n_layers)])
        self.embedding = Embedding()
        self.clf = nn.Linear(cfg.d_model, 2)
        self.act1 = nn.Sigmoid()

    def forward(self, tokens, attn_mask):
        output = self.embedding(tokens)

        for layer in self.layers:
            output = layer.forward(output, attn_mask)

        attn = output
        output = self.clf(torch.sum(self.act1(output), 1))
        
        return output, attn

In [230]:
model = BERT().to(cfg.device)

In [231]:
optimizer = optim.AdamW(model.parameters(), lr=0.00001)

criterion = nn.CrossEntropyLoss()
# import torch.optim.lr_scheduler.ExponentialLR as ExponentialLR
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.2)
# scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4)

In [232]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,070,402 trainable parameters


In [233]:
def masked(ids):
    masked_ids = []
    for i in range(1, len(ids[0]) - 1):
        if random.random() < 0.3 and ids[0][i] != 0:
            masked_ids.append(ids[0][i])
            ids[0][i] = 103
        else:
            masked_ids.append(0)
    return ids, masked_ids

In [234]:
max_epochs = 20

for epoch in range(max_epochs):
    epoch_loss = 0
    for i in range(len(data)):
        if data[i]['label'] != -1:
            if (i + 1) % 1000 == 0:
                print(i)

            optimizer.zero_grad()

            ids = data[i]['input_ids'].view(1, -1).to(cfg.device).clone()
            ids, masked_ids = masked(ids)

            out, attn = model.forward(ids, 
                data[i]['attention_mask'].type(torch.bool).to(cfg.device).view(-1, 1))

            loss = criterion(out, data[i]['label'].to(cfg.device).view(-1))

            

            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
    # scheduler.step()
    # print(scheduler.get_last_lr())

    print('epoch:', epoch)
    print('loss:', epoch_loss / len(data))

999
1999
2999
3999
4999
5999
6999
7999
8999
epoch: 0
loss: 0.3268008680670904
999
1999
2999
3999
4999
5999
6999
7999
8999
epoch: 1
loss: 0.18081238640706201
999
1999
2999
3999
4999
5999
6999
7999
8999
epoch: 2
loss: 0.1268891659088191


KeyboardInterrupt: ignored

In [235]:
preds = []

model.eval()
for i in range(7000, len(data)):
    if data[i]['label'] == -1:
        out, attn = model.forward(data[i]['input_ids'].view(1, -1).to(cfg.device), 
            data[i]['attention_mask'].type(torch.bool).to(cfg.device).view(-1, 1))
        out = out.argmax()
        preds.append(out.detach().item())

real = pd.read_csv('/content/preds_with_sklearn.csv')
roc_auc_score(real['Predicted'].values, preds)

0.9055301288447355

In [236]:
ans = pd.DataFrame({'Id': real['Id'].values, 'Predicted': preds})

In [237]:
ans.to_csv('ans.csv', index = False)