<a href="https://colab.research.google.com/github/zekaouinoureddine/Battle-of-Neighborhoods--Capstone/blob/master/BERT_Spanish_Corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers -q

[K     |████████████████████████████████| 2.1MB 8.2MB/s 
[K     |████████████████████████████████| 3.3MB 54.8MB/s 
[K     |████████████████████████████████| 901kB 37.3MB/s 
[?25h

In [2]:
# Linear algebra
import numpy as np

# Data processing
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import transformers
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [3]:
torch.cuda.empty_cache()

### Dataset

In [4]:
class BERTDatasetTraining:
  def __init__(self, desc1, desc2, targets):
    self.desc1 = desc1
    self.desc2 = desc2
    self.targets = targets
    self.tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", do_lower_case=True)

  def __len__(self):
    return len(self.desc1)

  def __getitem__(self, item):
    desc1 = str(self.desc1[item])
    desc2 = str(self.desc2[item])

    desc1 = " ".join(desc1.split())
    desc2 = " ".join(desc2.split())

    inputs = self.tokenizer.encode_plus(
        desc1,
        desc2,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
    )

    ids = inputs["input_ids"]
    token_type_ids = inputs["token_type_ids"]
    mask = inputs["attention_mask"]

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(int(self.targets[item]), dtype=torch.long) if self.targets is not None else 0,
        }

In [5]:
df = pd.read_csv("/content/DescPairs.csv")

dataset = BERTDatasetTraining(df["desc1"].values,
                              df["desc2"].values,
                              df["is_duplicate"].values,
                              )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=247723.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=134.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=310.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=486125.0, style=ProgressStyle(descripti…




In [6]:
# Split data into train and valid subsets
train_data, valid_data = train_test_split(dataset, test_size = 0.2)


# DataLoader
## Train data
train_dataloader = DataLoader(train_data,
                              batch_size = 8,
                              num_workers = 4)

## Valid data
valid_dataloader = DataLoader(valid_data,
                              batch_size = 4,
                              num_workers = 1)

  cpuset_checked))


### Model

In [7]:
# class BERTBaseUncased(nn.Module):
#   def __init__(self):
#     super(BERTBaseUncased, self).__init__()
#     self.bert = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", return_dict=False)
#     self.bert_drop = nn.Dropout(0.2)
#     self.output = nn.Linear(768, 1)

#   def forward(self, ids, mask, token_type_ids):
#     '''BERT Outputs
#        o1 : Sequence output, for every token you will get a vector of size x = (768 if bert-base else 1024)
#        o2 : Pooled output, for every sequence you will get a vector of size x = (768 if bert-base else 1024)

#     '''
#     _, o2 = self.bert(ids = ids, attention_mask=mask, token_type_ids=token_type_ids)
#     o2 = self.bert_drop(o2)
#     o2 = self.output(o2)

#     return o2

class BERTBaseUncased(nn.Module):
  def __init__(self):
    super(BERTBaseUncased, self).__init__()
    self.bert = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", return_dict=False)
    self.dropout = nn.Dropout(0.2)
    self.out = nn.Linear(768, 1)

  def forward(self, ids, mask, token_type_ids):
    _, pooled = self.bert(ids, attention_mask=mask,token_type_ids=token_type_ids)
    
    # add dropout to prevent overfitting.
    pooled = self.dropout(pooled) 
    return self.out(pooled)

In [8]:
# Get GPU device name; PyTorch will use this GPU 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BERTBaseUncased().to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=650.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439621341.0, style=ProgressStyle(descri…




Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Optimizer
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  correct_bias = False) # We chose to correct bias

# Total number of training steps is number of batchs
total_steps = len(train_dataloader)

# Create the learning rate Scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps=total_steps
)

In [10]:
# def loss_fn(outputs, targets):
#     outputs = torch.squeeze(outputs)
#     return nn.BCELoss()(nn.Sigmoid()(outputs.squeeze (1)), targets)


def loss_fn(outputs, labels):
  return nn.BCEWithLogitsLoss()(outputs, labels)

def train_fn(train_dataloader, model, optimizer, device, scheduler):
  model.train()

  for bi, d in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
    ids = d["ids"]
    token_type_ids = d["token_type_ids"]
    mask = d["mask"]
    labels = d["targets"]

    # Load data onto the GPU
    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype = torch.long)
    labels = labels.to(device, dtype = torch.float)

    # Clear out the gradients of the previous pass
    optimizer.zero_grad()

    # Forward & backward passes
    outputs = model(ids=ids,
                    mask=mask,
                    token_type_ids = token_type_ids
                    )
    
    loss = loss_fn(outputs, labels.float().unsqueeze(1))
    loss.backward()

    # Update the parameters 
    optimizer.step()

    # Track varaibles for monitoring progress
    # Update the learning rate
    scheduler.step()


def eval_fn(train_dataloader, model, device):
  model.eval()
  fin_labels = []
  fin_outputs = []
  with torch.no_grad():
    for bi, d in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
      ids = d["ids"]
      token_type_ids = d["token_type_ids"]
      mask = d["mask"]
      labels = d["targets"]

      # Load the data onto the GPU
      ids = ids.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.float)
      labels = labels.to(device, dtype=torch.float)

      # Forward pass
      outputs = model(ids=ids, 
                      token_type_ids=token_type_ids, 
                      mask=mask)
      fin_labels.extend(labels.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
  return fin_outputs, fin_labels

In [14]:
EPOCHS = 4
best_accuracy = 0
for epoch in range(EPOCHS):
  train_fn(train_dataloader, model, optimizer, device, scheduler)
  outputs, targets = eval_fn(valid_dataloader, model, device)
  outputs = np.array(outputs) >= 0.5

  # Calculate the loss
  # loss = loss_fn(outputs, targets.unsqueeze(1))

  # Calculate the accuracy for each pass
  accuracy = metrics.accuracy_score(targets, outputs)
  print(f"The Accuracy Score is = {accuracy}")
  
  # Save our best model having the best accuracy
  if accuracy > best_accuracy:
    torch.save(model.state_dict(), "IncBERTModel.bin")
    best_accuracy = accuracy

  cpuset_checked))
100%|██████████| 628/628 [08:22<00:00,  1.25it/s]
100%|██████████| 314/314 [00:48<00:00,  6.44it/s]


The Accuracy Score is = 0.7714968152866242


100%|██████████| 628/628 [08:22<00:00,  1.25it/s]
100%|██████████| 314/314 [00:48<00:00,  6.43it/s]

The Accuracy Score is = 0.7714968152866242



100%|██████████| 628/628 [08:22<00:00,  1.25it/s]
100%|██████████| 314/314 [00:48<00:00,  6.42it/s]


The Accuracy Score is = 0.7714968152866242


100%|██████████| 628/628 [08:22<00:00,  1.25it/s]
100%|██████████| 314/314 [00:48<00:00,  6.43it/s]

The Accuracy Score is = 0.7714968152866242





In [15]:
def eval(model, tokenizer, desc1, desc2, device):
  inputs = tokenizer.encode_plus(
      desc1,
      desc2,
      add_special_tokens=True,
  )

  ids = torch.tensor([inputs["input_ids"]], dtype=torch.long)
  mask = torch.tensor([inputs["attention_mask"]], dtype=torch.long)
  token_type_ids = torch.tensor([inputs["token_type_ids"]], dtype=torch.long)

  ids = ids.to(device, dtype=torch.long)
  mask = mask.to(device, dtype=torch.long)
  token_type_ids = token_type_ids.to(device, dtype=torch.long)

  with torch.no_grad():
    model.eval()
    output = model(
        ids=ids,
        mask=mask,
        token_type_ids=token_type_ids,
    )
    prob = nn.Sigmoid()(output).item()
  
  print("desc: {} and desc {} are {} with a score {}".format(desc1, desc2, "similar" if prob > 0.5 else "not similar", prob))

In [16]:
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", do_lower_case=True)

In [19]:
desc1 = "SRM muestra licitaciones"
desc2 = "Licitacin Visualizacin visualiza licitacin"

eval(model, tokenizer, desc1, desc2, device)

desc: SRM muestra licitaciones and desc Licitacin Visualizacin visualiza licitacin are similar with a score 0.8611737489700317


In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,desc1,desc2,is_duplicate
0,0,Hello need help member team cannot run report ...,Hello need help member team cannot run report ...,1
1,1,Reopen RITM,still not able run the ZKI ZKI MEN SAP using a...,1
2,2,Necesito todas aprobaciones compras SAP vayan ...,Suplencias Creacin suplencia usuario ausente,1
3,4,cuenta funcionalidad aprobador provisional,Suplencias Activar suplencia Creacin suplencia...,1
4,5,SRM muestra licitaciones,Licitacin Visualizacin visualiza licitacin,1


In [22]:
desc1 = "cuenta funcionalidad aprobador provisional"
desc2 = "Licitacin Visualizacin visualiza licitacin"

eval(model, tokenizer, desc1, desc2, device)

desc: cuenta funcionalidad aprobador provisional and desc Licitacin Visualizacin visualiza licitacin are not similar with a score 0.484678715467453
