<a href="https://colab.research.google.com/github/viniciusacosta/olist-nlp/blob/main/train_nlp_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Importing libraries for datasets manipulation and exploration
import pandas as pd
import numpy as np
import csv

# Importing libraries for NLP Modeling
import os
import torch
from torch import optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

# Importing libraries for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Import other usefull libraries
from tqdm.notebook import tqdm
from random import shuffle
import math

# Importing libraries to hide Future Warnings
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
!wget https://raw.githubusercontent.com/b2wdigital/b2w-reviews01/master/B2W-Reviews01.csv

--2022-11-24 12:32:05--  https://raw.githubusercontent.com/b2wdigital/b2w-reviews01/master/B2W-Reviews01.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49453175 (47M) [text/plain]
Saving to: ‘B2W-Reviews01.csv’


2022-11-24 12:32:09 (402 MB/s) - ‘B2W-Reviews01.csv’ saved [49453175/49453175]



In [4]:
# Loading and Preparing data
with open('B2W-Reviews01.csv', encoding='Latin1') as f:
  reader = csv.reader(f, delimiter=',', quotechar='\"')
  corpus = list(reader)

  header, corpus = corpus[0], corpus[1:]

corpus = corpus[:20000]

reviews = [w[10] for w in corpus]
ratings = [2 if w[8] in ['4', '5'] else 0 if w[8] in ['1', '2'] else 1 for w in corpus]
data = [{'X': review, 'y': rating } for (review, rating) in zip(reviews, ratings)]

In [5]:
# Splitting data into train and test
size = int(len(data) * 0.2)
treino = data[size:]
teste = data[:size]

len(treino), len(teste)

(16000, 4000)

In [6]:
# Neural Network Parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nclasses = 3
nepochs = 5
batch_size = 5
batch_status = 32
learning_rate = [1e-5, 1e-6, 1e-7]
early_stop = 2

max_length = 180
write_path = 'model'

In [7]:
# Splitting data into batches
traindata = DataLoader(treino, batch_size=batch_size, shuffle=True)
testdata = DataLoader(teste, batch_size=batch_size, shuffle=True)

In [8]:
# Initializing Tokenizer, BERT Model and Optimizer
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=nclasses).to(device)

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [9]:
# Creating evaluation function
def evaluate(model, testdata):
  model.eval()
  y_real, y_pred = [], []
  for inp in testdata:
    texts, labels = inp['X'], inp['y']
    
    # classifying
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
    output = model(**inputs)
                
    pred_labels = torch.argmax(output.logits, 1)
    
    y_real.extend(labels.tolist())
    y_pred.extend(pred_labels.tolist())
    
  # Model evaluation metrics
  f1        = f1_score(y_real, y_pred, average='weighted') # Determines the harmonic mean between model accuracy and recallo
  acc       = accuracy_score(y_real, y_pred)               # Determines the predicted overall accuracy of the model
  precision = precision_score(y_real, y_pred, average='weighted')              # Determina a proporção de classificações positivas, que realmente são positivas
  recall    = recall_score(y_real, y_pred, average='weighted')                 # Determina a proporção de registros positivos que foram classificados pelo algoritmo  como positivos

  return f1, acc, precision, recall

In [11]:
max_f1 = 0
for rate in learning_rate: 
    print(f"################ Learning Rate: {rate}  ################")
    optimizer = optim.AdamW(model.parameters(), lr=rate)
    repeat = 0

    for epoch in tqdm(range(nepochs), desc='Train', unit='steps', position=0, leave=True):

          model.train()
          f1, acc, precision, recall = evaluate(model, testdata)

          for inp in traindata:
            texts, labels = inp['X'], inp['y']

            # Classifying
            inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
            output = model(**inputs, labels=labels.to(device))

            # Calculate loss
            loss = output.loss

            # Backpropagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

          # Saving Metrics
          f1, acc, precision, recall = evaluate(model, testdata)

          # Saving Best Model
          if f1 > max_f1:
            model.save_pretrained(f'/gdrive/MyDrive/data_science/NLP/nlp-model: {rate}')
            max_f1 = f1
            repeat = 0
            print('F1:', f1, ' | Accuracy:', acc, ' | Precision:',precision, ' | Recall:', recall)
            print('Saving best model...\n')  
          else:
            repeat += 1

          # Breaking Loop
          if repeat == early_stop:
            break

################ Learning Rate: 1e-05  ################


Train:   0%|          | 0/5 [00:00<?, ?steps/s]

F1: 0.8305103316612905  | Accuracy: 0.8285  | Precision: 0.8337313973904343  | Recall: 0.8285
Saving best model...

################ Learning Rate: 1e-06  ################


Train:   0%|          | 0/5 [00:00<?, ?steps/s]

################ Learning Rate: 1e-07  ################


Train:   0%|          | 0/5 [00:00<?, ?steps/s]

F1: 0.8309625597784164  | Accuracy: 0.83475  | Precision: 0.8281582434202217  | Recall: 0.83475
Saving best model...

F1: 0.8312076720180308  | Accuracy: 0.83425  | Precision: 0.8289370629213093  | Recall: 0.83425
Saving best model...

F1: 0.8319881798723731  | Accuracy: 0.835  | Precision: 0.8297518470327669  | Recall: 0.835
Saving best model...

F1: 0.8321150709454994  | Accuracy: 0.83475  | Precision: 0.8302057645837393  | Recall: 0.83475
Saving best model...

