In [1]:
# based on https://github.com/dnanhkhoa/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py

In [2]:
import os
import time
import copy
import gensim, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gensim.parsing.preprocessing as gsp

from gensim import utils
from unidecode import unidecode
from nltk.corpus import stopwords

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from pytorch_pretrained_bert import BertConfig, BertTokenizer, BertModel, BertForMaskedLM

from transformers import AutoModel, AutoTokenizer


from PIL import Image
from random import randrange

# from __future__ import print_function, division

I1021 09:10:55.916649 16096 file_utils.py:41] PyTorch version 1.9.0+cpu available.
I1021 09:11:01.964714 16096 file_utils.py:57] TensorFlow version 2.1.0 available.


In [3]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

I1021 09:11:08.451135 16096 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/neuralmind/bert-base-portuguese-cased/config.json from cache at C:\Users\Windows\.cache\torch\transformers\aac3429673975db22f5d8a9202bc6a8983145bbd621577c9f2f62bee7fd02934.c6449db73a9350063f76a64baf5b26ca3759c9435babbd865baa989b009eb662
I1021 09:11:08.453136 16096 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encod

In [10]:
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias
        

class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier. Default = 2.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_labels].
    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    num_labels = 2
    model = BertForSequenceClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, num_labels=2):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.dropout = nn.Dropout(0.15)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True
            
class TweetDataset(Dataset):
    def __init__(self,x_y_list, transform=None):
        
        self.x_y_list = x_y_list
        self.transform = transform
        
    def __getitem__(self,index):
        
        tokenized_review = tokenizer.tokenize(self.x_y_list[0][index])
        
        if len(tokenized_review) > max_seq_length:
            tokenized_review = tokenized_review[:max_seq_length]
            
        ids_review  = tokenizer.convert_tokens_to_ids(tokenized_review)

        padding = [0] * (max_seq_length - len(ids_review))
        
        ids_review += padding
        
        assert len(ids_review) == max_seq_length
        
        #print(ids_review)
        ids_review = torch.tensor(ids_review)
        
        labels = self.x_y_list[1][index] # color        
        list_of_labels = [torch.from_numpy(np.array(labels))]
        
        
        return ids_review, list_of_labels[0]
    
    def __len__(self):
        return len(self.x_y_list[0])            

In [11]:
config = BertConfig(vocab_size_or_config_json_file=29794, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

num_labels = 2
max_seq_length = 32

In [12]:
def clean_text(s):
    """
        Simple preprocessing. 
        We found out that taking to much time preprocessing tweets decreased the classification performance.
        This needs further investigation, but an possible explanation would be that with preprocessing we lose
        too much information.
    """
    
    s = str(s)
    s = unidecode(s)
    
    return s

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device, torch.cuda.is_available())

cpu False


In [14]:
def train_model(model, criterion, optimizer, num_epochs=25):
    since = time.time()
    
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100
    best_acc = 0
    acc_train = 0
    acc_train_min = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                #scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            
            corrects = 0
            
            
            # Iterate over data.
            for inputs, vec in dataloaders_dict[phase]:
                inputs = inputs.to(device) 

                vec = vec.to(device)
                
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)

                    outputs = F.softmax(outputs,dim=1)
                    
                    loss = criterion(outputs, torch.max(vec.float(), 1)[1])
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)

                
                corrects += torch.sum(torch.max(outputs, 1)[1] == torch.max(vec, 1)[1])

                
            epoch_loss = running_loss / dataset_sizes[phase]

            
            acc = corrects.double() / dataset_sizes[phase]
            if phase == 'val':
                epoch_acc = acc
            else:
                acc_train = acc

            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} acc: {:.4f}'.format(phase, acc))

#             if phase == 'val' and epoch_loss < best_loss:
            
            if acc_train >= acc_train_min and phase == 'val' and (epoch_acc > best_acc or epoch_acc == best_acc and epoch_loss < best_loss):
            
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_final.pth')
                

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [16]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score

import pandas as pd


def get_dataloaders(X_train, X_test, y_train, y_test):
    
    X_train = X_train.tolist()
    X_test = X_test.tolist()

    y_train = pd.get_dummies(y_train).values.tolist()
    y_test = pd.get_dummies(y_test).values.tolist()

    train_lists = [X_train, y_train]
    test_lists = [X_test, y_test]

    training_dataset = TweetDataset(x_y_list = train_lists )

    test_dataset = TweetDataset(x_y_list = test_lists )

    dataloaders_dict = {
        'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
        'val':torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    }
    dataset_sizes = {'train':len(train_lists[0]),
                    'val':len(test_lists[0])}
    
    
    return [dataloaders_dict, dataset_sizes]

lrlast = .0001
lrmain = .00001
batch_size = 32
epochs = 1

dat = pd.read_csv('final_train.csv')
dat = dat.dropna()

X = dat['tweet'].to_numpy()
y = dat['label'].to_numpy()

acc_values = []
f1_scores = []

kf = KFold(n_splits=3)
k = 0
for train_index, test_index in kf.split(X):
    print("CROSS VALIDATION K = %d" % (k))
    k += 1
    
    X_train = X[train_index]
    y_train = y[train_index]
    
    X_test = X[test_index]
    y_test = y[test_index]
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
    
    model = BertForSequenceClassification(num_labels)
    model.to(device)
    
    optim1 = optim.Adam([
            {"params":model.bert.parameters(), "lr": lrmain},
            {"params":model.classifier.parameters(), "lr": lrlast},

    ])


    optimizer_ft = optim1
    criterion = nn.CrossEntropyLoss()

    dataloaders_dict, dataset_sizes = get_dataloaders(X_train, X_val, y_train, y_val)
    
    
    
    model_kfold = train_model(model, criterion, optimizer_ft, num_epochs=epochs)
    model_kfold.to(device)
    model_kfold.eval()
    
    
    
    """
        Perform predictions
    """
    
    sentences = X_test
    labels = y_test
    
    input_ids = []
    attention_masks = []
    
    for sent in sentences:
        
        encoded_dict = tokenizer.encode_plus(sent, add_special_tokens=True, max_length=64,
                                             pad_to_max_length=True, return_attention_mask=True,
                                             return_tensors='pt')
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler,
                                       batch_size=batch_size)
    
    
    predictions, true_labels = [], []
    
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            
        logits = outputs
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        
        predictions.append(logits)
        true_labels.append(label_ids)
        
    predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
    true_labels = np.concatenate(true_labels, axis=0)
    acc = accuracy_score(predictions, true_labels)
    f1s = f1_score(predictions, true_labels)
    
    print("\n>> TEST -  ACC: %.3f, F1-Score: %.3f" % (acc, f1s))
    
    acc_values.append(acc)
    f1_scores.append(f1s)
    print("\n\n")





CROSS VALIDATION K = 0


I1021 09:41:06.483866 16096 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/neuralmind/bert-base-portuguese-cased/config.json from cache at C:\Users\Windows\.cache\torch\transformers\aac3429673975db22f5d8a9202bc6a8983145bbd621577c9f2f62bee7fd02934.c6449db73a9350063f76a64baf5b26ca3759c9435babbd865baa989b009eb662
I1021 09:41:06.486843 16096 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encod

starting
Epoch 0/0
----------
train total loss: 0.5709 
train acc: 0.7209
val total loss: 0.5075 
val acc: 0.8078
saving with loss of 0.5075496615103956 improved over previous 100

Training complete in 10m 22s
Best val Acc: 0.507550

>> TEST -  ACC: 0.713, F1-Score: 0.770



CROSS VALIDATION K = 1


I1021 09:53:27.584227 16096 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/neuralmind/bert-base-portuguese-cased/config.json from cache at C:\Users\Windows\.cache\torch\transformers\aac3429673975db22f5d8a9202bc6a8983145bbd621577c9f2f62bee7fd02934.c6449db73a9350063f76a64baf5b26ca3759c9435babbd865baa989b009eb662
I1021 09:53:27.584227 16096 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encod

starting
Epoch 0/0
----------
train total loss: 0.5737 
train acc: 0.7199
val total loss: 0.5264 
val acc: 0.7795
saving with loss of 0.5263782143592834 improved over previous 100

Training complete in 10m 6s
Best val Acc: 0.526378

>> TEST -  ACC: 0.785, F1-Score: 0.818



CROSS VALIDATION K = 2


I1021 10:05:32.709018 16096 configuration_utils.py:283] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/neuralmind/bert-base-portuguese-cased/config.json from cache at C:\Users\Windows\.cache\torch\transformers\aac3429673975db22f5d8a9202bc6a8983145bbd621577c9f2f62bee7fd02934.c6449db73a9350063f76a64baf5b26ca3759c9435babbd865baa989b009eb662
I1021 10:05:32.712074 16096 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encod

starting
Epoch 0/0
----------
train total loss: 0.5754 
train acc: 0.7199
val total loss: 0.5165 
val acc: 0.7854
saving with loss of 0.5165004179162799 improved over previous 100

Training complete in 10m 25s
Best val Acc: 0.516500

>> TEST -  ACC: 0.805, F1-Score: 0.828





In [None]:
pd_results = pd.DataFrame({
    'acc': acc_values,
    'f1_score': f1_scores
})
pd_results.to_csv('results/BERT.csv')