## MOUNT DRIVE DATA


In [130]:
from google.colab import drive
import numpy as np
import pandas as pd
import sys

# Mount Google Drive at /content/gdrive/ so the files stored there are reachable from this runtime.
drive.mount('/content/gdrive/')
# Add a Drive folder to the import path so Python modules kept there can be imported.
sys.path.append('/content/gdrive/My Drive/python')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [146]:
%cd /content/gdrive/My\ Drive/NLP\ TP\ II
!ls ./
!export CUDA_VISIBLE_DEVICES="0"

/content/gdrive/My Drive/NLP TP II
 acc_class.png	       modelo1.pt	 skip_s100.txt	   'TP II.ipynb'
 contagem_class.png    modelo2.pt	 skip_s100.txt.pt  'TP II NLP.ipynb'
 data.csv	       model.pt		 skip_s50.txt	    train_data.csv
 macmorpho-test.txt    perc_embedd.png	 skip_s50.txt.pt    tut1-model.pt
 macmorpho-train.txt   runs		 test_data.csv
[K     |████████████████████████████████| 81kB 2.5MB/s 
[31mERROR: google-colab 1.0.0 has requirement google-auth~=1.4.0, but you'll have google-auth 1.6.3 which is incompatible.[0m
[?25h

## IMPORTS E DEFINIÇÃO DOS HIPERPARÂMETROS

In [132]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
from torch.backends import cudnn
from torchtext.data import Field
from torchtext import data
from torchtext.vocab import Vectors
#from torch.utils.tensorboard import SummaryWriter

import tensorflow as tf
from tensorflow import summary
import datetime, os
%load_ext tensorboard

import matplotlib.pyplot as plt

import time
import random

from gensim.models import KeyedVectors
import pandas as pd

# Enable cuDNN's benchmark mode.
cudnn.benchmark = True

# Run configuration shared by every cell below.
args = dict(
    epoch_num=80,            # number of training epochs
    lr=3e-5,                 # learning rate
    weight_decay=5e-4,       # L2 penalty handed to the optimizer
    momentum=0.9,            # used as the first Adam beta
    num_workers=6,           # number of data-loader workers
    batch_size=64,           # mini-batch size
    clip_norm=6.0,           # upper limit on the gradient L2 norm
    min_freq=2.0,            # minimum frequency passed to TEXT.build_vocab
    gradient_clipping=True,  # whether train() clips gradients
    epoch_finetune=40,       # epoch at which the embedding layer is unfrozen
    num_layers=2,            # stacked LSTM layers
    seed=42,                 # seed for the train/validation split
    hidden_dim=100,          # LSTM hidden size (per direction)
    embedding_dim=100,       # embedding vector size
    path_embed='skip_s100.txt',  # pre-trained embedding file
    preprocess=False,        # regenerate the CSV files from the raw corpus
    train_model=True,        # train from scratch instead of loading model.pt
)

# Prefer the GPU whenever one is visible to PyTorch.
args['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(args['device'])

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
cuda


## PRÉ-PROCESSAMENTO DOS DADOS

In [0]:
def preprocess_data(path):
  """Parse a corpus of ``word_TAG`` tokens into parallel sentence strings.

  Each non-blank line of the file is one sentence whose whitespace-separated
  tokens have the form ``word_TAG``.

  Args:
    path: path of the raw corpus file (read as UTF-8).

  Returns:
    A tuple ``(text, labels, sizes)`` of equal-length lists:
      * text   -- lower-cased words of each sentence joined by single spaces
      * labels -- tags of each sentence joined by single spaces
      * sizes  -- number of tokens in each sentence

  Raises:
    ValueError: if a token contains no ``_`` separator.
  """
  sizes = []
  text = []
  labels = []

  with open(path, 'r', encoding='utf-8') as raw_train:
    for line_number, line in enumerate(raw_train, start=1):
      # split() with no argument drops the trailing newline and tolerates
      # repeated spaces, which split(' ') turned into empty tokens.
      tokens = line.split()
      if not tokens:
        continue  # a blank line carries no sentence

      words = []
      tags = []
      for token in tokens:
        # The tag follows the LAST underscore, so words that contain '_'
        # themselves keep their full form and still get the right tag.
        word, separator, tag = token.rpartition('_')
        if not separator:
          raise ValueError(
              f'{path}:{line_number}: token {token!r} has no "_TAG" suffix')
        words.append(word.lower())
        tags.append(tag)

      sizes.append(len(tokens))
      text.append(' '.join(words))
      labels.append(' '.join(tags))

  return text, labels, sizes

def save_data(output_file, text, labels):
  """Write the sentences and their tag strings to a CSV file.

  The file gets a ``text`` and a ``labels`` column; pandas also writes its
  row index as an unnamed first column.
  """
  frame = pd.DataFrame({'text': text, 'labels': labels})
  frame.to_csv(output_file)

In [0]:
# Regenerate the CSV files from the raw corpus only when requested in args;
# otherwise the CSV files already present in the working directory are used.
if args['preprocess']:
  train_text, train_labels, train_size = preprocess_data('macmorpho-train.txt')
  test_text, test_labels, test_size = preprocess_data('macmorpho-test.txt')
  save_data('train_data.csv', train_text, train_labels)
  save_data('test_data.csv', test_text, test_labels)

## CARREGANDO DADOS PARA TORCHTEXT

In [135]:
def tokenize(x):
  """Split an already tokenised, whitespace-separated string into tokens."""
  return x.split()

# TEXT also yields the sentence lengths, which the model needs for packing.
TEXT = Field(tokenize=tokenize, lower=True, include_lengths=True)

LABEL = Field(sequential=True, use_vocab=True, tokenize=tokenize)

# The first CSV column is the pandas row index written by save_data; skip it.
fields = [(None, None), ('text', TEXT), ('labels', LABEL)]

train_data, test_data = data.TabularDataset.splits(
  path = '.',
  train = 'train_data.csv',
  test = 'test_data.csv',
  format = 'csv',
  fields = fields,
  skip_header = True)

# random.seed() returns None, so passing its result as random_state only
# worked through the seeding side effect. Seed first, then pass the state.
random.seed(args['seed'])
train_data, valid_data = train_data.split(split_ratio=0.9,
                                          random_state=random.getstate())

print(vars(train_data.examples[0]))

print(f'TRAIN DATA LENGTH: {len(train_data)} | VALIDATION DATA LENGTH: {len(valid_data)}')

{'text': ['segundo', 'o', 'diretor', 'do', 'departamento', 'geral', 'de', 'polícia', 'especializada', ',', 'luiz', 'mariano', ',', 'o', 'terceiro', 'sequestro', 'pode', 'ter', 'sido', 'tramado', 'por', 'ex-empregados', 'do', 'empresário', 'ou', 'por', 'pessoas', 'ligadas', 'à', 'família', '.'], 'labels': ['PREP', 'ART', 'N', 'PREP+ART', 'NPROP', 'NPROP', 'NPROP', 'NPROP', 'NPROP', 'PU', 'NPROP', 'NPROP', 'PU', 'ART', 'ADJ', 'N', 'V', 'V', 'PCP', 'PCP', 'PREP', 'N', 'PREP+ART', 'N', 'KC', 'PREP', 'N', 'PCP', 'PREP+ART', 'N', 'PU']}
TRAIN DATA LENGTH: 34153 | VALIDATION DATA LENGTH: 3795


### Carregando Embedding pré-treinado

In [0]:
# Load the pre-trained word vectors from the file named in args['path_embed'],
# using the current directory as the cache location.
vectors = Vectors(name=args['path_embed'], cache='.')

### Construindo vocabulário

In [137]:
# Word vocabulary: built from the training split only, filtered by
# args['min_freq'] and paired with the pre-trained vectors; words without a
# pre-trained vector are initialised with torch.Tensor.normal_.
TEXT.build_vocab(train_data, 
                 min_freq = args['min_freq'],
                 vectors = vectors, 
                 unk_init = torch.Tensor.normal_)

# Tag vocabulary: also built from the training split only.
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 23245
Unique tokens in LABEL vocabulary: 28


In [138]:
def tag_percentage(tag_counts):
  """Attach each tag's share of the total count to its (tag, count) pair.

  Args:
    tag_counts: re-iterable sequence of ``(tag, count)`` pairs.

  Returns:
    List of ``(tag, count, fraction)`` tuples in the input order, where
    ``fraction`` is ``count`` divided by the sum of all counts.
  """
  grand_total = 0
  for _, occurrences in tag_counts:
    grand_total += occurrences

  with_share = []
  for label, occurrences in tag_counts:
    with_share.append((label, occurrences, occurrences/grand_total))
  return with_share

# Print tag, absolute count and share for every tag counted in LABEL.vocab.freqs,
# most frequent first.
for tag, count, percent in tag_percentage(LABEL.vocab.freqs.most_common()):
  print(f"{tag}\t{count}\t{percent*100:4.1f}%")

N	140430	21.4%
PU	95407	14.6%
V	68262	10.4%
NPROP	64649	 9.9%
PREP	63728	 9.7%
ART	47777	 7.3%
PREP+ART	41040	 6.3%
ADJ	29528	 4.5%
ADV	16582	 2.5%
KC	16090	 2.5%
PCP	13582	 2.1%
NUM	11649	 1.8%
PROADJ	10246	 1.6%
KS	8127	 1.2%
PRO-KS	7449	 1.1%
PROPESS	7393	 1.1%
PROSUB	4085	 0.6%
PDEN	3897	 0.6%
CUR	1826	 0.3%
PREP+PROADJ	1209	 0.2%
ADV-KS	708	 0.1%
PREP+PROSUB	459	 0.1%
PREP+PROPESS	345	 0.1%
IN	149	 0.0%
PREP+PRO-KS	138	 0.0%
PREP+ADV	46	 0.0%


### Construindo Bucket Iterator

In [0]:
# One iterator per split. Sentences are keyed by their length, and each batch is
# sorted internally (sort_within_batch), which the model's sequence packing
# relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key = lambda x:len(x.text),
    sort_within_batch = True,
    batch_size = args['batch_size'],
    device = args['device'])

### Definição da Arquitetura

In [0]:
class BiLstmTagger(nn.Module):
  """Bidirectional LSTM tagger on top of a pre-trained embedding table.

  The embedding weights are copied from ``embedding_weigths`` and frozen at
  construction time; the rows for the padding and unknown tokens are zeroed.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, 
               num_layers, batch_size, embedding_weigths, pad_idx, 
               unk_idx):
    """Build the embedding, the stacked BiLSTM and the output layer.

    Args:
      vocab_size: number of rows of the embedding table.
      embedding_dim: size of each embedding vector.
      hidden_dim: LSTM hidden size per direction.
      output_dim: number of output classes.
      num_layers: number of stacked LSTM layers.
      batch_size: stored on the instance; not used by forward().
      embedding_weigths: tensor copied into the embedding table.
      pad_idx: vocabulary index of the padding token.
      unk_idx: vocabulary index of the unknown token.
    """
    super(BiLstmTagger, self).__init__()
    
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.hidden_dim = hidden_dim

    self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    self.init_embedding(embedding_weigths, pad_idx, unk_idx, embedding_dim)

    self.bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True)
    # Both directions are concatenated, hence hidden_dim * 2 input features.
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    
    self.dropout = nn.Dropout(0.25)
    
  def init_embedding(self, weights, pad_idx, unk_idx, embedding_size):
    """Copy ``weights`` into the embedding table, zero pad/unk rows, freeze it."""
    self.embed.weight.data.copy_(weights)
    self.embed.weight.data[unk_idx] = torch.zeros(embedding_size)
    self.embed.weight.data[pad_idx] = torch.zeros(embedding_size)
    # Keep the pre-trained embeddings frozen until the network has learned.
    self.embed.weight.requires_grad=False

  def forward(self, text, lengths):
    """Return unnormalised class scores for every position of ``text``.

    Args:
      text: token indices with the sequence dimension first (the LSTM is not
        batch_first); sequences must be ordered by decreasing length.
      lengths: true length of each sequence (tensor or list).

    Returns:
      Tensor whose first dimension equals ``text.shape[0]`` and whose last
      dimension is ``output_dim``. No softmax is applied because
      CrossEntropyLoss already includes it.
    """
    # The LSTM initialises its internal states with zeros automatically.
    embedding = self.embed(text)

    # Pack the sequence before feeding the recurrent unit. Recent PyTorch
    # requires the lengths on the CPU even when the data lives on the GPU.
    cpu_lengths = torch.as_tensor(lengths).cpu()
    packed_embedding = nn.utils.rnn.pack_padded_sequence(embedding, cpu_lengths)
    
    # Recurrent forward pass; the final hidden/cell states are not needed.
    packed_output, _ = self.bilstm(packed_embedding)

    # total_length keeps the output aligned with the input (and so with the
    # tag tensor) even if the longest sequence is shorter than the padding.
    unpacked_output, _ = nn.utils.rnn.pad_packed_sequence(
        packed_output, total_length=text.shape[0])

    # Linear layer on top of the (dropped-out) recurrent features.
    out = self.fc(self.dropout(unpacked_output))
    
    return out

### Funções Auxiliares

In [0]:
def initialize_weights(*models):
  """Re-initialise the LSTM and Linear layers of every given model in place.

  Weights get Xavier-normal values and biases are set to zero; all other
  module types are left untouched.
  """
  for model in models:
    for layer in model.modules():
      if isinstance(layer, nn.LSTM):
        for param_name, tensor in layer.named_parameters():
          if 'weight' in param_name:
            nn.init.xavier_normal_(tensor.data)
          elif 'bias' in param_name:
            nn.init.constant_(tensor.data, 0)
        continue

      if isinstance(layer, nn.Linear):
        nn.init.xavier_normal_(layer.weight)
        if layer.bias is not None:
          layer.bias.data.zero_()


def count_parameters(model):
  """Return the number of scalar parameters of ``model`` that require grad."""
  trainable = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable += param.numel()
  return trainable

def categorical_accuracy(preds, y, tag_pad_idx):
  """Return the accuracy over the non-padding positions of one batch.

  If you get 8/10 right, this returns 0.8, NOT 8.

  Args:
    preds: 2-D tensor of scores, one row per position, one column per class.
    y: 1-D tensor of gold class indices, aligned with the rows of ``preds``.
    tag_pad_idx: class index marking padding; those positions are ignored.

  Returns:
    A float tensor of shape ``(1,)`` on the device of ``preds``. It is 0.0
    when the batch holds nothing but padding (instead of NaN from 0/0).
  """
  predicted = preds.argmax(dim = 1)  # index of the highest score per row
  keep = y != tag_pad_idx
  n_tokens = int(keep.sum().item())

  if n_tokens == 0:
    return torch.zeros(1, device=preds.device)

  n_correct = (predicted[keep] == y[keep]).sum()
  # Dividing by a Python int keeps everything on the device of `preds`; the
  # previous CPU FloatTensor denominator mixed devices when running on GPU.
  return n_correct.float().reshape(1) / n_tokens

def epoch_time(start_time, end_time):
  """Split the span between two timestamps (in seconds) into (minutes, seconds).

  Both values are truncated to integers.
  """
  span = end_time - start_time
  whole_minutes = int(span / 60)
  leftover_seconds = span - (whole_minutes * 60)
  return whole_minutes, int(leftover_seconds)

In [142]:
# Layer sizes follow from the vocabularies built earlier in the notebook.
vocab_size = len(TEXT.vocab)
output_dim = len(LABEL.vocab)

# Rows of the embedding table that the model zeroes out.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
pretrained_embeddings = TEXT.vocab.vectors

net = BiLstmTagger(
    vocab_size=vocab_size,
    embedding_dim=args['embedding_dim'],
    hidden_dim=args['hidden_dim'],
    output_dim=output_dim,
    num_layers=args['num_layers'],
    batch_size=args['batch_size'],
    embedding_weigths=pretrained_embeddings,
    pad_idx=PAD_IDX,
    unk_idx=UNK_IDX,
)
net = net.to(args['device'])

# Re-initialise the LSTM / Linear layers; the embedding table is not touched.
initialize_weights(net)
print(net)

print(f'The model has {count_parameters(net):,} trainable parameters')

BiLstmTagger(
  (embed): Embedding(23245, 100, padding_idx=1)
  (bilstm): LSTM(100, 100, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=200, out_features=28, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)
The model has 408,828 trainable parameters


In [143]:
# Adam over every model parameter; args['momentum'] is used as the first beta.
optimizer = optim.Adam(net.parameters(),
                       lr=args['lr'],
                       betas=(args['momentum'], 0.999),
                       weight_decay=args['weight_decay'])

# No optimizer state needs moving to the device here: the state of a freshly
# built optimizer is empty, so a loop over optimizer.state would never execute.

# Index of the padding tag; the loss skips these positions.
TAG_PAD_IDX = LABEL.vocab.stoi[LABEL.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX).to(args['device'])

# Timestamp string of this run.
current_time = str(datetime.datetime.now().timestamp())

AttributeError: ignored

### Treinando Modelo e Avaliando Acurácia no Teste

In [0]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
  """Train ``model`` for one full pass over ``iterator``.

  Reads the module-level ``args`` dict for ``'gradient_clipping'`` and
  ``'clip_norm'``.

  Args:
    model: network called as ``model(text, lengths)``.
    iterator: yields batches exposing ``batch.text`` (a ``(text, lengths)``
      pair) and ``batch.labels``.
    optimizer: optimizer stepped once per batch.
    criterion: loss called as ``criterion(predictions, tags)``.
    tag_pad_idx: tag index treated as padding by the accuracy metric.

  Returns:
    Tuple ``(mean loss per batch, mean accuracy per batch)``.
  """
  epoch_loss = 0
  epoch_acc = 0
  
  model.train()
  
  for batch in iterator:
    text, lengths = batch.text
    tags = batch.labels
    
    optimizer.zero_grad()
      
    predictions = model(text, lengths)
    
    # Collapse the leading dimensions: one row of scores / one tag per position.
    predictions = predictions.view(-1, predictions.shape[-1])
    tags = tags.view(-1)

    loss = criterion(predictions, tags)
            
    acc = categorical_accuracy(predictions, tags, tag_pad_idx)
    
    loss.backward()

    if args['gradient_clipping']:
      # Clip the gradients of the model actually being trained; this used to
      # reach for the global `net` regardless of the `model` argument.
      torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip_norm'])
    
    optimizer.step()
    
    epoch_loss += loss.item()
    epoch_acc += acc.item()
      
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, tag_pad_idx):
  """Measure loss and accuracy of ``model`` over ``iterator`` without training.

  The model is switched to eval mode and gradients are disabled.

  Returns:
    Tuple ``(mean loss per batch, mean accuracy per batch)``.
  """
  running_loss, running_acc = 0, 0

  model.eval()

  with torch.no_grad():
    for batch in iterator:
      tokens, token_lengths = batch.text
      gold = batch.labels

      scores = model(tokens, token_lengths)

      # One row of scores and one gold tag per position.
      flat_scores = scores.view(-1, scores.shape[-1])
      flat_gold = gold.view(-1)

      batch_loss = criterion(flat_scores, flat_gold)
      batch_acc = categorical_accuracy(flat_scores, flat_gold, tag_pad_idx)

      running_loss += batch_loss.item()
      running_acc += batch_acc.item()

  n_batches = len(iterator)
  return running_loss / n_batches, running_acc / n_batches

best_valid_loss = float('inf')

if args['train_model']:
  for epoch in range(args['epoch_num']):

    # From this epoch on, fine-tune the embedding layer as well and use a
    # stronger dropout.
    if epoch == args['epoch_finetune']:
      net.embed.weight.requires_grad=True
      net.dropout.p = 0.4

    start_time = time.time()
    
    train_loss, train_acc = train(net, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(net, valid_iterator, criterion, TAG_PAD_IDX)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(net.state_dict(), 'model.pt')

    # TensorBoard logging is disabled (no summary writer is created in the
    # cells shown here). To re-enable, create `train_summary_writer` and log:
    #   with train_summary_writer.as_default():
    #     tf.summary.scalar('train-loss', train_loss, step=epoch)
    #     tf.summary.scalar('train-acc', train_acc, step=epoch)
    #     tf.summary.scalar('validation-loss', valid_loss, step=epoch)
    #     tf.summary.scalar('validation-acc', valid_acc, step=epoch)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  # Report test metrics for the best-validation checkpoint, not for whatever
  # weights the last epoch happened to leave behind. Skip the reload when this
  # run never saved a checkpoint, so a stale model.pt is not picked up.
  if best_valid_loss < float('inf'):
    net.load_state_dict(torch.load('model.pt', map_location=args['device']))

  test_loss, test_acc = evaluate(net, test_iterator, criterion, TAG_PAD_IDX)

  print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

else:
  # NOTE(review): the original author marked this path "NOT WORKING PROPERLY";
  # the cause is not visible in this notebook -- TODO confirm.
  # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
  net.load_state_dict(torch.load('model.pt', map_location=args['device']))

In [0]:
%tensorboard --logdir logs/tensorboard

In [0]:
net.eval()

# One error counter per tag index. The indices come from itos so they are
# exactly the ones that can be mapped back to a tag name later.
errors_dict = {i : 0 for i in range(len(LABEL.vocab.itos))}
all_labels = []

# Padding positions carry no gold tag and must not be counted as mistakes.
pad_tag_idx = LABEL.vocab.stoi[LABEL.pad_token]

with torch.no_grad():

  for batch in test_iterator:

    text, lengths = batch.text
    tags = batch.labels
    
    predictions = net(text, lengths)
    
    # Flatten to one prediction / one gold tag per position.
    predictions = predictions.view(-1, predictions.shape[-1])
    preds = predictions.argmax(dim = 1).cpu().numpy()
    tags = tags.view(-1).cpu().numpy()

    all_labels.append(tags.tolist())

    # A mistake is a real (non-padding) position whose prediction differs.
    errors = (preds != tags) & (tags != pad_tag_idx)

    # Attribute each mistake to the GOLD tag: the counts are later divided by
    # the per-tag frequency in the test set to obtain a per-class accuracy,
    # which is only meaningful when both refer to the same (gold) class.
    for gold_idx in tags[errors]:
      errors_dict[int(gold_idx)] += 1
      

In [0]:
# Map tag indices back to tag names, leaving out the two special tokens.
special_tokens = ('<pad>', '<unk>')
named_errors_dict = {
    LABEL.vocab.itos[idx]: n_errors
    for idx, n_errors in errors_dict.items()
    if LABEL.vocab.itos[idx] not in special_tokens
}
x = named_errors_dict

# Rank the tags from most to fewest errors.
sorted_x = sorted(x.items(), key=lambda pair: pair[1], reverse=True)

names = [pair[0] for pair in sorted_x]
values = [pair[1] for pair in sorted_x]
sorted_x

In [0]:
plt.figure(figsize=(16,9))

x = list(range(len(values)))

plt.bar(x, height=values)
# Set positions, labels and text styling in ONE call: styling the tick labels
# before the ticks are fixed only affects the ticks existing at that moment.
plt.xticks(x, names, fontsize=14, rotation=90)
# Keep the 500-step grid up to 8000 and extend it when a bar is taller, so
# every bar stays inside the tick range.
y_stop = max(8500, int(np.ceil(max(values, default=0) / 500)) * 500 + 500)
plt.yticks(list(np.arange(0, y_stop, 500)))
plt.ylabel('Contagem de classificações erradas')
plt.xlabel('Classe')
plt.title('Contagem de classificações erradas por classe no conjunto de teste')
plt.savefig('contagem_class.png', dpi=300)

In [0]:
# Count the gold tags of the test set directly. Rebuilding LABEL's vocabulary
# on the test data would overwrite the tag <-> index mapping the trained model
# depends on and silently break any cell re-run afterwards.
test_tag_counts = {}
for example in test_data.examples:
  for tag in example.labels:
    test_tag_counts[tag] = test_tag_counts.get(tag, 0) + 1

# Most frequent first (ties keep first-seen order).
ranked_test_tags = sorted(test_tag_counts.items(), key=lambda kv: kv[1], reverse=True)

for tag, count, percent in tag_percentage(ranked_test_tags):
  print(f"{tag}\t{count}\t{percent*100:4.1f}%")

# Error rate per class, in the SAME order as `names` / `values`. Pairing by
# position with the frequency ranking mixed up classes, because `values` is
# ordered by error count and not by frequency.
perc_class = []
for name, n_errors in zip(names, values):
  n_tokens = test_tag_counts.get(name, 0)
  # A tag absent from the test set has no defined accuracy.
  perc_class.append(n_errors/n_tokens if n_tokens else float('nan'))

accuracy_per_class = np.around(1-np.array(perc_class), 2)

plt.figure(figsize=(16,9))

x = list(range(len(values)))

plt.bar(x, height=accuracy_per_class)
# Positions, labels and text styling set together so the styling sticks.
plt.xticks(x, names, fontsize=14, rotation=90)
plt.yticks(list(np.arange(0, 1.05, 0.05)))
plt.ylabel('Acurácia')
plt.xlabel('Classe')
plt.title("Acurácia por classe no conjunto de teste")
plt.savefig('acc_class.png', dpi=300)

In [0]:
# Inspect the same embedding file the tagger was built with (args['path_embed'])
# instead of a second hard-coded copy of its name.
model = KeyedVectors.load_word2vec_format(args['path_embed'])

In [0]:
# Reload the test CSV; its unnamed first column is used as the row index.
tdata = pd.read_csv('test_data.csv', index_col="Unnamed: 0")

In [0]:
# Per tag: number of test tokens (test_labels_dict) and how many of them have
# no vector in the pre-trained embedding model (nembedd_dict).
# The special tokens are excluded with a real membership test; the previous
# `name not in '<pad><unk>'` was a substring check on a single string.
special_tokens = ('<pad>', '<unk>')
nembedd_dict = {name: 0 for name in LABEL.vocab.itos if name not in special_tokens}
test_labels_dict = {name: 0 for name in LABEL.vocab.itos if name not in special_tokens}


for line, line_labels in zip(tdata['text'], tdata['labels']):
  words = line.split()
  labels = line_labels.split()

  for i, word in enumerate(words):
    label = labels[i]
    test_labels_dict[label] = test_labels_dict[label] + 1
    try:
      model.get_vector(word)
    except KeyError:
      # The word has no pre-trained vector.
      nembedd_dict[label] = nembedd_dict[label] + 1 

In [0]:
# Fraction of test tokens without a pre-trained vector, per tag, following the
# order of `names`.
embedd_stats = [
    nembedd_dict[tag_name]/test_labels_dict[tag_name]
    for tag_name in names
]

In [0]:
# Convert the fractions to percentages for plotting.
fault_per_class = np.around(np.array(embedd_stats)*100, 3)

plt.figure(figsize=(16,9))

x = list(range(len(fault_per_class)))

plt.bar(x, height=fault_per_class)
# Set positions, labels and text styling in ONE call: styling the tick labels
# before the ticks are fixed only affects the ticks existing at that moment.
plt.xticks(x, names, fontsize=14, rotation=90)
plt.yticks(list(np.arange(0, 70, 5)))
plt.ylabel('Porcentagem de amostras sem embedding no modelo pré-treinado')
plt.xlabel('Classe')
plt.title("Porcentagem de amostras sem embedding no modelo pré-treinado por classe")
plt.savefig('perc_embedd.png', dpi=300)