# 0 Goal
- Using Hugging Face Dataset
- Follow EducativeIO StepByStep class/flow to train a multi-class classifier

# 1 Imports


In [1]:
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
#import torch.functional as F
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

import os
import json


from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc

from argparse import Namespace

# Root directory of the local PyTorchNLPBook checkout.
# The default is the original author's machine-specific path; set the
# PYTORCH_NLP_BOOK_DIR environment variable to point elsewhere without editing the notebook.
dir_base = os.environ.get(
    "PYTORCH_NLP_BOOK_DIR",
    "/Users/chang/Documents/dev/git/opus/04_ml/general/pytorch/PyTorchNLPBook",
)






# 2 HuggingFace 
- 1. Load dataset from HuggingFace
- 2. Play with some HF Dataset operators
- 3. Convert to Torch Dataset and DataLoaders

### 2.1 Load Dataset from HuggingFace
- https://huggingface.co/datasets/viewer/

In [2]:
from datasets import list_datasets
# Retrieves the list of available Hugging Face datasets; the result is only kept for browsing
# and is not used by the cells shown below.
# NOTE(review): `datasets.list_datasets` appears to be deprecated/removed in recent `datasets`
# releases in favour of `huggingface_hub.list_datasets` - confirm against the installed version.
datasets_list = list_datasets()
#datasets_list

In [3]:
# Dataset browser: https://huggingface.co/datasets/viewer/
from datasets import load_dataset
# Passing a list to `split` returns one Dataset per requested split, in the same order,
# so the result unpacks directly into the train and test sets.
dataset_agnews_train, dataset_agnews_test = load_dataset('ag_news', split=['train', 'test'])
# Alternative (disabled): a 10% slice of the Amazon reviews dataset, which is about 1.7GB in full.
#dataset_amazon_review = load_dataset('amazon_us_reviews', 'Wireless_v1_00', split='train[:10%]') # take only 10% of data; amazonreview is 1.7GB


Using custom data configuration default
Reusing dataset ag_news (/Users/chang/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

### 2.2 How to manipulate HF Dataset
- https://huggingface.co/docs/datasets/processing.html

#### Metadata

In [4]:
# Show the Python type of the training split, then its summary repr (feature names and row count).
display(type(dataset_agnews_train))
dataset_agnews_train


datasets.arrow_dataset.Dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [18]:
# Column schema of the training split: a string `text` column and a 4-class `label` column.
dataset_agnews_train.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=4, names=['World', 'Sports', 'Business', 'Sci/Tech'], names_file=None, id=None)}

In [10]:
# Slicing a Dataset yields a dict mapping each column name to a list of values (here one row).
display(dataset_agnews_train[0:1])

{'text': ["Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."],
 'label': [2]}

In [42]:
# Feature descriptor of a single column, looked up by column name.
dataset_agnews_train.features['text']

Value(dtype='string', id=None)

#### Split dataset

In [None]:
# NOT in place: train_test_split returns a new DatasetDict with 'train' and 'test' splits and
# leaves dataset_agnews_train untouched. The result is not assigned here, so it is only displayed;
# bind it to a name to actually use the 80/20 split.
dataset_agnews_train.train_test_split(test_size=0.2)

#### Filter

In [29]:
# Keep only the rows whose text starts with 'Wall Street' and show the first match.
# The filtered dataset is not assigned, so dataset_agnews_train itself is not rebound.
dataset_agnews_train.filter(lambda e: e['text'].startswith('Wall Street'))[:1]

Loading cached processed dataset at /Users/chang/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-ca8d37282eca128f.arrow


{'text': ['Wall Street to Open Little Changed  NEW YORK (Reuters) - Wall Street is seen opening little  changed on Monday as crude prices remain high, but insurers may  dip on worries about their potential liabilities after a  hurricane struck Florida on Friday.'],
 'label': [2]}

### 2.3 Convert HF Dataset with Pytorch Dataset and DataLoader
- https://huggingface.co/docs/datasets/torch_tensorflow.html

- One can convert an HF Dataset to different formats (e.g. Torch, TF, pandas)

In [5]:
# Tokenize the Hugging Face datasets with the BERT tokenizer before handing them to PyTorch.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Dataset.map() expects a function that takes a dict and returns a dict of new/updated columns.
# With batched=True the input dict maps each column name to a LIST of values (one batch),
# so e['text'] below is a list of strings and the tokenizer processes the whole batch at once.
def preprocess(e):
    """Tokenize the `text` column of a batch, truncating and padding to the model max length.

    Returns the tokenizer output, whose entries become new dataset columns
    (see the `features` list displayed for `dataset_dev`).
    """
    return tokenizer(e['text'], truncation=True, padding='max_length')

# Both splits go through the SAME function so their tokenization settings cannot drift apart.
dataset_dev = dataset_agnews_train.map(preprocess, batched=True, num_proc=2)
dataset_test = dataset_agnews_test.map(preprocess, batched=True)


Loading cached processed dataset at /Users/chang/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-1ea5f6dcf9161ad3.arrow


In [6]:
# map() returns the same Dataset type as its input (see the displayed output).
type(dataset_dev)

datasets.arrow_dataset.Dataset

In [13]:

# Expose only the tensor-convertible columns, returned as torch tensors.
# The raw 'text' column stays stored in the dataset but is no longer returned when indexing.
# NOTE: set_format modifies the two datasets in place.
torch_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
dataset_dev.set_format(type='torch', columns=torch_columns)
dataset_test.set_format(type='torch', columns=torch_columns)

# Torch DataLoaders over the formatted datasets.
# The dev loader is the one used for training, so it is shuffled every epoch;
# the test loader keeps its order so evaluation is repeatable.
dataloader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=32, shuffle=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=32)

In [8]:
# Summary of the tokenized training set: the tokenizer columns now sit next to 'text' and 'label'.
dataset_dev

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
    num_rows: 120000
})

# 2 Educative IO Torch Pipeline
- References:
    * Notes on Google Cloud Pytorch
    * Notebook: /Users/chang/Documents/dev/git/opus/04_ml/general/pytorch/DeepLearningWithPytorch[EducativeIO]/Chap5_ClassificationProb.ipynb

### 2.1 Class

In [9]:
import numpy as np
import datetime
import torch
import torch.optim as optim
import torch.nn as nn
# `F` is the conventional alias for torch.nn.functional (activations, losses, ...).
# Importing torch.functional here would silently rebind the `F` set up by the first import cell.
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter

class StepByStep(object):
    """Small training harness that bundles a model, a loss function and an optimizer.

    Typical use: construct it, call `set_loaders` (and optionally `set_tensorboard`),
    then call `train`. The average loss of every epoch is accumulated in `self.losses`
    and `self.val_losses`, and the whole state can be checkpointed and restored.

    Data contract: each data loader must yield `(x_batch, y_batch)` pairs of tensors.
    `x_batch` is passed to the model as its single positional argument and the model
    output is compared with `y_batch` by `loss_fn`.
    """

    def __init__(self, model, loss_fn, optimizer):
        """Store the training components and move the model to the default device.

        Args:
            model: the torch module to train.
            loss_fn: callable invoked as `loss_fn(predictions, targets)`.
            optimizer: optimizer built over the parameters of `model`.
        """
        # We start by storing the arguments as attributes to use them later
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Let's send the model to the specified device right away
        self.model.to(self.device)

        # These attributes are not known at creation time, so they start as None
        # and are filled in by set_loaders / set_tensorboard
        self.train_loader = None
        self.val_loader = None
        self.writer = None

        # These attributes are computed internally
        self.losses = []
        self.val_losses = []
        self.total_epochs = 0

        # Step functions are closures over self: they take only (x, y) and read
        # the model, loss function and optimizer from the attributes above
        self.train_step = self._make_train_step()
        self.val_step = self._make_val_step()

    def to(self, device):
        """Switch to a different device and move the model there.

        The device is also used for every mini-batch in later calls.
        """
        self.device = device
        self.model.to(self.device)

    def set_loaders(self, train_loader, val_loader=None):
        """Register the training loader and, optionally, the validation loader."""
        self.train_loader = train_loader
        self.val_loader = val_loader

    def set_tensorboard(self, name, folder='runs'):
        """Create a SummaryWriter that logs to `<folder>/<name>_<timestamp>`."""
        suffix = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        self.writer = SummaryWriter('{}/{}_{}'.format(
            folder, name, suffix
        ))

    def _make_train_step(self):
        """Build the function that performs one optimisation step on a mini-batch."""
        def perform_train_step(x, y):
            # Sets model to TRAIN mode
            self.model.train()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # Step 3 - Computes gradients of the loss w.r.t. the model parameters
            loss.backward()
            # Step 4 - Updates parameters, then clears the gradients so they do
            # not accumulate into the next step
            self.optimizer.step()
            self.optimizer.zero_grad()

            # Returns the loss as a plain Python number
            return loss.item()

        # Returns the function that will be called inside the train loop
        return perform_train_step

    def _make_val_step(self):
        """Build the function that evaluates the loss on one validation mini-batch."""
        def perform_val_step(x, y):
            # Sets model to EVAL mode
            self.model.eval()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # No backward pass / optimizer step: parameters are not updated
            # during evaluation
            return loss.item()

        return perform_val_step

    def _mini_batch(self, validation=False):
        """Run one full pass over a loader and return the mean mini-batch loss.

        Args:
            validation: when True use the validation loader and validation step,
                otherwise the training loader and training step.

        Returns:
            The mean loss as a built-in float, or None when the selected loader
            has not been set.
        """
        if validation:
            data_loader = self.val_loader
            step = self.val_step
        else:
            data_loader = self.train_loader
            step = self.train_step

        if data_loader is None:
            return None

        # With the loader and step function chosen, the loop is the same for
        # training and validation
        mini_batch_losses = []
        for x_batch, y_batch in data_loader:
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            mini_batch_loss = step(x_batch, y_batch)
            mini_batch_losses.append(mini_batch_loss)

        # Convert numpy's float64 scalar to a built-in float so the loss history
        # stored in checkpoints contains only plain Python numbers
        return float(np.mean(mini_batch_losses))

    def set_seed(self, seed=42):
        """Seed torch and numpy and ask cuDNN for deterministic behaviour."""
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)

    def train(self, n_epochs, seed=42):
        """Train for `n_epochs` epochs, recording training and validation losses.

        `self.val_losses` receives None for every epoch in which no validation
        loader is set. If a SummaryWriter was configured, both losses are logged
        under the main tag "loss" and the writer is closed at the end.
        """
        # To ensure reproducibility of the training process
        self.set_seed(seed)

        for epoch in range(n_epochs):
            # Keeps track of the number of epochs trained so far
            self.total_epochs += 1

            # Performs training using mini-batches
            loss = self._mini_batch(validation=False)
            self.losses.append(loss)

            # VALIDATION - no gradients needed
            with torch.no_grad():
                val_loss = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)

            # If a SummaryWriter has been set...
            if self.writer:
                scalars = {'training': loss}
                if val_loss is not None:
                    scalars.update({'validation': val_loss})
                # Records both losses for each epoch under the main tag "loss"
                self.writer.add_scalars(main_tag='loss',
                                        tag_scalar_dict=scalars,
                                        global_step=epoch)

        if self.writer:
            # Closes the writer
            self.writer.close()

    def save_checkpoint(self, filename):
        """Save everything needed to resume training to `filename`."""
        checkpoint = {'epoch': self.total_epochs,
                      'model_state_dict': self.model.state_dict(),
                      'optimizer_state_dict': self.optimizer.state_dict(),
                      'loss': self.losses,
                      'val_loss': self.val_losses}

        torch.save(checkpoint, filename)

    def load_checkpoint(self, filename):
        """Restore model, optimizer and loss history from a checkpoint file.

        The model is left in TRAIN mode so training can be resumed directly.
        """
        # map_location lets a checkpoint written on one device (e.g. a GPU)
        # be restored on whatever device this instance is using
        checkpoint = torch.load(filename, map_location=self.device)

        # Restore state for model and optimizer
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        self.total_epochs = checkpoint['epoch']
        self.losses = checkpoint['loss']
        self.val_losses = checkpoint['val_loss']

        self.model.train() # always use TRAIN for resuming training

    def predict(self, x):
        """Run the model on `x` and return its raw output as a Numpy array.

        Args:
            x: array-like input; it is converted to a float tensor.

        The model is switched to EVAL mode for the forward pass and back to
        TRAIN mode afterwards.
        """
        # Set it to evaluation mode for predictions
        self.model.eval()
        # Takes a Numpy input and makes it a float tensor
        x_tensor = torch.as_tensor(x).float()
        # Inference only: skip building the autograd graph
        with torch.no_grad():
            y_hat_tensor = self.model(x_tensor.to(self.device))
        # Set it back to train mode
        self.model.train()
        # Detaches it, brings it to CPU and back to Numpy
        return y_hat_tensor.detach().cpu().numpy()

    def plot_losses(self):
        """Plot the recorded losses on a log scale and return the figure.

        The validation curve is drawn only if at least one validation loss was
        recorded; epochs without one appear as gaps.
        """
        # Imported here because pyplot is not imported alongside this class
        import matplotlib.pyplot as plt

        fig = plt.figure(figsize=(10, 4))
        plt.plot(self.losses, label='Training Loss', c='b')
        if any(v is not None for v in self.val_losses):
            # None marks an epoch trained without a validation loader
            val_curve = [np.nan if v is None else v for v in self.val_losses]
            plt.plot(val_curve, label='Validation Loss', c='r')
        plt.yscale('log')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.tight_layout()
        return fig

    def add_graph(self):
        """Log the model graph to TensorBoard using one training mini-batch.

        Does nothing unless both a training loader and a writer are set.
        """
        if self.train_loader and self.writer:
            x_sample, y_sample = next(iter(self.train_loader))
            self.writer.add_graph(self.model, x_sample.to(self.device))



### 2.2 Model Configuration

In [14]:
# Learning rate (the Greek letter "eta") used by the optimizer below
lr = 0.1

# Fixed seed so the randomly initialised weights are the same on every run
torch.manual_seed(42)

# A single linear layer, registered under the name 'linear': 2 input features -> 1 logit.
# NOTE(review): this is a binary, 2-feature model, while the AG News labels shown earlier have
# 4 classes and the loaders carry tokenized text - presumably a placeholder; confirm.
model = nn.Sequential()
model.add_module(name='linear', module=nn.Linear(in_features=2, out_features=1))

# Plain SGD over all model parameters
optimizer = optim.SGD(params=model.parameters(), lr=lr)

# Binary cross-entropy that takes raw logits (sigmoid is applied inside the loss)
loss_fn = nn.BCEWithLogitsLoss()


### 2.2 Model Training

In [15]:
# Sanity check: the object handed to StepByStep is a regular torch DataLoader.
type(dataloader_dev)

torch.utils.data.dataloader.DataLoader

In [16]:
n_epochs = 100

sbs = StepByStep(model, loss_fn, optimizer) 

# KNOWN FAILURE: sbs.train() below raises
#     ValueError: too many values to unpack (expected 2)
# StepByStep._mini_batch unpacks every batch as `x_batch, y_batch`, but the loaders built from the
# tokenized HF datasets do not yield 2-tuples (presumably one dict per batch, keyed by the four
# columns selected in set_format - TODO confirm).
# TODO: replace the model with one that accepts the BERT inputs (input_ids, attention_mask, ...),
#       e.g. by wrapping a HF BERT model inside a torch module, and adapt the batches accordingly.
#       Example: https://www.analyticsvidhya.com/blog/2021/05/all-you-need-to-know-about-bert/
sbs.set_loaders(dataloader_dev, dataloader_test) 

sbs.train(n_epochs)


ValueError: too many values to unpack (expected 2)

### 2.3 Model Parameters

In [None]:
# Print the current parameter values (weight and bias) of the linear model
print(model.state_dict())

### 2.4 Model Evaluations

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: a scalar or array-like of logits (plain lists are accepted).

    Returns:
        Values in [0, 1] with the same shape as `z`; a scalar input gives a
        Numpy scalar. Floating-point inputs keep their dtype.
    """
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        z = z.astype(float)
    # exp(-|z|) is always <= 1, so it cannot overflow however large |z| is
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook formula; for z < 0 the algebraically
    # equivalent exp(z) / (1 + exp(z)) avoids computing exp of a large positive number
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves arrays unchanged
    return out[()]

In [None]:
# NOTE(review): X_val and y_val are not defined in any of the cells shown here, so this cell
# fails on a fresh kernel - presumably left over from the notebook this pipeline was copied from.
# Raw model outputs (logits) for the validation inputs
logits_val = sbs.predict(X_val)
# Turn logits into probabilities and drop singleton dimensions
probabilities_val = sigmoid(logits_val).squeeze()
# Confusion matrix when predicting the positive class at probability >= 0.5
cm_thresh50 = confusion_matrix(y_val, (probabilities_val >= 0.5))
print(cm_thresh50)

# 6 Inference

In [55]:
def predict_nationality(surname, classifier, vectorizer):
    """Predict the most likely nationality for a single surname.

    Args:
        surname: the surname to classify.
        classifier: callable invoked as
            `classifier(batch, lengths, apply_softmax=True)`; its output is
            reduced with `.max(dim=1)`, so it presumably returns one row of
            class probabilities per input.
        vectorizer: object providing `vectorize(surname)`, which returns the
            vectorized surname and its length, and
            `nationality_vocab.lookup_index(index)`.

    Returns:
        A dict with the keys 'nationality', 'probability' and 'surname'.
    """
    encoded, length = vectorizer.vectorize(surname)

    # The classifier is given a batch, so add a leading batch dimension of size 1
    batch = torch.tensor(encoded).unsqueeze(dim=0)
    lengths = torch.tensor([length], dtype=torch.int64)

    scores = classifier(batch, lengths, apply_softmax=True)
    # Highest score and its class index for the single row in the batch
    best_score, best_index = scores.max(dim=1)

    return {
        'nationality': vectorizer.nationality_vocab.lookup_index(best_index.item()),
        'probability': best_score.item(),
        'surname': surname,
    }

In [65]:
# NOTE(review): `classifier` and `vectorizer` are not defined in any of the cells shown here -
# presumably left over from a surname-classification notebook; this cell fails on a fresh kernel.
# surname = input("Enter a surname: ")
# Run inference on the CPU
classifier = classifier.to("cpu")
# Print the prediction dict for a handful of sample surnames
for surname in ['McMahan', 'Nakamoto', 'Wan', 'Cho', 'Chang', 'Vuc']:
    print(predict_nationality(surname, classifier, vectorizer))

{'nationality': 'Irish', 'probability': 0.2552226781845093, 'surname': 'McMahan'}
{'nationality': 'Japanese', 'probability': 0.793895959854126, 'surname': 'Nakamoto'}
{'nationality': 'Chinese', 'probability': 0.39525306224823, 'surname': 'Wan'}
{'nationality': 'Korean', 'probability': 0.3733985126018524, 'surname': 'Cho'}
{'nationality': 'Chinese', 'probability': 0.4022286534309387, 'surname': 'Chang'}
{'nationality': 'Vietnamese', 'probability': 0.44121870398521423, 'surname': 'Vuc'}
