<h1 align="center">
  <a href="https://uptrain.ai">
    <img width="300" src="https://user-images.githubusercontent.com/108270398/214240695-4f958b76-c993-4ddd-8de6-8668f4d0da84.png" alt="uptrain">
  </a>
</h1>

<h1 style="text-align: center;">Automated LLM Finetuning with UpTrain</h1>

In [1]:
# !pip install transformers weightwatcher

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, BertPreTrainedModel
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from typing import List, Optional, Tuple, Union

from tqdm import trange
import random
import weightwatcher as ww
import uptrain

import warnings
warnings.filterwarnings("ignore")

PyTorch is available but CUDA is not. Defaulting to SciPy for SVD


#### Download the SMS Spam Collection dataset

In [3]:
# !wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

In [4]:
# !unzip -o smsspamcollection.zip

In [5]:
file_path = './SMSSpamCollection'

# Collect the rows in a plain list and build the DataFrame once.
# `DataFrame.append` was removed in pandas 2.0, and appending row by row
# re-copies the whole frame on every iteration.
records = []
# Explicit encoding so the read does not depend on the platform default.
# NOTE(review): assumes the downloaded file is UTF-8 - confirm.
with open(file_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<ham|spam>\t<message>"
        split = line.split('\t')
        if len(split) < 2:
            # Blank or malformed line: nothing to label, skip it
            continue
        records.append({'label': 1 if split[0] == 'spam' else 0,
                        'text': split[1]})

# `columns=` keeps the expected schema even if no rows were read
df = pd.DataFrame(records, columns=['label', 'text'])
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#### Define helper functions

In [6]:
def preprocessing(input_text, tokenizer, max_length=32):
    '''
    Encode one text into fixed-length model inputs.

    Args:
        input_text: the raw string to tokenize.
        tokenizer: object exposing `encode_plus` (e.g. the `BertTokenizer`
            created in `get_train_val_dataloader`).
        max_length: number of tokens every encoding is padded or truncated to.
            Defaults to 32.

    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should be considered
    by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; request padding and
                        # truncation explicitly so every row has exactly
                        # `max_length` tokens and can be concatenated later.
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

def get_train_val_dataloader(df, train_idx, val_idx, fraction=1, batch_size=16, val_ratio=0.2):
    '''
    Tokenize `df.text` and wrap the given train/validation splits in DataLoaders.

    Args:
        df: DataFrame with a `text` column and a `label` column.
        train_idx: positional indices of the training rows.
        val_idx: positional indices of the validation rows.
        fraction: share of the training split to keep. Only the first
            int(len(df) * fraction * (1 - val_ratio)) entries of `train_idx`
            are used; the validation split is never subsampled.
        batch_size: batch size of both loaders. Defaults to 16.
        val_ratio: validation share assumed when sizing the `fraction` cut.
            Defaults to 0.2.

    Returns:
        (train_dataloader, validation_dataloader). The training loader draws
        samples in random order, the validation loader sequentially.
    '''
    text = df.text.values
    # Force int64 so the label tensor is torch.long: the model only selects the
    # cross-entropy loss for integer labels.
    labels = torch.tensor(np.asarray(df.label.values, dtype=np.int64))

    tokenizer = BertTokenizer.from_pretrained(
      'bert-base-uncased',
      do_lower_case = True
      )

    # Encode every sample once, then stack the per-sample tensors row-wise
    encodings = [preprocessing(sample, tokenizer) for sample in text]
    token_id = torch.cat([enc['input_ids'] for enc in encodings], dim = 0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim = 0)

    # Optionally train on only a leading slice of the training indices
    train_idx = train_idx[0:int(len(text) * fraction * (1 - val_ratio))]

    train_set = TensorDataset(token_id[train_idx],
                            attention_masks[train_idx],
                            labels[train_idx])

    val_set = TensorDataset(token_id[val_idx],
                          attention_masks[val_idx],
                          labels[val_idx])

    train_dataloader = DataLoader(
              train_set,
              sampler = RandomSampler(train_set),
              batch_size = batch_size
          )

    validation_dataloader = DataLoader(
              val_set,
              sampler = SequentialSampler(val_set),
              batch_size = batch_size
          )
    return train_dataloader, validation_dataloader

In [7]:
import torch.nn as nn
from transformers import BertModel
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.modeling_outputs import SequenceClassifierOutput

In [8]:
class BertForSequenceClassificationWithIntermediateLayer(BertPreTrainedModel):
  '''
  BERT sequence classifier whose head contains one extra hidden layer.

  The pooled encoder output is passed through
  dropout -> Linear(hidden, hidden) -> ReLU -> dropout -> Linear(hidden, num_labels).
  '''
  def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.config = config

    self.bert = BertModel(config)

    # Prefer the classifier-specific dropout rate; fall back to the hidden dropout rate
    if config.classifier_dropout is not None:
        dropout_prob = config.classifier_dropout
    else:
        dropout_prob = config.hidden_dropout_prob
    self.dropout = nn.Dropout(dropout_prob)

    # Head layers: hidden -> hidden (followed by ReLU) -> num_labels
    self.post_bert = nn.Linear(config.hidden_size, config.hidden_size)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.relu = nn.ReLU()

    # Initialize weights and apply final processing
    self.post_init()

  def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
    r"""
    Run the encoder and the two-layer head; compute a loss when `labels` is given.

    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Targets for the loss. When `config.problem_type` is unset it is inferred
        here and stored on the config: `config.num_labels == 1` selects regression
        (Mean-Square loss), integer labels with `config.num_labels > 1` select
        single-label classification (Cross-Entropy), anything else selects
        multi-label classification (BCE with logits).

    Returns a `SequenceClassifierOutput` when `return_dict` is truthy, otherwise
    a tuple `(loss?, logits, *extra encoder outputs)` where `loss` is present
    only if it was computed.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    encoder_out = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Index 1 of the encoder output is used as the pooled sentence representation
    hidden = self.dropout(encoder_out[1])
    hidden = self.post_bert(hidden)
    hidden = self.relu(hidden)
    hidden = self.dropout(hidden)
    logits = self.classifier(hidden)

    loss = None
    if labels is not None:
        # Infer the problem type once and remember it on the config
        if self.config.problem_type is None:
            if self.num_labels == 1:
                inferred = "regression"
            elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                inferred = "single_label_classification"
            else:
                inferred = "multi_label_classification"
            self.config.problem_type = inferred

        problem_type = self.config.problem_type
        if problem_type == "regression":
            mse = MSELoss()
            if self.num_labels == 1:
                loss = mse(logits.squeeze(), labels.squeeze())
            else:
                loss = mse(logits, labels)
        elif problem_type == "single_label_classification":
            cross_entropy = CrossEntropyLoss()
            loss = cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))
        elif problem_type == "multi_label_classification":
            bce = BCEWithLogitsLoss()
            loss = bce(logits, labels)

    if return_dict:
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

    # Tuple form: logits followed by whatever the encoder returned after the pooled output
    tuple_out = (logits,) + encoder_out[2:]
    if loss is None:
        return tuple_out
    return (loss,) + tuple_out

In [9]:
# Fixed seed so the train/validation split is the same on every run
SEED = 42

model = BertForSequenceClassificationWithIntermediateLayer.from_pretrained('bert-base-uncased', 
                                                                           num_labels = 2, 
                                                                           output_attentions = False, 
                                                                           output_hidden_states = False)

# Pick the best available device: CUDA first, then Apple MPS, then CPU.
# A `torch.device` does not compare equal to the string 'cpu', so the check has
# to be made on availability (or on `device.type`) for the MPS fallback to run.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    # set device to mps if available (for mac)
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

# NOTE(review): the optimizer starts at lr = 5e-5 while `optimizer_base_lr` and
# the UpTrain config use 2e-5 - confirm which value is intended.
optimizer = torch.optim.AdamW(
                        model.parameters(), 
                          lr = 5e-5,
                          eps = 1e-08
                          )
optimizer_type = 'AdamW'
optimizer_base_lr = 2e-5

# Stratified 80/20 split over row positions, so both splits keep the spam/ham ratio
val_ratio = 0.2
labels = df.label.values
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = SEED)
train_dataloader, val_dataloader = get_train_val_dataloader(df,train_idx, val_idx, fraction=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassificationWithIntermediateLayer: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassificationWithIntermediateLayer from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassificationWithIntermediateLayer from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceC

In [10]:
# len() of a DataLoader is its number of batches, so these are batch counts
# rather than individual sample counts.
n_train_batches = len(train_dataloader)
n_val_batches = len(val_dataloader)
print("Training data size: " , n_train_batches)
print("Validation data size: " , n_val_batches)

Training data size:  279
Validation data size:  70


### Define the UpTrain Config 

In [11]:
# The single check UpTrain is asked to run: the FINETUNE statistic, wired to the
# optimizer and training dataloader created earlier in the notebook.
finetune_check = {
    'type': uptrain.Statistic.FINETUNE,
    'optimizer': optimizer,
    'dataloader': train_dataloader,
    # NOTE(review): presumably the index of the model layer(s) to act on -
    # confirm the meaning of 230 against the UpTrain documentation.
    'layers': [230],
    'customLR': True,
    'base_lr': 2e-5,
}

# Logging options handed to UpTrain unchanged
logging_args = {
    "st_logging": True,
    "log_data": False,
}

config = {
    "checks": [finetune_check],
    "logging_args": logging_args,
}

In [12]:
# Build the UpTrain framework from the `config` dict. In the recorded run this
# step deleted the existing `uptrain_smart_data` and `uptrain_logs` folders.
framework = uptrain.Framework(cfg_dict=config)

Deleting the folder:  uptrain_smart_data
Deleting the folder:  uptrain_logs


In [None]:
# Pass the model to UpTrain so the configured checks can run on it. In the
# recorded run this call also started a local Streamlit dashboard
# (presumably because `st_logging` is True - confirm).
framework.log(inputs={'model': [model]})


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://192.168.6.64:8501



In [None]:
def _confusion_counts(logits, label_ids):
    '''
    Count (tp, tn, fp, fn) for one batch, with label 1 (spam) as the positive class.

    logits: array with one row of class scores per sample; the prediction is
        the argmax over axis 1.
    label_ids: array of gold labels, one per row of `logits`.
    '''
    preds = np.argmax(logits, axis = 1).flatten()
    gold = np.asarray(label_ids).flatten()
    tp = int(np.sum((preds == 1) & (gold == 1)))
    tn = int(np.sum((preds == 0) & (gold == 0)))
    fp = int(np.sum((preds == 1) & (gold == 0)))
    fn = int(np.sum((preds == 0) & (gold == 1)))
    return tp, tn, fp, fn


def _safe_ratio(numerator, denominator):
    '''Return numerator / denominator, or None when the denominator is zero.'''
    return numerator / denominator if denominator else None


# Set model to evaluation mode
model.eval()

# Tracking variables
val_loss = 0.0
nb_val_examples, nb_val_steps = 0, 0
# Confusion-matrix counts are summed over the whole validation set so the
# metrics are not biased by averaging per-batch ratios.
tp = tn = fp = fn = 0

for batch in val_dataloader:
    # Move the whole batch to the model's device once
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        # Forward pass
        eval_output = model(b_input_ids,
                            token_type_ids = None,
                            attention_mask = b_input_mask,
                            labels = b_labels
                            )
    val_loss += eval_output.loss.item()
    nb_val_examples += b_input_ids.size(0)
    nb_val_steps += 1

    logits = eval_output.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    b_tp, b_tn, b_fp, b_fn = _confusion_counts(logits, label_ids)
    tp += b_tp
    tn += b_tn
    fp += b_fp
    fn += b_fn

# Each metric is None when its denominator is zero (reported as NaN below)
val_accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
val_precision = _safe_ratio(tp, tp + fp)
val_recall = _safe_ratio(tp, tp + fn)
val_specificity = _safe_ratio(tn, tn + fp)

# No cell in this notebook defines `tr_loss` / `nb_tr_steps`; report the train
# loss only when an earlier training loop has provided them.
try:
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
except (NameError, ZeroDivisionError):
    print('\n\t - Train loss: NaN')
print('\t - Validation loss: {:.4f}'.format(val_loss / nb_val_steps)
      if nb_val_steps > 0 else '\t - Validation loss: NaN')
print('\t - Validation Accuracy: {:.4f}'.format(val_accuracy)
      if val_accuracy is not None else '\t - Validation Accuracy: NaN')
print('\t - Validation Precision: {:.4f}'.format(val_precision)
      if val_precision is not None else '\t - Validation Precision: NaN')
print('\t - Validation Recall: {:.4f}'.format(val_recall)
      if val_recall is not None else '\t - Validation Recall: NaN')
print('\t - Validation Specificity: {:.4f}\n'.format(val_specificity)
      if val_specificity is not None else '\t - Validation Specificity: NaN')