# LLM Finetuning

Concepts:
- Industry Classification
- Unsloth APIs
- Performance Efficient Fine-tuning
- Supervised Fine-tuning


In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from tqdm import tqdm
from finds.database import SQL, RedisDB
from finds.unstructured import Edgar
from finds.structured import BusDay, CRSP, PSTAT
from finds.readers import Sectoring
from finds.utils import Store
from secret import credentials, paths
# %matplotlib qt

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
VERBOSE = 0
sql = SQL(**credentials['sql'], verbose=VERBOSE)
bd = BusDay(sql)
rdb = RedisDB(**credentials['redis'])
crsp = CRSP(sql, bd, rdb, verbose=VERBOSE)
pstat = PSTAT(sql, bd, verbose=VERBOSE)
ed = Edgar(paths['10X'], zipped=True, verbose=0)
store = Store(paths['scratch'], ext='pkl')

Last FamaFrench Date 2024-04-30 00:00:00


Load 10-K business description text for industry classification task


In [3]:
# Retrieve universe of stocks
univ = crsp.get_universe(bd.endmo(20221231))

In [4]:
# lookup company names
comnam = crsp.build_lookup(source='permno', target='comnam', fillna="")
univ['comnam'] = comnam(univ.index)

In [5]:
# lookup ticker symbols
ticker = crsp.build_lookup(source='permno', target='ticker', fillna="")
univ['ticker'] = ticker(univ.index)

In [6]:
# lookup sic codes from Compustat, and map to FF 10-sector code
sic = pstat.build_lookup(source='lpermno', target='sic', fillna=0)
industry = Series(sic[univ.index], index=univ.index)
industry = industry.where(industry > 0, univ['siccd'])
sectors = Sectoring(sql, scheme='codes10', fillna='')   # supplement from crosswalk
univ['sector'] = sectors[industry]
#fullnames = Series({'Durbl': "Durable", 'Enrgy': "Energy",
#                    'HiTec': "Technology", 'Hlth': "Healthcare",
#                    'Manuf': "Manufacturing", 'NoDur': "Nondurable",
#                    'Other': "Other", 'Shops': "Retail",
#                    'Telcm': "Telecommunications", 'Utils': "Utilities"})
#univ['sector'] = fullnames[sectors[industry]].values

In [7]:
# same permnos from other earlier experiments
permnos = list(store.load('nouns').keys())

In [8]:
# retrieve 2023 bus10K's
item, form = 'bus10K', '10-K'
rows = DataFrame(ed.open(form=form, item=item))
found = rows[rows['date'].between(20230101, 20231231)]\
             .drop_duplicates(subset=['permno'], keep='last')\
             .set_index('permno')\
             .reindex(permnos)

In [9]:
# split documents into train/test sets
labels = univ.loc[permnos, 'sector']
train_index, test_index = train_test_split(permnos,
                                           stratify=labels,
                                           random_state=42,
                                           test_size=0.2)
Series(labels).value_counts().rename('count').to_frame()

Unnamed: 0_level_0,count
sector,Unnamed: 1_level_1
Hlth,881
Other,762
HiTec,706
Manuf,344
Shops,321
Durbl,164
NoDur,145
Enrgy,94
Utils,92
Telcm,50


In [10]:
# helper to decode class label in text
class_labels = np.unique(labels)
def decode_label(text):
    """Extract label from output string text"""
    for lab in sorted(class_labels, key=lambda x: -len(x)): # longest strings first  
        if lab.lower() in text.lower():
            return lab
    return ""
print(len(labels), len(train_index), len(test_index), class_labels)

3559 2847 712 ['Durbl' 'Enrgy' 'HiTec' 'Hlth' 'Manuf' 'NoDur' 'Other' 'Shops' 'Telcm'
 'Utils']


## Unsloth APIs

Unsloth’s library uses several advanced techniques to make training large language models (LLMs) much faster and more efficient, for example, by optimizing matrix multiplications, which are a key part of LLM training, by chaining them together efficiently.

Installing unsloth:

`!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"`

`major_version, minor_version = torch.cuda.get_device_capability()` - must install separately since torch 2.2.1 breaks packages 

- if major_version >= 8 (new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40))

  `!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes`

- else: older GPUs (V100, Tesla T4, RTX 20xx)

  `!pip install --no-deps xformers trl peft accelerate bitsandbytes`

```
WARNING[XFORMERS]: xFormers can't load C++/CUDA extensions. xFormers was built for:
PyTorch 2.3.0+cu121 with CUDA 1201 (you have 2.2.1+cu121)
Python  3.11.9 (you have 3.11.9)
Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
```

- install the corresponding xformers (https://anaconda.org/xformers/xformers/files)

```!pip3 install torch==2.2.1+cu121 --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
   !pip install xformers==0.0.25  # for torch 2.2.1+cu121
```


In [11]:
NEW_MODEL = True
model_id ="unsloth/llama-3-8b-bnb-4bit"
savedir = str(paths['scratch'] / 'finetuned_model')
output_dir = str(paths['scratch'] / "outputs")
max_seq_length = 8192
dtype = None # None for auto. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [12]:
# 4bit pre quantized models from unsloth for 4x faster downloading + no OOMs
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id if NEW_MODEL else savedir,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # if using gated models like meta-llama/Meta-Llama-3-8B
)

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3080 Laptop GPU. Max memory: 15.739 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Low-rank adaptation

Parameter-efficient fine-tuning (PEFT) methods significantly decrease the computational and storage costs for large language models by only fine-tuning a small number of (extra) model parameters instead of all the model's parameters. Low-rank adaptation (LoRA) reduces the number of trainable parameters by learning pairs of rank-decompostion matrices while freezing the original weights.

The `unsloth` library replaces HuggingFace's `peft`:

```
from peft import AutoPeftModelForCausalLM
model = AutoPeftModelForCausalLM.from_pretrained(
    savedir, # YOUR MODEL YOU USED FOR TRAINING
    load_in_4bit = load_in_4bit,
)

from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
model = get_peft_model(model, config)
```

In [13]:
# Add LoRA adapters so we only need to update less than 10% of all parameters
if NEW_MODEL:
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
        random_state = 3407,
        use_rslora = False,  # supports rank stabilized LoRA
        loftq_config = None, # supports LoftQ
    )

# [NOTE] To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).
# [NOTE] Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!


Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## HuggingFace datasets


In [14]:
EOS_TOKEN = tokenizer.eos_token   # Must add EOS_TOKEN
def generate_prompt(text, label=''):
    return f"""
Below is an instruction that describes a task. 
Write a response that appropriately completes the request.

### Instruction:
The text of a business description can be classified in industry categories like below.

"{'" or "'.join(class_labels)}".

In one word, please respond with the industry classification category 
of the following text delimited in triple quotes.

'''{text}'''

### Response: 
{label} """ + EOS_TOKEN

In [15]:
MAX_CHARS = 4096
def data_generator():
    for permno in train_index:
        text = ed[found.loc[permno, 'pathname']].replace('\n','').lower()[:MAX_CHARS]
        yield {"text": generate_prompt(text, label=univ.loc[permno, 'sector'])}
dataset = Dataset.from_generator(data_generator)

## Supervised fine-tuning

Supervised Fine-Tuning adapts a pre-trained model to a specific task, by training the model on a new labeled dataset to predict the correct label for each input.

The TRL library provides the SFTTrainer class, which is designed to facilitate the SFT process. This class accepts a column in your training dataset that contains the prompt constructed from system instructions, questions, and answers.

https://huggingface.co/docs/trl/sft_trainer

In [16]:
MAX_EPOCHS = 16
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,  # 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = -1, # 60, -1
        num_train_epochs = MAX_EPOCHS,  # default 3
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_strategy = "steps" if VERBOSE else "no",  # "epoch", "no", "steps"
        logging_steps = 1,  # defaults to 500 if logging_strategy="steps"
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        save_strategy="epoch",      # Save the model checkpoint every epoch
        output_dir = output_dir,
    ),
)

Map (num_proc=2): 100%|██████████| 2847/2847 [00:02<00:00, 1058.27 examples/s]


In [17]:
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3080 Laptop GPU. Max memory = 15.739 GB.
5.594 GB of memory reserved.


In [18]:
# Helpers to generate input tokens and response
def generate_inputs(permno):
    """Transform text corresponding to permno into tokenized input"""
    text = ed[found.loc[permno, 'pathname']].replace('\n','').lower()[:MAX_CHARS]
    inputs = tokenizer([generate_prompt(text)], return_tensors="pt").to("cuda")
    return inputs

In [19]:
def generate_output(inputs):
    """Generate response given tokenized input, and return decoded output text"""
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    outputs = model.generate(
        **inputs,
        max_new_tokens=16,
        eos_token_id=terminators,
        pad_token_id=tokenizer.eos_token_id,
        #do_sample=True,
        #temperature=0.01, #0.6,
        #top_p=0.9,
        do_sample=False,
        use_cache=True,
    )
    response = outputs[0][len(inputs[0]):]
    return tokenizer.decode(response, skip_special_tokens=True)

In [20]:
def evaluate_output(permnos, verbose=VERBOSE):
    """Return predicted and true labels"""
    y_pred, y_true = [], []
    for permno in permnos:
        inputs = generate_inputs(permno)
        output = generate_output(inputs).replace('\n', ' ')
        label = decode_label(output)
        gold = univ.loc[permno, 'sector']
        y_pred.append(label)
        y_true.append(gold)
        if verbose:
            print(permno, gold, '  [', label, ']   ', output)
    return y_pred, y_true

In [21]:
# Evaluate before fine-tuning
if True:
    FastLanguageModel.for_inference(model)   # Enable native 2x faster inference
    y_pred, y_true = evaluate_output(test_index)

    # generate classification report
    report = metrics.classification_report(y_true=y_true, y_pred=y_pred)
    print(f"Classification Report (Test Set) before fine-tuning:")
    print(report)

Classification Report (Test Set) before fine-tuning:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         0
       Durbl       0.00      0.00      0.00        33
       Enrgy       0.00      0.00      0.00        19
       HiTec       0.00      0.00      0.00       141
        Hlth       0.50      0.01      0.01       176
       Manuf       0.12      0.06      0.08        69
       NoDur       0.01      0.03      0.02        29
       Other       0.00      0.00      0.00       153
       Shops       0.00      0.00      0.00        64
       Telcm       0.00      0.00      0.00        10
       Utils       0.00      0.00      0.00        18

    accuracy                           0.01       712
   macro avg       0.06      0.01      0.01       712
weighted avg       0.14      0.01      0.01       712



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Train the model

In [22]:
# Training loop
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,847 | Num Epochs = 16
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 5,696
 "-____-"     Number of trainable parameters = 41,943,040
100%|██████████| 5696/5696 [20:39:52<00:00, 13.06s/it]   

{'train_runtime': 74392.7458, 'train_samples_per_second': 0.612, 'train_steps_per_second': 0.077, 'train_loss': 0.494993617025654, 'epoch': 16.0}





In [23]:
# Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

74392.7458 seconds used for training.
Peak reserved memory = 9.1 GB.
Peak reserved memory for training = 3.35 GB.
Peak reserved memory % of max memory = 57.818 %.
Peak reserved memory for training % of max memory = 21.285 %.


In [24]:
# Save fine-tuned model
# **[NOTE]** This ONLY saves the LoRA adapters, and not the full model.
if True:
    model.save_pretrained(savedir) # Local saving
    tokenizer.save_pretrained(savedir)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...")

In [25]:
# Run inference on train split
if False:
    FastLanguageModel.for_inference(model)   # Enable native 2x faster inference
    y_pred, y_true = evaluate_output(train_index)

    # generate classification report
    report = metrics.classification_report(y_true=y_true, y_pred=y_pred)
    print(f"Classification Report (Training set):")
    print(report)

In [26]:
# Run inference on test split
if True:
    FastLanguageModel.for_inference(model)   # Enable native 2x faster inference
    y_pred, y_true = evaluate_output(test_index)

    # generate classification report
    report = metrics.classification_report(y_true=y_true, y_pred=y_pred)
    print(f"Classification Report (Test Set) after fine-tuning:")
    print(report)

100%|██████████| 712/712 [09:10<00:00,  1.29it/s]

Classification Report (Test Set) after fine-tuning:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         0
       Durbl       0.33      0.61      0.43        33
       Enrgy       1.00      0.47      0.64        19
       HiTec       0.74      0.59      0.66       141
        Hlth       0.96      0.89      0.92       176
       Manuf       0.70      0.71      0.71        69
       NoDur       0.70      0.55      0.62        29
       Other       0.80      0.70      0.75       153
       Shops       0.77      0.78      0.78        64
       Telcm       1.00      0.90      0.95        10
       Utils       0.65      0.94      0.77        18

    accuracy                           0.72       712
   macro avg       0.70      0.65      0.66       712
weighted avg       0.79      0.72      0.75       712




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Display a few responses from fine-tuned model

In [35]:
_ = evaluate_output(test_index[::32], verbose=1)


17147 Hlth   [ Hlth ]       Hlth 
22952 Other   [ Other ]     Other 
14547 HiTec   [ HiTec ]     HiTec 
88284 Hlth   [ Hlth ]       Hlth 
88873 HiTec   [ Other ]     Hi there,   Other has the highest weight so that's the one
13777 HiTec   [ Other ]     Hi there,   Other 
89968 Other   [ Other ]     Other 
16471 Other   [ Other ]     Other 
21933 Hlth   [ Hlth ]       Hlth 
90542 Hlth   [ Hlth ]       Hlth 
22260 Manuf   [ NoDur ]     NoDur 
16401 Enrgy   [ Enrgy ]     Enrgy 
14329 Shops   [ Shops ]     Shops 
79698 Hlth   [  ]     Hi there,   I'm a business description writer, and based on
92443 Other   [ Other ]     Other 
19085 Other   [  ]     Bank 
86810 Other   [ NoDur ]     NoDur 
18960 Hlth   [ Hlth ]       Hlth 
15943 HiTec   [  ]     Hi there,   The industry classification category of our business description can be
17106 HiTec   [ HiTec ]     HiTec 
76076 HiTec   [ HiTec ]     HiTec 
20412 Manuf   [ Manuf ]     Manuf 
13812 HiTec   [ HiTec ]     HiTec 
