<a href="https://colab.research.google.com/github/nsanghi/IndiaLLM/blob/main/llm_pipeline_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook defines a set of functions that can be called in sequence, with the appropriate parameters, to fine-tune a model from the Hugging Face (HF) repository on a dataset that is also hosted on HF.

Loading data from other sources, such as Google Drive, is not implemented yet.

The trained model can be saved to an HF repo under the profile of the logged-in user.

In [None]:
# utility functions
def check_gpu_status():
    """
    Print the `nvidia-smi` report, or 'Not connected to a GPU' when none is reachable.

    `nvidia-smi` is run through `subprocess` rather than the IPython `!` syntax, so
    the function also works outside a notebook. The runtime is reported as having
    no GPU when any of the following holds:
      * the `nvidia-smi` executable cannot be started (e.g. it is not installed),
      * it exits with a non-zero status,
      * its output contains the word 'failed'.

    Returns:
        None. The result is printed.
    """
    import subprocess

    try:
        completed = subprocess.run(
            ["nvidia-smi"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # capture error text together with normal output
            text=True,
        )
    except OSError:
        # covers FileNotFoundError / PermissionError: the tool is not available here
        print('Not connected to a GPU')
        return

    # normalise line endings and drop the trailing newline before printing
    gpu_info = '\n'.join(completed.stdout.splitlines())
    if completed.returncode != 0 or 'failed' in gpu_info:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

# Report the GPU attached to this runtime (prints a notice instead if the check fails).
check_gpu_status()

Fri Jul 14 19:08:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# !pip install transformers
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m21.0 MB

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub. The access token needs write permission,
# because later cells create a model repo and push the trained model to it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# huggingface model config, where we define the constants required for this training job
# Every name set inside the 'HF' branch is read by later cells; if `model_source`
# is anything else, only a message is printed and those names stay undefined.
model_source = 'HF' # HF=Huggingface

if model_source == 'HF':
    # checkpoint id used for both the base model and the tokenizer
    ref_model_id = "facebook/opt-350m"
    # selects the loader branch inside hf_prepare_model_for_qlora
    ref_model_type = "AutoModelForCausalLM"
    # NOTE: the name is spelled `datastet_id` (sic); the pipeline cell and the
    # hf_prepare_data keyword argument use the same spelling, so keep them in sync.
    datastet_id = "satpalsr/indicCorpv2"
    # NOTE(review): not referenced by any other cell visible in this notebook - confirm intent
    dataset_lang = "bn"
    # owner passed to hf_create_repo
    repo_user = "tanmoy-in"
    # repo name passed to hf_create_repo, and output_dir of the training run
    model_name = "base_model"

else:
    print('please select valid model source')


In [None]:
def hf_create_repo(user:str, model_name:str):
    '''
    Create the public Hugging Face repo `<user>/<model_name>` if it does not exist yet.

    Args:
        user: account (or organisation) name that owns the repo.
        model_name: name of the repo under that account.

    Returns:
        The repo id string `<user>/<model_name>` on success. If anything raises
        while creating the repo, the error is printed and None is returned.
    '''
    from huggingface_hub import create_repo

    try:
        repo_id = "{}/{}".format(user, model_name)
        # exist_ok=True: an already existing repo is not treated as an error
        create_repo(repo_id, private=False, exist_ok=True)
    except Exception as err:
        print(f'error while creating repo. {err}')
        return None

    return repo_id

# Create (or reuse) the target repo; the returned repo id is displayed as the cell output.
hf_create_repo(user=repo_user, model_name=model_name)

'tanmoy-in/base_model'

In [None]:
def hf_prepare_model_for_qlora(model_id:str='facebook/opt-350m', model_type:str='AutoModelForCausalLM', lora_target_modules:list=["k_proj","v_proj"]):
    '''
    Load a causal LM with 4-bit quantization and wrap it with LoRA adapters.

    Args:
        model_id: checkpoint id passed to `AutoModelForCausalLM.from_pretrained`.
        model_type: only 'AutoModelForCausalLM' is handled; any other value
            prints a message and yields None.
        lora_target_modules: module names passed to `LoraConfig(target_modules=...)`.

    Returns:
        The PEFT-wrapped model, or None when `model_type` is not handled.
        The trainable / total parameter counts are printed before returning.
    '''
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

    def report_trainable(net):
        """Print how many of `net`'s parameters require gradients."""
        sizes = [(p.numel(), p.requires_grad) for _, p in net.named_parameters()]
        all_param = sum(count for count, _ in sizes)
        trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
        )

    # 4-bit NF4 quantization with double quantization, computing in bfloat16
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    if model_type != 'AutoModelForCausalLM':
        print(f'model_type={model_type} is not defined, setting model as None')
        return None

    # device_map={"":0} places the whole model on device 0
    base = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_cfg, device_map={"":0})

    # peft based preparation
    base.gradient_checkpointing_enable()
    base = prepare_model_for_kbit_training(base)

    # lora based preparation
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=lora_target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    peft_model = get_peft_model(base, lora_cfg)
    report_trainable(peft_model)

    return peft_model

In [None]:
# TODO: option to remove columns when necessary
# TODO: streaming data option

def hf_prepare_data(datastet_id:str, tokenizer, sample_size:int=-1, data_source:str='HF', target_text_column:str='text', split:str='train'):
    '''
    Load one split of a dataset and tokenize its text column.

    Args:
        datastet_id: dataset id passed to `datasets.load_dataset` (the parameter
            name keeps its original spelling because callers pass it by keyword).
        tokenizer: callable applied to batches of `target_text_column`.
        sample_size: number of leading rows to keep; -1 keeps the whole split.
            Values larger than the split are clamped to the split size.
        data_source: 'HF' loads from the Hugging Face hub. 'GD' (google drive)
            and any other value are not supported yet.
        target_text_column: name of the column holding the raw text.
        split: which split of the loaded dataset to use.

    Returns:
        The tokenized split for data_source='HF'. For unsupported sources a
        message is printed and None is returned.
    '''
    if data_source == 'HF':
        # imported here so the unsupported branches work without `datasets` installed
        from datasets import load_dataset

        # always work on a single split, so the return type does not depend on sample_size
        data = load_dataset(datastet_id)[split]
        if sample_size != -1:
            # clamp so asking for more rows than exist does not raise
            data = data.select(range(min(sample_size, len(data))))

        data = data.map(lambda samples: tokenizer(samples[target_text_column]), batched=True)

        # TODO: print token count
    elif data_source == "GD":
        # TODO: implement data loading from google drive
        print('data loading from google drive is not supported yet')
        data = None
    else:
        print(f'data_source={data_source} is not supported yet')
        data = None

    return data


In [None]:
def hf_get_tokenizer(tokenizer_id:str, tokenizer_type:str="auto"):
    '''
    Load the tokenizer for `tokenizer_id`.

    Args:
        tokenizer_id: id passed to `AutoTokenizer.from_pretrained`.
        tokenizer_type: only "auto" is handled.

    Returns:
        The loaded tokenizer, or None (after printing a message) when
        `tokenizer_type` is anything other than "auto".
    '''
    from transformers import AutoTokenizer

    if tokenizer_type != "auto":
        print(f'tokenizer_type={tokenizer_type} is not supported yet, so returning token as None')
        return None

    return AutoTokenizer.from_pretrained(tokenizer_id)

In [None]:
# TODO: get total number of tokens in a dataset after tokenization
#

In [None]:
# TODO: keyword argument


def hf_llm_train(model, tokenizer, data, save_model:bool=True, save_model_name:str=""):
    '''
    Run a short causal-LM training job with `transformers.Trainer`.

    Args:
        model: model to train; its `config.use_cache` is set to False.
        tokenizer: tokenizer used by the data collator; its `pad_token` is
            overwritten with its `eos_token`.
        data: tokenized training dataset.
        save_model: when True the trainer is configured for the hub and the
            result is pushed after training. When False no hub push is
            configured or attempted.
        save_model_name: used as the trainer `output_dir`.

    Returns:
        The same `model` object that was passed in, after training.
    '''
    import transformers

    # use EOS as the padding token for batching
    tokenizer.pad_token = tokenizer.eos_token

    training_args = transformers.TrainingArguments(
        output_dir=save_model_name,
        # only set up hub pushing when the caller actually wants the model saved
        push_to_hub=save_model,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=5,  # training stops after 5 optimizer steps
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        optim="paged_adamw_8bit"
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=data,
        args=training_args,
        # mlm=False selects causal (next-token) language modeling instead of masked LM
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
    trainer.train()

    if save_model:
        # push the model to HF hub
        trainer.push_to_hub()

    return model


In [None]:
# 1. load the reference model in the QLoRA setup, with LoRA on the k_proj / v_proj modules
model = hf_prepare_model_for_qlora(model_id=ref_model_id, model_type=ref_model_type, lora_target_modules=["k_proj","v_proj"])
# 2. the tokenizer is loaded from the same checkpoint id as the model
tokenizer = hf_get_tokenizer(tokenizer_id=ref_model_id)
# 3. tokenize the first 1000 rows of the dataset's train split
data = hf_prepare_data(datastet_id=datastet_id, tokenizer=tokenizer, sample_size=1000)

# 4. train; with save_model=True the result is pushed to the hub after training
trained_model = hf_llm_train(model=model, tokenizer=tokenizer, data=data, save_model=True, save_model_name=model_name)

Some weights of OPTForCausalLM were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 786432 || all params: 180463616 || trainable%: 0.43578424140631206


Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/satpalsr___parquet/satpalsr--indicCorpv2-2cebd3321730503a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/170M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/171M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/170M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3375484 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/satpalsr___parquet/satpalsr--indicCorpv2-2cebd3321730503a/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Cloning https://huggingface.co/tanmoy-in/base_model into local empty directory.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.0853
2,2.6788
3,1.5244
4,2.8887
5,1.5274


Upload file adapter_model.bin:   1%|1         | 32.0k/3.03M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Upload file runs/Jul14_19-15-15_638d2a56917f/events.out.tfevents.1689362121.638d2a56917f.488.0: 100%|#########…

To https://huggingface.co/tanmoy-in/base_model
   11b4a41..9cc5b6c  main -> main

   11b4a41..9cc5b6c  main -> main

To https://huggingface.co/tanmoy-in/base_model
   9cc5b6c..56e42eb  main -> main

   9cc5b6c..56e42eb  main -> main



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 512, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
          (project_out): Linear4bit(in_features=1024, out_features=512, bias=False)
          (project_in): Linear4bit(in_features=512, out_features=1024, bias=False)
          (layers): ModuleList(
            (0-23): 24 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear4bit(
                  in_features=1024, out_features=1024, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Line