In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/model-training-for-medical-assistant/__results__.html
/kaggle/input/model-training-for-medical-assistant/__notebook__.ipynb
/kaggle/input/model-training-for-medical-assistant/__output__.json
/kaggle/input/model-training-for-medical-assistant/custom.css
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/adapter_model.safetensors
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/adapter_config.json
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/README.md
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/checkpoint-3500/adapter_model.safetensors
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/checkpoint-3500/trainer_state.json
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/checkpoint-3500/training_args.bin
/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/checkp

#### Tuned model merging.

We have already trained the model, and now it should be merged with the base model to be saved and used in the project.

In [2]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

Logging in to Hugging Face.

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("huggingface_token")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Creating variables for base and new models.

In [4]:
base_model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
new_model = "/kaggle/input/model-training-for-medical-assistant/llama-3-8b-medical-assistant/"

To merge the base model with the adapter we should load the tokenizer and base model using the transformers library. Then, we set up the chat format using the trl library. Finally, we load and merge the adapter to the base model using the PEFT library.

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format

# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

2024-07-20 09:23:29.777451: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 09:23:29.777561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 09:23:29.903199: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
torch.cuda.empty_cache()

In [7]:
# Merge adapter with base model
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)
model = PeftModel.from_pretrained(base_model_reload, new_model)

In [8]:
model = model.merge_and_unload()

#### Model Inference.

To verify if our model has been merged correctly, we perform a simple inference using pipeline from the transformers library. We convert the message using the chat template and then provide a prompt to the pipeline. The pipeline was initialized using the model, tokenizer, and task type.

In [9]:
messages = [{"role": "user", "content": "Hello doctor, I have bad acne. How do I get rid of it?"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

<|im_start|>user
Hello doctor, I have bad acne. How do I get rid of it?<|im_end|>
<|im_start|>assistant
                                                                 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	


We can see that the fine-tuned model is working as expected after being merged.

#### Saving and pushing the merged model.

In [10]:
model.save_pretrained("llama-3-8b-medical-assistant")
tokenizer.save_pretrained("llama-3-8b-medical-assistant")

('llama-3-8b-medical-assistant/tokenizer_config.json',
 'llama-3-8b-medical-assistant/special_tokens_map.json',
 'llama-3-8b-medical-assistant/tokenizer.json')

In [11]:
model.push_to_hub("llama-3-8b-medical-assistant", use_temp_dir=False)
tokenizer.push_to_hub("llama-3-8b-medical-assistant", use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/McSimoff/llama-3-8b-medical-assistant/commit/25ada26a4a949e75409172f54fe621ea3109e5a6', commit_message='Upload tokenizer', commit_description='', oid='25ada26a4a949e75409172f54fe621ea3109e5a6', pr_url=None, pr_revision=None, pr_num=None)