# Installs

In [None]:
!pip install --upgrade torch torchvision torchaudio --quiet

In [None]:
!pip install --upgrade transformers --quiet

In [None]:
!pip install -i https://pypi.org/simple/ bitsandbytes --upgrade --quiet

In [None]:
!pip install accelerate --upgrade --quiet

In [None]:
!pip install datasets --quiet

In [None]:
!pip install PyMuPDF pdfminer.six --quiet

In [None]:
!pip install peft --quiet

In [None]:
!pip install trl --quiet

# Imports

In [None]:
import bitsandbytes
import accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
from huggingface_hub import notebook_login
from google.colab import drive, userdata

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from pdfminer.high_level import extract_text
import glob

In [None]:
import torch
import gc

In [None]:
from functools import partial
from datasets import Dataset
import pandas as pd
import os
import re

In [None]:
from IPython.display import display, Markdown

# HuggingFace/Drive integration

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
# Mount Google Drive at /content/drive; later cells read data from it and
# write models to it.
drive.mount('/content/drive')

# GPU

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Free GPU memory

In [None]:
def free_gpu_memory():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so that unreachable objects are gone before the cache
    # is emptied.
    gc.collect()
    torch.cuda.empty_cache()

# Load LLM

In [None]:
# Hub id of the base chat model that phase 1 fine-tunes.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Hugging Face access token, read from the Colab `userdata` store under the
# key 'HF_TOKEN'.
auth_token = userdata.get('HF_TOKEN')

In [None]:
# Tokenizer of the base checkpoint; downloaded files are cached on Drive.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=auth_token,
    cache_dir='/content/drive/MyDrive/model',
)

In [None]:
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16,
)

# Load the base checkpoint in quantized form, cached on Drive.
# NOTE(review): torch_dtype is float16 while the quantization compute dtype is
# bfloat16 — confirm that this mix is intended.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token = auth_token,
    cache_dir = '/content/drive/MyDrive/model',
    torch_dtype = torch.float16,
    # The factor is written as a float: the Llama config validation checks for
    # a float here and rejects (or warns about) the integer 2.
    rope_scaling = {"type": "dynamic", "factor": 2.0},
    low_cpu_mem_usage = True,
    device_map = "auto",
    quantization_config = quantization_config
)

# Phase 1 — PEFT

In [None]:
def extract_text_from_pdfs(pdf_paths):
    """Return the extracted text of each PDF, in the order of ``pdf_paths``.

    Args:
        pdf_paths: Iterable of paths, each handed to ``extract_text``.

    Returns:
        A list with one extraction result per path.
    """
    return [extract_text(pdf_file) for pdf_file in pdf_paths]

In [None]:
# Drive folder that is searched for the source PDF documents.
pdf_path = "/content/drive/MyDrive/data"

In [None]:
# Every *.pdf file directly inside pdf_path (no recursion into subfolders).
pdf_paths = glob.glob(f"{pdf_path}/*.pdf")

In [None]:
# Extract the text of every PDF found above (one entry per file).
texts = extract_text_from_pdfs(pdf_paths)

In [None]:
def clean_text(text):
    """Normalize extracted PDF text to single-spaced, ASCII-only text.

    Runs of non-ASCII characters are replaced by a space, then every run of
    whitespace (including newlines) is collapsed to one space and the result
    is stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string; an empty string when nothing remains.
    """
    # Replace non-ASCII first: the replacement inserts spaces, so whitespace
    # must be collapsed afterwards or doubled/leading spaces would remain.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Collapsing all whitespace also removes newlines, so no separate
    # newline handling is needed.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Clean every extracted document before chunking.
texts = list(map(clean_text, texts))

In [None]:
def chunk_text(text, chunk_size = 512, overlap = 50):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    chunks start ``chunk_size - overlap`` tokens apart and each chunk is
    converted back to a string.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.

    Returns:
        A list of chunk strings; empty when the text produces no tokens.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``.
    """
    if overlap >= chunk_size:
        # A zero stride makes range() raise and a negative one silently
        # yields no chunks at all, so fail with a clear message instead.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = tokenizer.tokenize(text)
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further chunk would
        # only repeat tokens already covered by this one.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [None]:
# Define tokenize function
def tokenize_function(examples):
    """Chunk every text in the batch and tokenize the chunks to 512 tokens.

    Args:
        examples: Batch mapping with a ``'text'`` list of documents.

    Returns:
        A dict of lists (one entry per chunk) with the tokenizer's output
        fields, each chunk padded/truncated to 512 tokens. When the batch
        yields no chunks, each of the tokenizer's model input names maps to an
        empty list.
    """
    all_chunks = []
    for example in examples['text']:
        all_chunks.extend(chunk_text(example))

    if not all_chunks:
        # Nothing to tokenize (no documents, or only empty ones): return empty
        # columns instead of indexing into an empty list.
        return {name: [] for name in tokenizer.model_input_names}

    # A single batched call tokenizes every chunk and already returns a
    # mapping of field name -> list with one entry per chunk.
    encoded = tokenizer(all_chunks, padding = "max_length", truncation = True, max_length = 512)
    return dict(encoded)

In [None]:
# Wrap the cleaned texts in a Dataset, then replace the raw "text" column with
# the tokenized chunks produced by tokenize_function.
texts_dataset = Dataset.from_dict({"text": texts})
tokenized_dataset = texts_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

In [None]:
# Add labels (in causal language modeling, labels are the same as input_ids)
def add_labels(example):
    """Attach a ``labels`` field holding a copy of the example's ``input_ids``.

    The example is modified in place and returned.
    """
    token_ids = example['input_ids']
    example['labels'] = token_ids.copy()
    return example

# Apply add_labels to each example individually (batched=False).
tokenized_dataset = tokenized_dataset.map(add_labels, batched=False)

## Phase 1 fine tuning

In [None]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear layers in ``model``.

    Args:
        model: Object exposing ``named_modules()``.

    Returns:
        A list of the distinct last name components of every
        ``bitsandbytes.nn.Linear4bit`` submodule, with ``'lm_head'`` excluded.
    """
    linear_4bit = bitsandbytes.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_4bit)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')

    return list(leaf_names)

In [None]:
# Names of the 4-bit linear layers; used as the LoRA target modules below.
modules = find_all_linear_names(model)

In [None]:
# LoRA configuration for phase 1
lora_config_phase1 = LoraConfig(
    r=16,  # rank of the low-rank approximation
    lora_alpha=64,  # scaling factor
    target_modules=modules,  # layer names collected by find_all_linear_names
    lora_dropout=0.1,  # dropout rate
    bias="none",  # whether to train biases
    task_type="CAUSAL_LM",
)

In [None]:
# Step 1: enable gradient checkpointing to reduce memory usage during fine-tuning.
model.gradient_checkpointing_enable()

In [None]:
# Step 2: prepare the quantized model for training with PEFT's
# prepare_model_for_kbit_training.
model = prepare_model_for_kbit_training(model)

In [None]:
# Wrap the model with LoRA adapters as configured in lora_config_phase1.
model = get_peft_model(model, lora_config_phase1)

In [None]:
# Phase 1 training setup: batch size 1 with 4 accumulation steps, 15
# optimizer steps in total, logging on every step.
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=15,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",
)

In [None]:
# Trainer for phase 1: causal-LM collation (mlm=False) over the tokenized
# PDF chunks.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Turn off the model's cache setting before training.
# NOTE(review): presumably because caching conflicts with gradient
# checkpointing — confirm.
model.config.use_cache = False

In [None]:
# Run phase 1 training; the result object carries the training metrics.
train_result = trainer.train()

In [None]:
# Log the training metrics, save them together with the trainer state, then
# print them.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)

In [None]:
# Save the trained phase 1 model to Drive; it is loaded again below through
# AutoPeftModelForCausalLM.
output_dir = '/content/drive/MyDrive/saved_models/phase1'
trainer.model.save_pretrained(output_dir)

In [None]:
# Drop the training objects, then release memory through the notebook's
# helper: it runs the garbage collector before emptying the CUDA cache, so
# objects that are no longer referenced are collected first.
del model
del trainer
free_gpu_memory()

In [None]:
# Reload the saved phase 1 model in bfloat16 and merge the adapter into the
# base weights, leaving a plain model without the PEFT wrapper.
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
model = model.merge_and_unload()

In [None]:
# Save the merged phase 1 model to Drive with safe serialization enabled.
output_merged_dir = "/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT1"
model.save_pretrained(
    output_merged_dir,
    safe_serialization = True
)

In [None]:
# Reload the base tokenizer and store it next to the merged model. The same
# token and Drive cache as the first tokenizer load are used, so the files are
# not fetched again into a different cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token = auth_token,
    cache_dir = '/content/drive/MyDrive/model',
)
tokenizer.save_pretrained(output_merged_dir)

In [None]:
# List the files written to the merged-model folder. The path comes from
# output_merged_dir, so the listing always matches the folder just saved to.
print(',\n'.join(os.listdir(output_merged_dir)))

## Phase 1 Testing

In [None]:
# Switch the merged model to evaluation mode before generating.
model.eval()
print("Model loaded successfully.")

In [None]:
# System prompt placed in front of every test question; it sets the assistant's
# name, tone and answer scope.
system_prompt = """You are a helpful and informative assistant called "Assistant". Your goal is to provide accurate and relevant information to the user's queries.
Please ensure that your responses are succinct, respectful, and factual. Refrain from emoting.
If you're uncertain about a question, it's better to admit it rather than provide inaccurate information.
Respond to the User's question ONLY. Do not impersonate the User and do not include followup questions in your response unless prompted."""

In [None]:
prompt = "What is the concept of plaque-years in the context of LDL cholesterol and cardiovascular health?"

# Add the system prompt to the beginning of the conversation. The Assistant
# turn starts on its own line, the same layout the phase 2 test prompt and the
# phase 2 training samples use.
prompt_with_system_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant: "

inputs = tokenizer(prompt_with_system_prompt, return_tensors = "pt").to(device)

outputs = model.generate(
    **inputs,
    #max_length = 150,
    # temperature and top_p only apply when sampling is enabled, so request
    # it explicitly instead of relying on the saved generation config.
    do_sample = True,
    temperature = 0.5,
    top_p = 0.75,
    # The freshly reloaded tokenizer has no pad token assigned in this
    # notebook; pad with EOS, as the phase 2 test does.
    pad_token_id = tokenizer.eos_token_id,
)

In [None]:
# Full decoded sequence (prompt followed by the generated continuation).
response_text = tokenizer.decode(outputs[0], skip_special_tokens = True)

# Show only the newly generated tokens. Slicing off the prompt by token count
# keeps the whole answer even when the answer itself contains "Assistant: ",
# which a split on that marker would cut short.
prompt_token_count = inputs["input_ids"].shape[-1]
answer_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens = True)
display(Markdown(f"\n'''\n{answer_text.strip()}\n'''\n"))

# Phase 2 — Instruction FT

In [None]:
# Load the phase 2 data from CSV; later cells read its 'instruction' and
# 'response' columns.
phase2_qs = pd.read_csv('/content/drive/MyDrive/data/csv/phase2_questions_filled.csv')

In [None]:
# Convert the DataFrame into a Dataset for preprocessing and training.
phase2_qs_ds = Dataset.from_pandas(phase2_qs)

In [None]:
# Quick sanity check: row count and column names of the phase 2 dataset.
print(f'Number of prompts: {len(phase2_qs_ds)}')
print(f'Column names are: {phase2_qs_ds.column_names}')

In [None]:
# `random` is not imported by the notebook's import cells, so bring it into
# scope here.
import random

nb_samples = 3
# Never ask for more rows than the dataset holds; random.sample raises
# ValueError when the sample is larger than the population.
sample_count = min(nb_samples, len(phase2_qs_ds))
random_indices = random.sample(range(len(phase2_qs_ds)), sample_count)
samples = []

# Keep only the two fields of interest from each randomly picked row.
for idx in random_indices:
    sample = phase2_qs_ds[idx]
    samples.append({key: sample[key] for key in ('instruction', 'response')})

In [None]:
# Show the sampled instruction/response pairs as a table.
df = pd.DataFrame(samples)
display(df)

In [None]:
def create_prompt_formats(sample):
    """Build the training text for one sample and store it under ``'text'``.

    The text has a ``User: <instruction>`` line followed by an
    ``Assistant: <response>`` line. A field that is ``None`` or an empty
    string is left out instead of being rendered as e.g. ``Assistant: None``.
    No end-of-sequence marker is appended.

    Args:
        sample: Mapping with ``'instruction'`` and ``'response'`` entries.

    Returns:
        The same ``sample`` object, with ``'text'`` added.
    """
    labelled_fields = (
        ("User", sample['instruction']),
        ("Assistant", sample['response']),
    )

    # Test the raw field values: once a prefix is attached, the formatted
    # string is never empty, so filtering afterwards would keep everything.
    parts = [
        f"{speaker}: {content}"
        for speaker, content in labelled_fields
        if content is not None and content != ""
    ]

    sample["text"] = "\n".join(parts)

    return sample

In [None]:
# Folder of the merged phase 1 model; the tokenizer and model for phase 2 are
# loaded from it below.
model_directory = "/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT1"

In [None]:
# Tokenizer saved alongside the merged phase 1 model (local files only)
tokenizer = AutoTokenizer.from_pretrained(
    model_directory,
    local_files_only=True,
)

In [None]:
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16,
)

# Load the fine-tuned model from phase1
model = AutoModelForCausalLM.from_pretrained(
    model_directory,
    torch_dtype = torch.float16,
    low_cpu_mem_usage = True,
    # The factor is written as a float: the Llama config validation checks for
    # a float here and rejects (or warns about) the integer 2.
    rope_scaling = {"type": "dynamic", "factor": 2.0},
    local_files_only = True,
    quantization_config = quantization_config
)

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length declared by the model's config.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are checked in that order; the first truthy value wins.

    Args:
        model: Object with a ``config`` attribute.

    Returns:
        The length found on the config, or 1024 when none is set.
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length

    # None of the known attributes is set: fall back to a fixed default.
    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` entry of ``batch`` with truncation.

    Args:
        batch: Mapping with a ``"text"`` entry.
        tokenizer: Callable invoked with the text plus ``max_length`` and
            ``truncation`` keyword arguments.
        max_length: Length passed to the tokenizer as ``max_length``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

In [None]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer, max_length, dataset, seed = None):
    """Format, tokenize, filter and shuffle the instruction dataset.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_batch``.
        max_length: Truncation length; samples whose token count is not
            strictly below it are dropped.
        dataset: Dataset with ``instruction`` and ``response`` columns.
        seed: Optional seed for the shuffle. ``None`` (the default) keeps the
            previous unseeded behaviour; pass an int for a reproducible order.

    Returns:
        The preprocessed, shuffled dataset.
    """
    # Add the formatted prompt ("text" column) to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize in batches and drop the 'instruction', 'response' and 'text'
    # columns, which are no longer needed afterwards
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["instruction", "response", "text"],
    )

    # Keep only samples strictly shorter than max_length; tokenization
    # truncates at max_length, so this also removes every truncated sample
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset (reproducibly when a seed is given)
    dataset = dataset.shuffle(seed = seed)

    return dataset

In [None]:
# Read the length limit from the model config and preprocess the phase 2
# dataset with it.
max_length = get_max_length(model)
phase2_qs_ds = preprocess_dataset(tokenizer, max_length, phase2_qs_ds)

In [None]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear layers in ``model``.

    Args:
        model: Object exposing ``named_modules()``.

    Returns:
        A list of the distinct last name components of every
        ``bitsandbytes.nn.Linear4bit`` submodule, with ``'lm_head'`` excluded.
    """
    linear_4bit = bitsandbytes.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_4bit)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')

    return list(leaf_names)

In [None]:
# Names of the 4-bit linear layers; used as the LoRA target modules below.
modules = find_all_linear_names(model)

In [None]:
# LoRA configuration for phase 2
lora_config_phase2 = LoraConfig(
    r=4,  # rank of the low-rank approximation - lower for phase 2
    lora_alpha=16,  # scaling factor
    target_modules=modules,  # layer names collected by find_all_linear_names
    lora_dropout=0.1,  # dropout rate
    bias="none",  # whether to train biases
    task_type="CAUSAL_LM",
)

In [None]:
# Step 1: enable gradient checkpointing to reduce memory usage during fine-tuning.
model.gradient_checkpointing_enable()

In [None]:
# Step 2: prepare the quantized model for training with PEFT's
# prepare_model_for_kbit_training.
model = prepare_model_for_kbit_training(model)

In [None]:
# Wrap the model with LoRA adapters as configured in lora_config_phase2.
model = get_peft_model(model, lora_config_phase2)

In [None]:
# Phase 2 training setup: batch size 2 with 2 accumulation steps, 10
# optimizer steps in total, logging on every step.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    warmup_steps=2,
    max_steps=10,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",
)

In [None]:
# Trainer for phase 2: causal-LM collation (mlm=False) over the preprocessed
# instruction dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=phase2_qs_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Turn off the model's cache setting before training.
# NOTE(review): presumably because caching conflicts with gradient
# checkpointing — confirm.
model.config.use_cache = False

In [None]:
# Run phase 2 training; the result object carries the training metrics.
train_result = trainer.train()

In [None]:
# Log the training metrics, save them together with the trainer state, then
# print them.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)

In [None]:
# Save the trained phase 2 model to Drive; it is loaded again below through
# AutoPeftModelForCausalLM.
output_dir = '/content/drive/MyDrive/saved_models/phase2'
trainer.model.save_pretrained(output_dir)

In [None]:
# Drop the training objects, then release memory through the notebook's
# helper: it runs the garbage collector before emptying the CUDA cache, so
# objects that are no longer referenced are collected first.
del model
del trainer
free_gpu_memory()

In [None]:
# Reload the saved phase 2 model in bfloat16 and merge the adapter into the
# base weights, leaving a plain model without the PEFT wrapper.
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
model = model.merge_and_unload()

In [None]:
# Save the merged phase 2 model to Drive with safe serialization enabled.
output_merged_dir = "/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT2-v2"
model.save_pretrained(
    output_merged_dir,
    safe_serialization=True
)

In [None]:
# Reload the base tokenizer and store it next to the merged model. The same
# token and Drive cache as the first tokenizer load are used, so the files are
# not fetched again into a different cache.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token = auth_token,
    cache_dir = '/content/drive/MyDrive/model',
)
tokenizer.save_pretrained(output_merged_dir)

In [None]:
# List the files written to the merged-model folder. The path comes from
# output_merged_dir, so the listing always matches the folder just saved to.
print(',\n'.join(os.listdir(output_merged_dir)))

## Phase 2 Testing

In [None]:
# Switch the merged model to evaluation mode before generating.
model.eval()
print("Model loaded successfully.")

In [None]:
# System prompt placed in front of every test question; it sets the assistant's
# name, tone and answer scope.
system_prompt = """You are a helpful and informative assistant called "Assistant". Your goal is to provide accurate and relevant information to the user's queries.
Please ensure that your responses are succinct, respectful, and factual. Refrain from emoting.
If you're uncertain about a question, it's better to admit it rather than provide inaccurate information.
Respond to the User's question ONLY. Do not impersonate the User and do not include followup questions in your response unless prompted."""

In [None]:
prompt = "What is the concept of plaque-years in the context of LDL cholesterol and cardiovascular health?"

# System prompt first, then the User turn and an open Assistant turn.
prompt_with_system_prompt = f"{system_prompt}\nUser: {prompt}\nAssistant: "

# The encoded batch (input ids and attention mask) is moved to the device once.
inputs = tokenizer(prompt_with_system_prompt, return_tensors="pt").to(device)

outputs = model.generate(
    # Pass the already-moved batch as a whole instead of moving input_ids a
    # second time.
    **inputs,
    pad_token_id=tokenizer.eos_token_id,
    # max_new_tokens = 512,
    # temperature and top_p only apply when sampling is enabled, so request
    # it explicitly instead of relying on the saved generation config.
    do_sample = True,
    temperature = 0.5,
    top_p = 0.75,
)

In [None]:
# Full decoded sequence (prompt followed by the generated continuation).
response_text = tokenizer.decode(outputs[0], skip_special_tokens = True)

# Show only the newly generated tokens. Slicing off the prompt by token count
# keeps the whole answer even when the answer itself contains "Assistant: ",
# which a split on that marker would cut short.
prompt_token_count = inputs["input_ids"].shape[-1]
answer_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens = True)
display(Markdown(f"\n'''\n{answer_text.strip()}\n'''\n"))