# Installs

In [None]:
!pip install --upgrade torch torchvision torchaudio --quiet

In [None]:
!pip install --upgrade transformers --quiet

In [None]:
!pip install -i https://pypi.org/simple/ bitsandbytes --upgrade --quiet

In [None]:
!pip install accelerate --upgrade --quiet

In [None]:
!pip install datasets --quiet

In [None]:
!pip install PyMuPDF pdfminer.six --quiet

In [None]:
!pip install peft --quiet

In [None]:
!pip install trl --quiet

# Imports

In [None]:
import bitsandbytes
import accelerate

In [None]:
import torch

In [None]:
import gc

In [None]:
import os

In [None]:
from huggingface_hub import notebook_login

In [None]:
from datasets import Dataset

# HuggingFace/Drive integration

In [None]:
notebook_login()

In [None]:
from google.colab import drive, userdata
drive.mount('/content/drive')

# GPU

In [None]:
# Check device availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Free GPU memory

In [None]:
def free_gpu_memory():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache."""
    gc.collect()  # collect unreachable Python objects first
    torch.cuda.empty_cache()  # then ask PyTorch to release its cached CUDA memory

# Load LLM

In [None]:
from transformers import AutoTokenizer

In [None]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
auth_token = userdata.get('HF_TOKEN')

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=auth_token,
    cache_dir = '/content/drive/MyDrive/model',
)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# 4-bit quantisation settings passed to the model loader below:
# NF4 quantisation type with double quantisation enabled.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    # NOTE(review): the compute dtype is bfloat16 here, while the model below is
    # loaded with torch_dtype=float16 and training later sets fp16=True —
    # confirm this mix is intentional for the target GPU.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantisation config above. Downloaded files are
# cached under the mounted Drive folder given by cache_dir.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=auth_token,
    cache_dir = '/content/drive/MyDrive/model',
    torch_dtype=torch.float16,
    # NOTE(review): dynamic RoPE scaling (factor 2) is only applied on this
    # load; the later reload for merging does not pass it — verify that the
    # difference is acceptable.
    rope_scaling={"type": "dynamic", "factor": 2},
    low_cpu_mem_usage=True,
    device_map = "auto",
    quantization_config=quantization_config
)

# Phase 1

In [None]:
from pdfminer.high_level import extract_text

In [None]:
def extract_text_from_pdfs(pdf_paths):
    """Return the extracted text of every PDF in ``pdf_paths``.

    Each path is passed to ``extract_text``; the results are returned as a
    list in the same order as the input paths.
    """
    return [extract_text(pdf_file) for pdf_file in pdf_paths]

In [None]:
import glob
pdf_path = "/content/drive/MyDrive/data"

In [None]:
# glob returns matches in arbitrary order; sort them so the documents (and the
# training chunks built from them) are processed in a reproducible order.
pdf_paths = sorted(glob.glob(os.path.join(pdf_path, "*.pdf")))

In [None]:
texts = extract_text_from_pdfs(pdf_paths)

In [None]:
import re

def clean_text(text):
    """Normalise extracted PDF text into a single line of ASCII text.

    Non-ASCII characters are replaced with spaces, then every run of
    whitespace (spaces, tabs, newlines, ...) is collapsed into one space and
    leading/trailing whitespace is stripped.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text: no newlines, no non-ASCII characters and no runs of
        consecutive spaces.
    """
    # Replace non-ASCII runs first: doing this after the whitespace collapse
    # would re-introduce double spaces and trailing blanks.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # One pass over all whitespace also takes care of newlines, so no separate
    # newline handling is required.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
texts = [clean_text(text) for text in texts]

In [None]:
def chunk_text(text, chunk_size = 512, overlap = 50):
  """Split ``text`` into overlapping chunks of at most ``chunk_size`` tokens.

  The text is tokenized with the module-level ``tokenizer``. Consecutive
  windows start ``chunk_size - overlap`` tokens apart, and each window is
  converted back into a string.

  Args:
    text: The text to split.
    chunk_size: Maximum number of tokens per chunk; must be positive.
    overlap: Number of tokens shared by consecutive chunks; must satisfy
      ``0 <= overlap < chunk_size``.

  Returns:
    A list of chunk strings (empty when ``text`` yields no tokens).

  Raises:
    ValueError: If ``chunk_size`` or ``overlap`` is out of range.
  """
  # Validate up front: a zero stride would crash inside range() with an opaque
  # message, and a negative stride would silently produce no chunks at all.
  if chunk_size <= 0 or not 0 <= overlap < chunk_size:
    raise ValueError(
        f"expected chunk_size > 0 and 0 <= overlap < chunk_size, "
        f"got chunk_size={chunk_size}, overlap={overlap}"
    )

  tokens = tokenizer.tokenize(text)
  stride = chunk_size - overlap
  chunks = []
  for start in range(0, len(tokens), stride):
    window = tokens[start:start + chunk_size]
    chunks.append(tokenizer.convert_tokens_to_string(window))
    # Once a window reaches the end of the text, any further window would be
    # fully contained in this one and only duplicate its tail.
    if start + chunk_size >= len(tokens):
      break
  return chunks

In [None]:
# Define tokenize function
def tokenize_function(examples):
  """Chunk and tokenize a batch of documents for ``Dataset.map(batched=True)``.

  Every entry of ``examples['text']`` is split with ``chunk_text``; each chunk
  is tokenized on its own, padded/truncated to 512 tokens. The per-chunk
  encodings are then regrouped into one dict mapping each tokenizer output
  key to the list of values across all chunks.
  """
  encodings = [
      tokenizer(piece, padding="max_length", truncation=True, max_length=512)
      for document in examples['text']
      for piece in chunk_text(document)
  ]

  # Column-wise layout: one list per key, one entry per chunk. The key set is
  # taken from the first encoding.
  columns = {name: [] for name in encodings[0].keys()}
  for encoding in encodings:
    for name, values in encoding.items():
      columns[name].append(values)
  return columns

In [None]:
# Create a dataset from the extracted texts
texts_dataset = Dataset.from_dict({"text": texts})
tokenized_dataset = texts_dataset.map(tokenize_function, batched = True, remove_columns=["text"])

In [None]:
# Add labels (in causal language modeling, labels are the same as input_ids)
def add_labels(example):
    """Attach a ``labels`` entry holding a copy of ``input_ids``.

    The example is updated in place and also returned.
    """
    label_ids = example['input_ids'].copy()
    example['labels'] = label_ids
    return example

tokenized_dataset = tokenized_dataset.map(add_labels, batched=False)

## Phase 1 fine tuning

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM

In [None]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    """List the distinct leaf names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()``, keeps the modules that are instances of
    ``bitsandbytes.nn.Linear4bit`` and records the last dotted component of
    each module path. ``'lm_head'`` is excluded from the result.
    """
    linear_4bit = bitsandbytes.nn.Linear4bit
    leaf_names = {
        path.split('.')[-1]
        for path, layer in model.named_modules()
        if isinstance(layer, linear_4bit)
    }
    # Leave out the output head if it was picked up (needed for 16-bit).
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [None]:
modules = find_all_linear_names(model)

In [None]:
# Define the LoRA configuration
lora_config_phase1 = LoraConfig(
    r = 16,  # rank of the low-rank approximation
    lora_alpha = 64,  # scaling factor
    target_modules = modules,  # target specific modules
    lora_dropout = 0.1,  # dropout rate
    bias = "none",  # whether to train biases
    task_type = "CAUSAL_LM"
)

In [None]:
# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
model.gradient_checkpointing_enable()

In [None]:
# 2 - Using the prepare_model_for_kbit_training method from PEFT
model = prepare_model_for_kbit_training(model)

In [None]:
model = get_peft_model(model, lora_config_phase1)

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
# Hyper-parameters for the phase-1 fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,  # accumulate gradients over 4 batches per optimizer step
    warmup_steps = 2,
    max_steps = 15,  # training is capped at 15 steps
    learning_rate = 2e-4,
    fp16 = True,  # NOTE(review): the quantisation config uses a bfloat16 compute dtype — confirm the mix
    logging_steps = 1,  # log at every step
    output_dir = "outputs",
    optim = "paged_adamw_8bit",
)

In [None]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset,
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
model.config.use_cache = False

In [None]:
train_result = trainer.train()

In [None]:
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics)

In [None]:
output_dir = '/content/drive/MyDrive/saved_models/phase1'
trainer.model.save_pretrained(output_dir)

In [None]:
# Drop the notebook's references to the fine-tuned model and the trainer
# before the adapter is reloaded for merging.
del model
del trainer
# empty_cache() can only release memory whose tensors have actually been
# destroyed, so run the garbage collector first (via the notebook's helper)
# to also free objects that were kept alive by reference cycles.
free_gpu_memory()

In [None]:
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
model = model.merge_and_unload()

In [None]:
output_merged_dir = "/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT1"
model.save_pretrained(
    output_merged_dir,
    safe_serialization=True
)

In [None]:
# Reload the tokenizer and store it next to the merged weights. Reloading
# yields a fresh instance without the pad_token override applied earlier.
# Use the same token and cache_dir as the initial load so the gated repo is
# accessed with explicit credentials and the cached files are reused.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=auth_token,
    cache_dir = '/content/drive/MyDrive/model',
)
tokenizer.save_pretrained(output_merged_dir)

In [None]:
print(',\n'.join(os.listdir("/content/drive/MyDrive/saved_models/LLama2-7B-chat-PT1")))

## Phase 1 Testing

In [None]:
model.eval()
print("Model loaded successfully.")

In [None]:
from IPython.display import display, Markdown

In [None]:
# Define a system prompt to guide the responses of the chatbot
system_prompt = """You are a helpful and informative assistant called "Assistant". Your goal is to provide accurate and relevant information to the user's queries.
Please ensure that your responses are succinct, respectful, and factual. If you're uncertain about a question, it's better to admit it rather than provide inaccurate information."""

In [None]:
prompt = "What is the concept of plaque-years in the context of LDL cholesterol and cardiovascular health?"

# Prepend the system prompt and frame the exchange as a User/Assistant dialogue.
prompt_with_system_prompt = f"{system_prompt}\nUser: {prompt} Assistant: "

inputs = tokenizer(prompt_with_system_prompt, return_tensors="pt").to(device)

# temperature and top_p only influence sampling, so request sampling explicitly
# rather than relying on whatever the checkpoint's generation config says.
# No length limit is passed here; the model's generation config applies.
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.5,
    top_p=0.75
)

# Full decoded sequence (prompt followed by the generated continuation).
response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# The output sequence starts with the prompt tokens, so slice them off by
# length to get the answer. Splitting the decoded text on 'Assistant: ' would
# cut the answer short whenever the model emits that marker itself.
prompt_length = inputs["input_ids"].shape[1]
answer_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

display(Markdown(f"\n'''\n{answer_text}\n'''\n"))