In [1]:
# ! pip -q install bitsandbytes accelerate xformers einops peft datasets
# ! pip -q install --upgrade huggingface_hub
# ! pip -q install PyPDF2 pycryptodome==3.15.0 sentencepiece

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import torch
import transformers
import re, random
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
# !nvidia-smi

In [5]:
# ! pip install bitsandbytes --prefer-binary --force-reinstall

In [6]:
# ! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
from dotenv import load_dotenv
import os

model_id = "mistralai/Mistral-7B-Instruct-v0.1"
bnb_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

load_dotenv('../.env')  # Load environment variables from .env file
hf_api_key = os.getenv("HF_API_KEY_HOME")
login(token=hf_api_key)  # HF HOME

model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map='auto',
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id,
padding_side="left",
add_eos_token=True,
add_bos_token=True
)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:35<00:00, 17.92s/it]


In [8]:
import PyPDF2
import re

def pdf_to_text(pdf_path, skip_start_pages=0, skip_last_pages=0, header_lines=1, footer_lines=1):
  with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    num_pages = len(pdf_reader.pages)

    for page_num in range(skip_start_pages - 1, num_pages-skip_last_pages):
      page = pdf_reader.pages[page_num]
      page_text = page.extract_text()

      lines = page_text.splitlines(True)[header_lines:-footer_lines] # Removing header and footer

      lines_modified = []
      for line in lines:

        # Optional pre-processing of lines to correct errors
        # None        

        lines_modified.append(line)

      lines_joined = "".join(lines_modified)
      text += lines_joined
  return text

pdf_file_path = "../DataSets/Literature Driven Datasets/Applications of CNTs/A Brief Introduction of Carbon Nanotubes- History, Synthesis, and Properties.pdf"
raw_text = pdf_to_text(pdf_file_path, skip_start_pages=5, skip_last_pages=0, header_lines=2, footer_lines=1)
print(len(raw_text))

20701


In [9]:
import json

def format_question_prompt(input):
  return f"""Below is a input text extracted from a document. Generate one question that is answered with the given text. Do not include any answer in the generated text.

### Input:
 {input}
"""

def format_answer_prompt(input,instruction):
  return f"""Below is an instruction that describes a task or a question, paired with an input that provides context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}
"""

def generate_response(prompt):
  chat = [
      {"role": "user", "content": prompt},
  ]
  text = tokenizer.apply_chat_template(chat, tokenize=False)
  encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False).to('cuda')
  generated_ids = model.generate(**encodeds, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1024, do_sample=True)
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  # Remove prompt and special characters
  cleaned_response = decoded.split(prompt, 1)[1]  # Split by prompt and keep the second part
  cleaned_response = tokenizer.clean_up_tokenization(cleaned_response)  # Remove special tokens
  cleaned_response = re.sub(r"\ \[/INST\] ", "", cleaned_response)  # Match and replace [/INST] followed by a space

  return cleaned_response

sample_char_len_ar = [1024, 512, 256]
num_interations = 40
data= []

if not os.path.exists("data.json"):
  for  sample_char_len in sample_char_len_ar:
      for i in range(num_interations):
          print(i)
          random_integer = random.randint(0, len(raw_text) - sample_char_len)
          document_sample = raw_text[random_integer:random_integer+sample_char_len]
          words = document_sample.split()
          words.pop(0)
          words.pop()
          document_sample = " ".join(words)
          document_sample = document_sample.replace("\u2010", " ")
          prompt = format_question_prompt(document_sample)
          instruction = generate_response(prompt)

          prompt = format_answer_prompt(document_sample, instruction)
          output = generate_response(prompt)

          data.append({
            "instruction": instruction,
            "input": ' '.join(document_sample.split()),
            "output": output
          })

  with open("data.json", "w") as f:  # Open in write mode to overwrite
      json.dump(data, f, indent=4)  # Write the formatted data to the file

  print("Data successfully written to JSON file.")

In [10]:
# len(data)

In [11]:
import json
import random
from datasets import Dataset

with open("data.json", "r") as f:
   dataset = json.load(f)

if isinstance(dataset, list):

    random.shuffle(dataset)

    train_dataset_list = dataset[:100]
    eval_dataset_list = dataset[100:]

    for item in train_dataset_list:
        item["input"] = ""

    for item in eval_dataset_list:
        item["input"] = ""

    with open("train_data.json", "w") as f:
        json.dump(train_dataset_list, f)

    with open("eval_data.json", "w") as f:
        json.dump(eval_dataset_list, f)

else:
    print("Data is not a list, cannot shuffle.")

In [12]:
import json
from datasets import Dataset

def format_prompt_user(sample):
    return f"""
Below is an instruction that describes a task or a question, paired with an optional input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample["instruction"]}

### Input:
{sample["input"]}
"""

def format_prompt_assistant(sample):
    return f"""
### Response:
{sample["output"]}
"""

def format_prompt_chat(sample):
    chat = [
        {"role": "user", "content": format_prompt_user(sample)},
        {"role": "assistant", "content": format_prompt_assistant(sample)}
    ]
    text = tokenizer.apply_chat_template(chat, tokenize=False)
    return {"formatted_text": text}

def generate_and_tokenize_prompt(sample):
    formatted_text = sample["formatted_text"]
    result = tokenizer(
        formatted_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

max_length = 2048  # differs from datasets to datasets

with open("train_data.json", "r") as f:
    train_dataset_list = json.load(f)

with open("eval_data.json", "r") as f:
    eval_dataset_list = json.load(f)

train_dataset = Dataset.from_list(train_dataset_list)
eval_dataset = Dataset.from_list(eval_dataset_list)

formatted_train_dataset = train_dataset.map(lambda x: format_prompt_chat(x))
formatted_eval_dataset = eval_dataset.map(lambda x: format_prompt_chat(x))

tokenized_train_dataset = formatted_train_dataset.map(generate_and_tokenize_prompt)
tokenized_eval_dataset = formatted_eval_dataset.map(generate_and_tokenize_prompt)

print(f'Training data size: {len(tokenized_train_dataset)}')
print(f'Validation data size: {len(tokenized_eval_dataset)}')

Map: 100%|██████████| 100/100 [00:00<00:00, 547.10 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 2498.99 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 224.63 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 527.39 examples/s]

Training data size: 100
Validation data size: 20





In [13]:
per_device_train_batch_size=1,
gradient_accumulation_steps=1,

In [14]:
import torch
import transformers
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from datetime import datetime
import time # Import the time module

# Assume 'model' and 'tokenizer' are already loaded
# from transformers import AutoTokenizer, AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(...)
# tokenizer = AutoTokenizer.from_pretrained(...)

# Assume 'tokenized_train_dataset' and 'tokenized_eval_dataset' are prepared
# tokenized_train_dataset = ...
# tokenized_eval_dataset = ...


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

project = "ECSS-E-ST-50-51C-finetune-test-#3"
base_model_name = "Mistral-7B-Instruct"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

# Preapre model
# model.config.pretraining_tp = 1 # This may not be necessary depending on the model loading
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
        r=8,
        lora_alpha=64,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] # "lm_head" is not recommended for training with LoRA
    )
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        max_steps=50, #MAX STEPS
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=10, # Changed for more frequent logging
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=10,               # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=10,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        report_to="none",            # Set to "none" to disable wandb and log only locally
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

# --- Training Time Measurement ---
start_time = time.time()

# trainer.train(resume_from_checkpoint = True)
train_result = trainer.train()

end_time = time.time()
training_time = end_time - start_time

# --- Print Training Information ---
print("--- Training Summary ---")
print(f"Total training time: {training_time:.2f} seconds")
print(f"Training run name: {run_name}")

# You can also access detailed metrics from the train_result object
print("\n--- Trainer Log History ---")
print(train_result)

trainable params: 20971520 || all params: 3773042688 || trainable%: 0.5558251452256031


max_steps is given, it will override any value given in num_train_epochs
  0%|          | 0/50 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 12.00 GiB of which 0 bytes is free. Of the allocated memory 17.23 GiB is allocated by PyTorch, and 1.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import torch
import transformers
import re, random
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, PeftModel, PeftConfig, prepare_model_for_kbit_training, get_peft_model

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

project = "ECSS-E-ST-50-51C-finetune-test-#3"
base_model_name = "Mistral-7B-Instruct"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
check_point = "checkpoint-50"
adapter_model_id = output_dir + "/" + check_point

bnb_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(adapter_model_id)

# login(token="insert huggingface access token here")
login(token=hf_api_key)

model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map='auto',
)

ft_model = PeftModel.from_pretrained(
model,
adapter_model_id,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
model_id,
padding_side="left",
add_eos_token=True,
add_bos_token=True
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate_response(prompt):
  chat = [
      {"role": "user", "content": prompt},
  ]
  text = tokenizer.apply_chat_template(chat, tokenize=False)
  encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False).to('cuda')
  generated_ids = ft_model.generate(**encodeds, pad_token_id=tokenizer.eos_token_id, max_new_tokens=1024, do_sample=True)
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

  # Remove prompt and special characters
  cleaned_response = decoded.split(prompt, 1)[1]  # Split by prompt and keep the second part
  cleaned_response = tokenizer.clean_up_tokenization(cleaned_response)  # Remove special tokens
  cleaned_response = re.sub(r"\ \[/INST\] ", "", cleaned_response)  # Match and replace [/INST] followed by a space

  return cleaned_response

# Chat loop
while True:
  user_input = input("You: ")
  if user_input.lower() == "quit":
    break
  response = generate_response(user_input)
  print(f"Computer: {response}")