In [1]:
!pip install -Uqqq pip --progress-bar off
print("Installed PIP")
!pip install -Uqqq git+https://github.com/huggingface/transformers
print("Installed Transformers")
!pip install -Uqqq torch torchvision
print("Installed Torch")
!pip install -qqq -U peft
print("Installed PEFT")
!pip install -Uqqq accelerate
print("Installed Accelerate")
!pip install -qqq loralib==0.1.1 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off
print("Installed LoRA Lib")
!pip install -qqq bitsandbytes
print("Installed BitsAndBytes")
!pip install fsspec -qqq

!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0 -qqq
print("Installed Datasets")

Installed PIP
Installed Transformers
Installed Torch
Installed PEFT
Installed Accelerate
Installed LoRA Lib
Installed BitsAndBytes
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
cudf 23.8.0 requires cuda-python<12.0a0,>=11.7.1, but you have cuda

In [2]:
!pip install huggingface_hub -qqq

In [3]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login

from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)

from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig  
)

In [4]:
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map='auto',
    quantization_config=bnb_config,
    use_cache=False,
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [5]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        
        if param.requires_grad:
            trainable_params += param.numel()
    
    print(
        f"Trainable params: {trainable_params} || All params: {all_param} || trainable %: {100* trainable_params/all_param}"
    )

In [6]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
#     target_modules=['query_key_value'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

Trainable params: 6815744 || All params: 3758886912 || trainable %: 0.18132346515244138


In [8]:
gen_config = model.generation_config
gen_config.max_new_tokens = 200
gen_config.temperature = 0.7
gen_config.top_p = 0.7
gen_config.num_return_sequences = 1
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

In [10]:
from datasets import load_dataset

qa_dataset = load_dataset("suneeln-duke/duke_qac_v3")

Downloading readme:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/309k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/559 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/140 [00:00<?, ? examples/s]

In [11]:
qa_dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 559
    })
    val: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 140
    })
})

In [12]:
def generate_prompt(data_point):
    
    """"
    Update the prompt template:
    Combine both the prompt and input into a single column.

    """     
    bos_token = "<s>"
    
    original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    
    system_message = "Use the provided context followed by a question to answer it."
    
    full_prompt = f"""<s>### Instruction:
    {system_message}
    
    ### Context:
    {data_point['Context']}
    
    
    ### Question:
    
    {data_point['Question']}
    
    
    ### Aswer: 
    {data_point['Answer']}
    """
    
    full_prompt = " ".join(full_prompt.split())
    
    return full_prompt

In [13]:
def generate_and_tokenize_prompt(data_point):
    
    full_prompt = generate_prompt(data_point)
    
    tokenized_full_prompt = tokenizer(full_prompt, padding=True, truncation=True)
    
    return tokenized_full_prompt

In [14]:
train_dataset = qa_dataset["train"].shuffle().map(generate_and_tokenize_prompt)
val_dataset = qa_dataset["val"].shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [15]:
OUTPUT_DIR = "experiments"

In [16]:
train_dataset

Dataset({
    features: ['Question', 'Context', 'Answer', 'input_ids', 'attention_mask'],
    num_rows: 559
})

In [17]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    num_train_epochs = 1,
    learning_rate = 2e-4,
    fp16 = True,
    save_total_limit = 3,
    logging_steps = 10,
    output_dir = OUTPUT_DIR,
    max_steps = 200,
    optim = "paged_adamw_8bit",
    lr_scheduler_type = "cosine",
    warmup_ratio = 0.05,
)

In [18]:
trainer = transformers.Trainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    args = training_args,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

2024-04-16 21:42:02.014806: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 21:42:02.014930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 21:42:02.166363: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
max_steps is given, it will override any value given in num_train_epochs


In [19]:
model.config_use_cache = False

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
10,1.5127
20,1.3353
30,1.1662
40,1.0568
50,0.9145
60,0.6931
70,0.6304
80,0.493
90,0.5146
100,0.3932


TrainOutput(global_step=200, training_loss=0.5552666527032852, metrics={'train_runtime': 4275.4145, 'train_samples_per_second': 0.187, 'train_steps_per_second': 0.047, 'total_flos': 4.805090502303744e+16, 'train_loss': 0.5552666527032852, 'epoch': 1.4311270125223614})

In [20]:
model.save_pretrained("duke_qac_ft_v3")

In [21]:
from huggingface_hub import notebook_login
import huggingface_hub

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
huggingface_hub.login(token='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')

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
model.push_to_hub(
    "suneeln-duke/dukebot-qac-v2",
    use_auth_token=True
)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suneeln-duke/dukebot-qac-v2/commit/89288d21b6382e3ebb209c18bd00b96783be3421', commit_message='Upload model', commit_description='', oid='89288d21b6382e3ebb209c18bd00b96783be3421', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
PEFT_MODEL = "suneeln-duke/dukebot-qac-v2"

dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16

config = PeftConfig.from_pretrained(PEFT_MODEL)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict = True,
    quantization_config = bnb_config,
    device_map = "auto",
    torch_dtype = dtype,
    trust_remote_code = True
)

adapter_config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

In [26]:
model = PeftModel.from_pretrained(model, PEFT_MODEL)

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

In [27]:
model = model.merge_and_unload()



In [28]:
model.push_to_hub(
    "suneeln-duke/dukebot-qac-v1-merged",
    use_auth_token=True
)



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suneeln-duke/dukebot-qac-v1-merged/commit/5a06d1bfe094d57a2ee5d91b1251751a0948c1a5', commit_message='Upload MistralForCausalLM', commit_description='', oid='5a06d1bfe094d57a2ee5d91b1251751a0948c1a5', pr_url=None, pr_revision=None, pr_num=None)

In [29]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer.push_to_hub(
    "suneeln-duke/dukebot-qac-v1-merged",
    use_auth_token=True
)



CommitInfo(commit_url='https://huggingface.co/suneeln-duke/dukebot-qac-v1-merged/commit/b70688c4caa85a88c17d2d24a1e367b0f446e936', commit_message='Upload tokenizer', commit_description='', oid='b70688c4caa85a88c17d2d24a1e367b0f446e936', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
def generate_prompt_test(data_point):
    
    """"
    Update the prompt template:
    Combine both the prompt and input into a single column.

    """            
    bos_token = "<s>"
    
    original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    
    system_message = "Use the provided context followed by a question to answer it."
    
    full_prompt = f"""<s>### Instruction:
    {system_message}
    
    ### Context:
    {data_point['Context']}
    
    
    ### Question:
    
    {data_point['Question']}
    
    
    ### Aswer: 
    """
    
    full_prompt = " ".join(full_prompt.split())
    
    return full_prompt

In [31]:
sample = qa_dataset['val'][5]

In [32]:
DEVICE = 'cuda:0'

In [33]:
%%time

encoding = tokenizer(generate_prompt_test(sample), return_tensors='pt').to(DEVICE)

with torch.inference_mode():
    outputs = model.generate(
        input_ids = encoding.input_ids,
        attention_mask = encoding.attention_mask,
        generation_config = gen_config
    )
    
resp = tokenizer.decode(outputs[0], skip_special_tokens=True)



In [34]:
resp

"### Instruction: Use the provided context followed by a question to answer it. ### Context: engage directly with representatives from their company partner organization as well as receive guidance from Duke faculty members over the course of the project. The team will present their final deliverables to a sponsor panel and/or an external review panel. MENG 550: Master of Engineering Internship or Project- Internships are typically 8-12 weeks. The minimum hourly requirement for the internship is 320 hours, equivalent to 8 weeks, 40 hours per week. Projects require approval from the AIPI program director. Projects must fulfill the same learning objectives as internships. Although students are responsible for finding their own internship, Duke provides an experienced career development team to help with your search. All internships/projects must: Apply engineering principles to solving one or more problems outside the classroom environment Define a problem and determine potential solutio

In [54]:
from typing import Any, Dict, List

import torch

import transformers

from transformers import AutoModelForCausalLM, AutoTokenizer

dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16

class EndpointHandler:
    def __init__(self, path=""):
        tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code = True)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            return_dict = True,
            device_map = "auto",
            load_in_8bit = True,
            torch_dtype = dtype,
            trust_remote_code = True,
        )
        
        gen_config = model.generation_config
        gen_config.max_new_tokens = 100
        gen_config.temperature = 0
        gen_config.num_return_sequences = 1
        gen_config.pad_token_id = tokenizer.eos_token_id
        gen_config.eos_token_id = tokenizer.eos_token_id
        
        self.generation_config = gen_config
        
        self.pipeline = transformers.pipeline(
            'text-generation', model=model, tokenizer=tokenizer
        )
       
     
      
    def __call__(self, data: Dict[dict, Any]) -> Dict[str, Any]:
        question = data.pop("question", data)
        
        context = data.pop("context", None)
        
        temp = data.pop("temp", None)
        
        max_tokens = data.pop("max_tokens", None)
        
        bos_token = "<s>"

        original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

        system_message = "Use the provided context followed by a question to answer it."

        full_prompt = f"""<s>### Instruction:
        {system_message}

        ### Context:
        {context}


        ### Question:

        {question}


        ### Answer: 
        """

        full_prompt = " ".join(full_prompt.split())
        
        self.generation_config.max_new_tokens = max_tokens
        self.generation_config.temperature = temp
        
        result = self.pipeline(full_prompt, generation_config = self.generation_config)[0]['generated_text']
               
        match = re.search(r'### Answer:(.*?)###', result, re.DOTALL)
        
        if match:
            result =  match.group(1).strip()
            
        pattern = r"### Answer:(.*)"

        match = re.search(pattern, result)
        
        if match:
            result = match.group(1).strip()      
        
        return result

In [55]:
MODEL_ID = "suneeln-duke/dukebot-qac-v1-merged"

my_handler = EndpointHandler(path=MODEL_ID)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [56]:
sample['Question']

'What are some of the key components of the summer internship or industry project offered by the sponsoring organization as described in the text?'

In [57]:
%%time
import re
payload = {
    
    "question": sample['Question'],
    "context": sample['Context'],
    "max_tokens": 250,
    "temp": 0.6
    
}

prediction = my_handler(payload)



In [58]:
prediction

'The summer internship or industry project offered by the sponsoring organization in the Duke Artificial Intelligence Master of Engineering Program includes the following key components: 1. Designing a solution to an authentic opportunity offered by the sponsoring organization: Students are tasked with developing a solution to a real-world problem or challenge presented by the organization. This project allows students to apply their technical AI/ML skills and product development knowledge to solve a practical business problem. 2. Gaining industry experience: The internship or industry project provides students with valuable hands-on experience in the field of artificial intelligence, allowing them to work directly with industry professionals and gain insights into the real-world applications of AI technologies. 3. Professional development: The internship or industry project is an opportunity for students to develop their professional skills, such as communication, teamwork, and proble

In [83]:
import re

def extract_response(text):
    match = re.search(r'### Response:(.*?)###', text, re.DOTALL)
    if match:
        return match.group(1).strip()
    else:
        return None

In [94]:
extract_response(prediction[0]['generated_text'])

'Denver Broncos Which NFL team represented the NFC at Super Bowl 50?'