In [1]:
# !pip install git+https://github.com/huggingface/transformers.git@main accelerate
# !pip install wandb
# %run dependencies.ipynb
# !pip -qq install -U bitsandbytes accelerate
# !where python

In [1]:
import json
import os
import random
import re
from datetime import datetime
from functools import partial

import accelerate
import bitsandbytes as bnb
import pandas as pd
import torch

from datasets import load_dataset,concatenate_datasets, load_from_disk, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import pipeline, LlamaForCausalLM, LlamaTokenizerFast, AutoTokenizer, Trainer, TrainingArguments, \
                         BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainerCallback, AutoModelForCausalLM

In [2]:
# Sanity check: confirm that PyTorch can see a CUDA device before the model is loaded.
torch.cuda.is_available()

True

In [3]:
# !export PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True,max_split_size_mb=1024'

In [5]:
# Root directory that holds the fine-tuned checkpoints and the prediction CSV.
path = "/notebooks/fsdp_qlora/models"

# Repository tag embedded in the model name below.
repo = "redis"
# Name of the fine-tuned model; it is also the sub-directory of `path` the adapter
# weights are read from, and its substrings (e.g. "qlora") select the loading branch.
model_name = f"CodeLlama-13b-Instruct-{repo}-QLoRA-3"
# CSV file that generated candidates are written to and resumed from.
file_path = f"{path}/{model_name}_predictions.csv"

In [6]:
# List every CUDA device visible to PyTorch together with its name.
for device_idx in range(torch.cuda.device_count()):
    print(f"Device {device_idx}: {torch.cuda.get_device_name(device_idx)}")

Device 0: NVIDIA GeForce RTX 4090
Device 1: NVIDIA GeForce RTX 4090


In [None]:
# Collect the names of all "bench-v*" sub-directories found directly under BENCH.
BENCH = "/streamlit_app"
versions = [
    entry
    for entry in os.listdir(BENCH)
    if entry.startswith("bench-v") and os.path.isdir(f"{BENCH}/{entry}")
]

In [8]:
# Load the benchmark description; `extract_signature` looks up each sample's source code in it.
# The encoding is pinned because JSON is UTF-8 by specification, whereas open() would
# otherwise fall back to a platform-dependent default.
# NOTE(review): the CSV read in the generation cell is bench-v0.6.1 while this file is
# bench-v0.6 — confirm the two versions are meant to differ.
with open("/bench/bench-v0.6.json", "r", encoding="utf-8") as f:
    ben_json = json.load(f)


def extract_signature(row):
    """Return the single-line signature of the benchmark function referenced by ``row``.

    The function's source is looked up in the module-level ``ben_json`` mapping using
    ``row['Unnamed: 0']`` as the key. Everything before the first ``{`` is treated as
    the signature, and runs of carriage returns, newlines and tabs are removed from it.

    Args:
        row: Mapping-like record (e.g. a pandas row) with an ``'Unnamed: 0'`` entry.

    Returns:
        str: The signature text with CR/LF/TAB characters stripped out.
    """
    source = ben_json[row['Unnamed: 0']]['code']
    # partition() yields the text before the first '{' (the whole string if there is none).
    head, _, _ = source.partition('{')
    # The character class holds literal CR, LF and TAB characters.
    return re.sub("[\r\n\t]+", "", head)


# def generate_prompt(row):
#     lang = "C++"
#     if row['repository'] == "openssl" or row['repository'] == "redis":
#         lang = 'C'
#     system = f"Generate a single function on {lang} programming language satisfying the following signature and description. Please, do not add any comments or any text other than code for this function. Do not add includes, parent class or other methods."
#     user = f"Function signature: {extract_signature(row)}. Function description: {row['doc']}."
#     prompt = f"<s>[INST] <<SYS>> {system} \n <</SYS>>\n{user} [/INST]"
#     return prompt


def generate_prompt(row):
    """Build the chat messages used to request code for one benchmark sample.

    Args:
        row: Mapping-like record providing ``"repository"`` and ``"doc"`` entries, plus
            whatever ``extract_signature`` reads from it.

    Returns:
        list[dict]: Two messages — the system message followed by the user message —
        each a dict with ``"role"`` and ``"content"`` keys.
    """
    # openssl and redis samples are requested in C; every other repository in C++.
    lang = "C" if row["repository"] in ("openssl", "redis") else "C++"
    description = row["doc"]

    # Earlier system-prompt wordings, kept for reference:
    #   f"\nGenerate a function on {lang} programming language.\n"
    #   f"You're a specialized AI assisting with generating function code on {lang}. You are very good at generating code."
    #   f"\nGenerate a single function on {lang} programming language satisfying the following signature and description. Please, do not add any comments or any text other than code for this function. Do not add includes, parent class or other methods."
    system_content = f"Generate code on {lang}:\n"
    user_content = f'Function signature: {extract_signature(row)}. Function description: {description}.'

    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

In [9]:
import torch

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from safetensors import safe_open
from transformers import LlamaForCausalLM, BitsAndBytesConfig, GenerationConfig


# 4-bit NF4 quantisation settings for the base model (no double quantisation, fp16 compute).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantised base model; device_map="auto" leaves device placement to the library.
model = LlamaForCausalLM.from_pretrained(
    "codellama/CodeLlama-13b-Instruct-hf",
    quantization_config=bnb_config,
    # attn_implementation="flash_attention_2",
    device_map="auto",
    resume_download=None,
)
# This notebook only generates text, so gradients are switched off for every parameter.
for weight in model.parameters():
    weight.requires_grad = False

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-13b-Instruct-hf", resume_download=None)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Attach the fine-tuned adapter weights when the model name marks a quantised fine-tune.
if any(x in model_name.lower() for x in ["hqq", "bnb", "qlora", "qdora"]):
    # Read every tensor stored in the fine-tuned checkpoint into memory.
    with safe_open(f'{path}/{model_name}/model_state_dict.safetensors', framework="pt") as f:
        tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Add LoRA (make sure your rank (r) and alpha (lora_alpha) values match those used in training!)
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, inference_mode=False, r=64, lora_alpha=16, lora_dropout=0.1,
        target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"]
    )
    model = get_peft_model(model, peft_config)

    # Overwrite the freshly initialised adapter entries with the trained tensors:
    # for QLoRA every key containing 'lora', for QDoRA every key present in the checkpoint.
    new_sd = model.state_dict()
    for k in new_sd:
        if ("qlora" in model_name.lower() and 'lora' in k) or ("qdora" in model_name.lower() and k in tensors):
            new_sd[k] = tensors[k]

    model.load_state_dict(new_sd)

    # Newly constructed modules start in training mode, which would leave the
    # lora_dropout layers active while generating; switch the whole model to eval mode.
    model.eval()

In [11]:
# Report the loaded model's memory footprint (in GB) and its parameter count (in billions).
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Number of parameters: {model.num_parameters() / 1e9:.2f} B")

Memory footprint: 8.23 GB
Number of parameters: 13.24 B


In [12]:
# Decoding settings used by `inference`: no sampling, beam search with 10 beams in
# 10 groups and a diversity penalty of 1.0, returning 10 sequences per prompt, each
# limited to 384 newly generated tokens. `num_return_sequences` is also what
# `save_predictions` uses to name the CSV columns.
generation_config = GenerationConfig(
    max_new_tokens=384,
    do_sample=False,
    num_beams=10,
    num_beam_groups=10,
    diversity_penalty=1.0,
    num_return_sequences=10,
    # top_k=10,
    # top_p=0.95,
    # temperature=0.8,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

In [13]:
def inference(sample):
    """Generate candidate completions for one benchmark sample.

    The chat messages built by ``generate_prompt`` are rendered with the tokenizer's
    chat template, moved to the model's device and passed to ``model.generate`` together
    with the module-level ``generation_config``.

    Args:
        sample: Record accepted by ``generate_prompt``.

    Returns:
        list[str]: One decoded, whitespace-stripped text per generated sequence, with
        the leading prompt-length tokens removed and special tokens skipped.
    """
    messages = generate_prompt(sample)
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    generated = model.generate(input_ids, generation_config=generation_config)

    # Drop the first `prompt_length` tokens (the prompt portion) of each sequence
    # so that only the newly generated part is decoded.
    prompt_length = len(input_ids[0])
    completions = []
    for sequence in generated:
        text = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        completions.append(text.strip())
    return completions

In [14]:
def save_predictions(candidates):
    """Write the generated candidates to the CSV file at ``file_path``.

    Args:
        candidates: Dict mapping a sample key to its list of generated candidates.
            Each key becomes a row (stored in the CSV's index column) and each list
            position a column named "0", "1", ... up to
            ``generation_config.num_return_sequences - 1``.
    """
    column_names = [str(position) for position in range(generation_config.num_return_sequences)]
    predictions_df = pd.DataFrame.from_dict(candidates, orient='index', columns=column_names)
    predictions_df.to_csv(file_path)


def load_predictions():
    """Load previously saved candidates from the CSV file at ``file_path``.

    Returns:
        dict: Maps each sample key (the CSV's unnamed first column) to the list of
        candidates stored for it. Empty when the file does not exist.
    """
    if not os.path.isfile(file_path):
        return {}

    # keep_default_na=False: an empty candidate is written as an empty cell and has to
    # come back as "" rather than as a float NaN.
    df = pd.read_csv(file_path, keep_default_na=False)
    # Read exactly the columns save_predictions writes instead of a hard-coded count.
    columns = [f"{i}" for i in range(generation_config.num_return_sequences)]
    return {
        row["Unnamed: 0"]: [row[column] for column in columns]
        for _, row in df.iterrows()
    }

In [15]:
# Print the wall-clock time before the generation loop runs.
print(datetime.now())

2024-05-30 18:49:08.649151


In [None]:
import pandas as pd
from tqdm.notebook import tqdm

# `debug` is not assigned in any visible cell; default to a full (non-debug) run so the
# notebook still works after "Restart & Run All". An existing value is left untouched.
debug = globals().get("debug", False)

df = pd.read_csv('/bench/bench-v0.6.1.csv')
# NOTE(review): `code` is not used in the visible cells — remove it if nothing else reads it.
code = []

# Resume from previously saved predictions unless running in debug mode.
candidates = {} if debug else load_predictions()
for index, row in tqdm(df.iterrows(), total=len(df)):
    # Skip samples that already have predictions.
    if row["Unnamed: 0"] in candidates:
        continue

    if debug:
        # Debug mode: show the rendered prompt and the first five candidates for a
        # single sample, then stop without saving anything.
        prompt = generate_prompt(row)
        print(tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            tokenize=False,
        ))
        r = inference(row)
        for i in range(5):
            print(r[i])
            print("---------------------------------------------------------------------------------------------------------------------------------------------")
        break
    else:
        candidates[row["Unnamed: 0"]] = inference(row)
        # Persist after every sample: generation is slow and the CSV is small, so an
        # interrupted run can be resumed by load_predictions without losing finished work.
        save_predictions(candidates)

if not debug:
    save_predictions(candidates)

  0%|          | 0/214 [00:00<?, ?it/s]

In [None]:
# Print the wall-clock time after the generation loop has finished.
print(datetime.now())

In [None]:
# Number of samples that currently have predictions.
len(candidates)

In [17]:
# Print every candidate stored for one sample, each followed by a separator line.
# Iterating over the list itself keeps this correct if the number of returned
# sequences is changed in the generation config.
k = "79D9B4DB619F85EB" # list(candidates)[0]
for candidate in candidates[k]:
    print(candidate)
    print("---------------------------------------------------------------------------------------------------------------------------------------------")

```
SIMD_FORCE_INLINE void internalApplyImpulse(const btVector3& linearComponent, const btVector3& angularComponent, const btScalar impulseMagnitude) {
    // Calculate the impulse magnitude
    btScalar impulse = impulseMagnitude * (1.0f / (1.0f + linearComponent.lengthSquared()));

    // Calculate the linear impulse
    btVector3 linearImpulse = linearComponent * impulse;

    // Calculate the angular impulse
    btVector3 angularImpulse = angularComponent * impulse;

    // Apply the impulse
    body->applyImpulse(linearImpulse, angularImpulse);
}
```
This function takes in three parameters:

* `linearComponent`: The linear component of the impulse.
* `angularComponent`: The angular component of the impulse.
* `impulseMagnitude`: The magnitude of the impulse.

The function first calculates the impulse magnitude by dividing the given impulse magnitude by the length squared of the linear component. This is done to ensure that the impulse magnitude is normalized.

Next, the function c

In [17]:
# Persist the in-memory candidates to the predictions CSV.
save_predictions(candidates)