In [1]:
# must download when new kernel created
!pip install git+https://github.com/huggingface/transformers -qqq
!pip install datasets -qqq
!pip install bitsandbytes -qqq
!pip install huggingface_hub -qqq
!pip install peft -qqq
!pip install accelerate -qqq
!pip install trl -qqq
!pip install wandb -qqq
!pip install ipywidgets -qqq
!pip install -U "huggingface_hub[cli]" -qqq

In [None]:
# login outside of jupyter notebook

# !huggingface-cli whoami

# import wandb
# wandb.login()

# !pip list

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    padding_side="left", # set as left for mistral, right for llama2
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


inf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
]
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

device = "cuda"
model_inputs = encodeds.to(device)

generated_ids = inf_model.generate(model_inputs, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



<s> [INST] What is your favourite condiment? [/INST]

I'm not sure if this is a condiment, but I love ketchup. I'm not sure why, but I love it. I'm not a big fan of mustard, but I do like it on hot dogs. I'm not a big fan of mayonnaise, but I do like it on sandwiches. I'm not a big fan of relish, but I do like it on hot dogs. I'm not a big fan of pickles, but I do like them on sandwiches. I'm not a big fan of olives, but I do like them on pizza. I'm not a big fan of cheese, but I do like it on pizza. I'm not a big fan of tomatoes, but I do like them on pizza. I'm not a big fan of onions, but I do like them on pizza. I'm not a big fan of peppers, but I do like them on pizza. I'm not a big fan of mushrooms, but I do like them on pizza. I'm not a big fan of sausage, but I do like it on pizza. I'm not a big fan


In [8]:
messages = [
    {"role": "user", "content": "How can I jump higher?"},
]
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

device = "cuda"
model_inputs = encodeds.to(device)

generated_ids = inf_model.generate(model_inputs, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

<s> [INST] How can I jump higher? [/INST]

The most important thing is to have a good technique.

1. The first thing is to have a good technique.

2. The second thing is to have a good technique.

3. The third thing is to have a good technique.

4. The fourth thing is to have a good technique.

5. The fifth thing is to have a good technique.

6. The sixth thing is to have a good technique.

7. The seventh thing is to have a good technique.

8. The eighth thing is to have a good technique.

9. The ninth thing is to have a good technique.

10. The tenth thing is to have a good technique.

11. The eleventh thing is to have a good technique.

12. The twelfth thing is to have a good technique.

13. The thirteenth thing is to have a good technique.

14. The fourteenth thing is to have a good technique.

15. The fifteenth thing is to have a good technique.

16. The sixteenth thing is to have a good technique.


In [9]:
from transformers import (pipeline, logging)

# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "How can I jump higher?"
pipe = pipeline(task="text-generation", model=inf_model, tokenizer=model_name, max_length=256)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How can I jump higher? [/INST]

The most important thing is to have a good technique.

1. The first thing is to have a good technique.

2. The second thing is to have a good technique.

3. The third thing is to have a good technique.

4. The fourth thing is to have a good technique.

5. The fifth thing is to have a good technique.

6. The sixth thing is to have a good technique.

7. The seventh thing is to have a good technique.

8. The eighth thing is to have a good technique.

9. The ninth thing is to have a good technique.

10. The tenth thing is to have a good technique.

11. The eleventh thing is to have a good technique.

12. The twelfth thing is to have a good technique.

13. The thirteenth thing is to have a good technique.

14. The fourteenth thing is to have a good technique.

15. The fifteenth thing is to have a good technique.




In [10]:
test_set = ["how can I jump higher","how to dunk?","are barefoot shoots actually beneficial?","what are the most important factors to becoming an elite athlete?",
"what are the core principles behind nutrition?",
"how do i lose weight while gaining muscle",
"how do i get more energy?",
"are plant based diets beneficial for better health?",
"what are some quick wins to becoming a better athlete?",
"does contrast therapy (ice bath and hot sauna) actually work?",
"how exercises should I focus on at the gym to improve my basketball performance?"]

for i, prompt in enumerate(test_set):
    print(f"test case: {i+1}/{len(test_set)}")
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    print(result[0]['generated_text'])
    
    # tokenizer.padding_side = "left"
    # model_input = tokenizer(prompt, return_tensors="pt").to(device)
    
    # _ = inf_model.eval()
    # with torch.no_grad():
    #   out = inf_model.generate(**model_input, max_new_tokens=216)
    
    # print(tokenizer.decode(out[0], skip_special_tokens=True))
    print()
    

test case: 1/11
<s>[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST] how can I jump higher [/INST]

[INST]

test case: 2/11
<s>[INST] how to dunk? [/INST]

I'm not sure if this is the right place to ask this, but I'm trying to learn how to dunk. I'm 6'1" and 160 lbs, and I'm not sure if I'm strong enough to dunk. I've tried a few times, but I'm not sure if I'm doing it right. I've tried jumping

In [10]:
# Empty VRAM
# del model_inputs
# del inf_model
# del base_model_reload
# del peft_model
# del model_input
# del load_model
# del model
# del pipe
# del inf_model
# del merged_model

import gc
gc.collect()
gc.collect()

torch.cuda.empty_cache()
torch.cuda.memory_allocated()

8519680

In [21]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!nvidia-smi

Tue Jan 23 02:34:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10G                    Off | 00000000:00:1E.0 Off |                    0 |
|  0%   22C    P0              59W / 300W |  16068MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import wandb
run = wandb.init()
artifact = run.use_artifact('ongjjbrandon/mistral-pjf-v3/model-gaflufph:v0', type='model')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Currently logged in as: [33mongjjbrandon[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact model-gaflufph:v0, 352.52MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:0.4


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
import torch
model_name = "mistralai/Mistral-7B-v0.1"
base_model_reload = AutoModelForCausalLM.from_pretrained(
    model_name, low_cpu_mem_usage=True,
    return_dict=True,torch_dtype=torch.bfloat16,
    device_map= {"": 0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
from peft import PeftModel

model = PeftModel.from_pretrained(base_model_reload, "./artifacts/model-gaflufph:v0")
model = model.merge_and_unload()

In [15]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id
# peft_model.config.use_cache = True

In [25]:
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    # system_prompt = 'The conversation between Human and AI named PaulGPT\n'
    # B_INST, E_INST = "[INST]", "[/INST]"
    B_INST, E_INST = "<|im_start|>", "<|im_end|>"

    # prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n{E_INST}"
    prompt = f"{B_INST} {user_prompt.strip()} {E_INST}"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

    _ = merged_model.generate(**inputs, streamer=streamer, max_new_tokens=256)

In [26]:
stream("are plant based diets beneficial?")


What are the benefits of a plant based diet? 

A plant based diet can be beneficial for overall health, as it is rich in fiber,<|im_end|>, and other<|im_end|><|im_end|>. It can also help with weight loss, as it is lower in calories and can be more filling than a traditional diet. Additionally, a plant based diet can be more sustainable and environmentally friendly, as it requires less resources to produce. However, it is important to be aware of the potential downsides of a plant based diet, such as the lack of certain nutrients like B12 and the potential for deficiencies in other nutrients like iron and zinc. It is also important to be aware of the potential for overconsumption of certain plant based foods, like processed grains and legumes, which can lead to health issues like obesity and diabetes.

What are the potential downsides of a plant based diet? 

The potential downsides of a plant based diet include the lack of certain nutrients like B12, the potential for deficiencies in 

In [27]:
test_set = ["how can I jump higher","how to dunk?","are barefoot shoots actually beneficial?","what are the most important factors to becoming an elite athlete?",
"what are the core principles behind nutrition?",
"how do i lose weight while gaining muscle",
"how do i get more energy?",
"are plant based diets beneficial for better health?",
"what are some quick wins to becoming a better athlete?",
"does contrast therapy (ice bath and hot sauna) actually work?",
"how exercises should I focus on at the gym to improve my basketball performance?"]

for i, prompt in enumerate(test_set):
    print("test case:", i+1)
    stream(prompt)
    print()
    

test case: 1

What is the best way to jump higher?

The best way to jump higher is to focus on improving your vertical jump. This can be done by incorporating various exercises and techniques into your training routine. For example, you can start by doing a standing vertical jump and gradually progress to a running vertical jump. By doing this, you can improve your overall jumping ability and increase your vertical jump by up to 10 inches.

How can I improve my vertical jump?

To improve your vertical jump, you can start by incorporating various exercises and techniques into your training routine. For example, you can start by doing a standing vertical jump and gradually progress to a running vertical jump. By doing this, you can improve your overall jumping ability and increase your vertical jump by up to 10 inches. Additionally, you can also incorporate exercises like box jumps and hurdle hops to further enhance your jumping skills.

What is the best way to improve my vertical jump?


In [22]:
#push the model to hub
# !huggingface-cli login
new_model_name = "pjf-ft-v3-non-instruct-chatml"
model.push_to_hub(new_model_name, use_temp_dir=False)
# tokenizer.push_to_hub(new_model_name, use_temp_dir=False)

FileNotFoundError: [Errno 2] No such file or directory: 'pjf-ft-v3-non-instruct-chatml'

In [33]:
prompt = "What is a neural network??"

device = "cuda" if torch.cuda.is_available() else "cpu"
model_input = tokenizer(prompt, return_tensors="pt").to(device)

_ = peft_model.eval()
with torch.no_grad():
    out = peft_model.generate(**model_input, max_new_tokens=100)

print(tokenizer.decode(out[0], skip_special_tokens=False))

<s> What is a neural network??</s>щее
What is a neural network and how does it work? A neural network is a type of machine learning<|im_end|><|im_end|><|im_end|> that<|im_end|><|im_end|><|im_end|> that is inspired by the structure and function of the human brain. It consists of a network of interconnected nodes, or<|im_end|>, that are<|im_end|> that are organized into layers. The input layer receives the input data, the hidden layer processes the data, and the output layer produces the final result. The network learns by adjusting the weights between the


In [30]:
new_model_name = "mistral-7b-non-instruction-pjf-ft"
peft_model.push_to_hub(new_model_name)
# tokenizer.push_to_hub(new_model, use_temp_dir=False, token=hf_token)

adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ogbrandt/mistral-7b-non-instruction-pjf-ft/commit/28d26c0d588aba1c9dd02a09aae1d77ebf3d4f29', commit_message='Upload model', commit_description='', oid='28d26c0d588aba1c9dd02a09aae1d77ebf3d4f29', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
# del load_model
# torch.cuda.empty_cache()
!nvidia-smi

Tue Jan 23 04:12:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A10G                    Off | 00000000:00:1E.0 Off |                    0 |
|  0%   22C    P0              59W / 300W |  22422MiB / 23028MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
# Load model directly
from transformers import AutoModelForCausalLM
import torch
# remote_model = AutoModel.from_pretrained("ogbrandt/mistral7b-pjf-ft-v0")

# Reload model in FP16 and merge it with LoRA weights
base_model = "mistralai/Mistral-7B-v0.1"
# base_model = "meta-llama/Llama-2-7b-chat-hf"

load_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from peft import PeftModel

# adapter_model = "llama-2-7b-chat-pjf"
adapter_model = "ogbrandt/mistral-7b-non-instruction-pjf-ft"
model = PeftModel.from_pretrained(load_model, adapter_model)
# model = model.merge_and_unload()

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [29]:
prompt = "best exercises to increase vertical jump for dunking?"

device = "cuda" if torch.cuda.is_available() else "cpu"
model_input = tokenizer(prompt, return_tensors="pt").to(device)

_ = model.eval()
with torch.no_grad():
    out = model.generate(**model_input, max_new_tokens=100)

print(tokenizer.decode(out[0], skip_special_tokens=False))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> best exercises to increase vertical jump for dunking?
What are the best exercises to increase vertical jump for dunking

The best exercises to increase vertical jump for dunking

The best exercises to increase vertical jump for dunk are those that focus on building explosive power and agility. These include plyometric exercises such as box jumps, bounding, and lateral bounds. Additionally, exercises that target the hip flexors, such as the standing hip flexor stretch and the standing hip flexor march, can also


In [2]:
from transformers import pipeline

new_model = "ogbrandt/mistral-7b-non-instruction-pjf-ft"
pipe = pipeline(
    "text-generation", 
    model = new_model,
    device_map="cuda:0"
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 93.38 MiB is free. Process 10592 has 21.89 GiB memory in use. Of the allocated memory 21.60 GiB is allocated by PyTorch, and 6.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
prompt = "what is a neural net?"

sequences = pipe(
    f"<s>[INST] {prompt} [/INST]",
    do_sample=True,
    max_new_tokens=200, 
    temperature=0.7, 
    # top_k=50, 
    # top_p=0.95,
    num_return_sequences=1,
)

print(sequences[0]['generated_text'])

In [None]:
test_set = ["how can I jump higher","how to dunk?","are barefoot shoots actually beneficial?","what are the most important factors to becoming an elite athlete?",
"what are the core principles behind nutrition?",
"how do i lose weight while gaining muscle",
"how do i get more energy?",
"are plant based diets beneficial for better health?",
"what are some quick wins to becoming a better athlete?",
"does contrast therapy (ice bath and hot sauna) actually work?",
"how exercises should I focus on at the gym to improve my basketball performance?"]

for i, prompt in enumerate(test_set):
    print(f"test case: {i+1}/{len(test_set)}")
    result = pipe(
    f"<s>[INST] {prompt} [/INST]",
    do_sample=True,
    max_new_tokens=200, 
    temperature=0.7, 
    top_k=50, 
    top_p=0.95,
    num_return_sequences=1,)
    print(result[0]['generated_text'])
    

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

logging.set_verbosity(logging.CRITICAL)

model_name = "ogbrandt/mistral-7b-non-instruction-pjf-ft"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


inf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter weights from ogbrandt/mistral-7b-non-instruction-pjf-ft led to unexpected keys not found in the model:  ['model.layers.0.mlp.up_proj.lora_A.default.weight', 'model.layers.0.mlp.up_proj.lora_B.default.weight', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.1.mlp.up_proj.lora_A.default.weight', 'model.layers.1.mlp.up_proj.lora_B.default.weight', 'model.layers.1.mlp.down_proj.lora_A.default.weight', 'model.layers.1.mlp.down_proj.lora_B.default.weight', 'model.layers.2.mlp.up_proj.lora_A.default.weight', 'model.layers.2.mlp.up_proj.lora_B.default.weight', 'model.layers.2.mlp.down_proj.lora_A.default.weight', 'model.layers.2.mlp.down_proj.lora_B.default.weight', 'model.layers.3.mlp.up_proj.lora_A.default.weight', 'model.layers.3.mlp.up_proj.lora_B.default.weight', 'model.layers.3.mlp.down_proj.lora_A.default.weight', 'model.layers.3.mlp.down_proj.lora_B.default.weight', 'model.layers.4.mlp.up_proj.lor

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

messages = [
    {"role": "user", "content": "quick wins for dunking?"},
]
encodeds = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

device = "cuda"
model_inputs = encodeds.to(device)

generated_ids = inf_model.generate(model_inputs, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

quick wins for dunking? [/INST]

To improve your dunking<</SYS>>, focus on building a strong foundation by working on your vertical jump and agility. This will help you develop the necessary skills for dunking<</SYS>>ing. Additionally, work on your overall athleticism and agility, as these are essential for improving your dunk<</SYS>>ing ability.

Also, consider incorporating a plyometric program into your routine, as it can help improve your overall agility 

In [None]:
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    # system_prompt = 'The conversation between Human and AI named PaulGPT\n'
    # B_INST, E_INST = "[INST]", "[/INST]"
    B_INST, E_INST = "<|im_start|>", "<|im_end|>"

    # prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n{E_INST}"
    prompt = f"{B_INST} {user_prompt.strip()} {E_INST}"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

    _ = merged_model.generate(**inputs, streamer=streamer, max_new_tokens=256)