In [None]:
import json
import os, json, random, torch, numpy as np
from datasets import Dataset, concatenate_datasets
random.seed(42); np.random.seed(42); torch.manual_seed(42)

import transformers, platform
# check for Transformers version
# print("Transformers:", transformers.__version__, "Python:", platform.python_version())

COURSE_JSONL  = "course_qa_training_dataset_2500.jsonl"
GENERAL_JSONL = "course_qa_general.jsonl"

if not os.path.exists(GENERAL_JSONL):
    with open(GENERAL_JSONL, "w", encoding="utf-8") as f:
        for r in general_seed: f.write(json.dumps(r)+"\n")
print("General seed at:", GENERAL_JSONL)

def read_jsonl(path):
    rows=[]
    with open(path,"r",encoding="utf-8") as f:
        for line in f:
            if line.strip():
                rows.append(json.loads(line))
    return rows

course  = read_jsonl(COURSE_JSONL)
general = read_jsonl(GENERAL_JSONL)

SYSTEM = ("You are CourseBot for WRPHTC. You can answer general questions briefly, "
          "but prioritize recommending courses from this catalog when relevant.")

def to_prompt(q,a):
    return f"<s>[SYSTEM]\n{SYSTEM}\n[/SYSTEM]\n[USER]\n{q}\n[/USER]\n[ASSISTANT]\n{a}\n</s>"

course_ds  = Dataset.from_list([{"text": to_prompt(r["question"], r["answer"])} for r in course])
general_ds = Dataset.from_list([{"text": to_prompt(r["question"], r["answer"])} for r in general])

# Oversample general to ~15% of the mix
GEN_MULT = max(1, int(0.15 * len(course_ds) / max(1, len(general_ds))))
blended   = concatenate_datasets([course_ds, *( [general_ds]*GEN_MULT )]).shuffle(seed=42)
split     = blended.train_test_split(test_size=0.1, seed=42)

len(course_ds), len(general_ds), len(blended), {k:len(v) for k,v in split.items()}

# check the first 10 rows of data; comment out this section if need
print("=== Preview of Blended Training Data ===")
for i in range(10):   # print first 10 rows
    row = blended[i]
    print(f"Sample {i+1}:")
    print(row["text"])
    print("-" * 80)


General seed at: course_qa_general.jsonl
=== Preview of Blended Training Data ===
Sample 1:
<s>[SYSTEM]
You are CourseBot for WRPHTC. You can answer general questions briefly, but prioritize recommending courses from this catalog when relevant.
[/SYSTEM]
[USER]
What skills or topics does Addressing Flu Vaccine Hesitancy During the COVID-19 Pandemic cover?
[/USER]
[ASSISTANT]
This course covers skills and topics such as Seasonal flu vaccination is more important than ever amidst the COVID-19 pandemic. Unfortunately, vaccine hesitancy is a common issue that health care professionals encounter. This training will describe seasonal flu, the flu vaccine, and the synergistic epidemics of COVID-19 and influenza, and introduce strategies to address vaccine hesitance. Course Objectives: Describe seasonal flu and seasonal flu vaccination Discuss the importance of seasonal flu vaccination during the COVID-19 pandemic Identify reasons for vaccine hesitancy Introduce strategies to address seasonal 

In [None]:
# Install libraries
!pip -q install "unsloth>=2024.9.0" "transformers>=4.41" "datasets>=2.19" \
                "accelerate>=0.33" "bitsandbytes>=0.43.0" "trl>=0.9.6" peft==0.11.1


In [None]:
# For GPU check
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

CUDA available: True
GPU: Tesla T4


In [None]:
from unsloth import FastLanguageModel

model_name = "unsloth/Llama-3.2-3B-Instruct"

max_seq_length = 1024  # Choose sequence length
dtype = None  # Auto detection

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [None]:

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
tokenizer.pad_token = tokenizer.eos_token


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.11 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
#@title Tokenize the dataset
from transformers import DataCollatorForLanguageModeling
MAX_LEN = 1024
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

tok       = split.map(tokenize, batched=True, remove_columns=split["train"].column_names)
collator  = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# tokenized sample
ex = tok["train"][0]
print("input_ids (first 40):", ex["input_ids"][:40])
print("decoded:\n", tokenizer.decode(ex["input_ids"]))


Map:   0%|          | 0/2522 [00:00<?, ? examples/s]

Map:   0%|          | 0/281 [00:00<?, ? examples/s]

input_ids (first 40): [128000, 45147, 31868, 47587, 933, 2675, 527, 17026, 24406, 369, 468, 22394, 2607, 34, 13, 1472, 649, 4320, 4689, 4860, 27851, 11, 719, 63652, 65774, 14307, 505, 420, 16808, 994, 9959, 627, 25130, 47587, 933, 58, 6584, 933, 3923, 374]
decoded:
 <|begin_of_text|><s>[SYSTEM]
You are CourseBot for WRPHTC. You can answer general questions briefly, but prioritize recommending courses from this catalog when relevant.
[/SYSTEM]
[USER]
What is the course Strengthening Native Food Sovereignty to Preserve Native American Culture and Improve Community Health about?
[/USER]
[ASSISTANT]
The course 'Strengthening Native Food Sovereignty to Preserve Native American Culture and Improve Community Health' is about ***This course has expired and is no longer available An updated version of this course can be found HERE.*** Environmental factors and federal policies have had a large impact on the health and cultural identity of our Indigenous populations With increasing rates of diab

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

BATCH_SIZE = 2                   # T4 safe
GRAD_ACC   = 8                   # effective batch ~16
EPOCHS     = 2                   # light patch; increase if needed
LR         = 1e-4
LOG_STEPS  = 25

# Compat shim in case your environment dislikes 'evaluation_strategy'
def make_training_args(**kw):
    try:
        return TrainingArguments(**kw, evaluation_strategy="steps", eval_steps=200, save_steps=200)
    except TypeError:
        kw2 = kw.copy()
        kw2["do_eval"] = True
        kw2["eval_steps"] = 200
        kw2["save_steps"] = 200
        return TrainingArguments(**kw2)

args = make_training_args(
    output_dir="/content/outputs_coursebot",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    fp16 = not is_bfloat16_supported(),   # T4 => fp16 True, bf16 False
    bf16 = is_bfloat16_supported(),
    logging_steps=LOG_STEPS,
    save_total_limit=2,
    optim="adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    report_to="none",
    seed=42,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tok["train"],
    eval_dataset=tok["test"],
    args=args,
    data_collator=collator,
    max_seq_length=MAX_LEN,
    packing=False,
)


In [None]:
# Train the model
trainer.train()
print("Eval:", trainer.evaluate())
print(trainer.state)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128009}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,522 | Num Epochs = 2 | Total steps = 316
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,2.5994
50,1.5614
75,1.2442
100,1.1374
125,1.0227
150,1.0128
175,0.8496
200,0.8828
225,0.8525
250,0.7724


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Eval: {'eval_loss': 0.8265876173973083, 'eval_runtime': 35.0396, 'eval_samples_per_second': 8.019, 'eval_steps_per_second': 4.024, 'epoch': 2.0}
TrainerState(epoch=2.0, global_step=316, max_steps=316, logging_steps=25, eval_steps=200, save_steps=200, train_batch_size=2, num_train_epochs=2, num_input_tokens_seen=0, total_flos=1.4533251465486336e+16, log_history=[{'loss': 2.5994, 'grad_norm': 0.45873597264289856, 'learning_rate': 9.948440919541278e-05, 'epoch': 0.1586042823156225, 'step': 25}, {'loss': 1.5614, 'grad_norm': 0.5248482823371887, 'learning_rate': 9.60452758972477e-05, 'epoch': 0.317208564631245, 'step': 50}, {'loss': 1.2442, 'grad_norm': 0.5803699493408203, 'learning_rate': 8.958941223943291e-05, 'epoch': 0.47581284694686754, 'step': 75}, {'loss': 1.1374, 'grad_norm': 0.4990091025829315, 'learning_rate': 8.05397845201344e-05, 'epoch': 0.63441712926249, 'step': 100}, {'loss': 1.0227, 'grad_norm': 0.814436674118042, 'learning_rate': 6.948929366463396e-05, 'epoch': 0.7930214115

In [None]:
# Checks for trained bot answers
SYSTEM_INFER = ("You are CourseBot for WRPHTC. You can answer general questions briefly, "
                "but prioritize recommending courses from this catalog when relevant. Be concise and specific.")

FEWSHOT = """
[USER]
hello
[/USER]
[ASSISTANT]
Hi! Iâ€™m CourseBot. What public health topics interest you (e.g., emergency preparedness, nutrition, leadership)?
"""

MAX_NEW_TOKENS = 160

def ask_coursebot(q, max_new=MAX_NEW_TOKENS):
    prompt = f"<s>[SYSTEM]\n{SYSTEM_INFER}\n[/SYSTEM]\n{FEWSHOT}\n[USER]\n{q}\n[/USER]\n[ASSISTANT]\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    model.eval()
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new,
            temperature=0.5,
            top_p=0.9,
            repetition_penalty=1.15,
            do_sample=True,
        )
    txt = tokenizer.decode(out[0], skip_special_tokens=True)
    return txt.split("[ASSISTANT]")[-1].strip()

print("Greeting:", ask_coursebot("hi"))
print("Course rec:", ask_coursebot("Hello"))
print("Format:", ask_coursebot("are there any free courses"))


Greeting: Hello again! Since we just started our conversation, what brings you here today? Are you looking to explore a new topic or find resources for an upcoming course?
Course rec: Hello again! Public health is a broad field with many areas of focus. Here are some popular options:

1. **Environmental Health**: Learn about the impact of human activities on the environment and how to mitigate these effects.

2. **Global Health**: Explore issues like infectious diseases, healthcare access, and global health policy.

3. **Health Promotion**: Discover strategies to encourage healthy behaviors and prevent disease in populations.

4. **Leadership in Public Health**: Develop skills to lead and manage public health programs effectively.

5. **Epidemiology**: Study the study of the distribution and determinants of health-related events, diseases, or health-related characteristics among populations.

6. **Nutrition and Food Systems**: Investigate the role of food systems in promoting health an

In [None]:
# Save LoRA adapters (small, good for future patch-training) use this module only to save work
ADAPTER_DIR = "/content/coursebot-lora-single"
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)
print("Saved LoRA to:", ADAPTER_DIR)


Saved LoRA to: /content/coursebot-lora-single


In [None]:
# ==== NO-MERGE TESTER (fresh session: base + adapters) ====
# Make sure you have your adapters folder available, e.g. /content/coursebot-lora-single

!pip -q install "unsloth>=2024.9.0" "transformers>=4.41" "accelerate>=0.33" "bitsandbytes>=0.43.0" peft==0.11.1

import torch, time
from unsloth import FastLanguageModel
from peft import PeftModel

BASE_MODEL  = "meta-llama/Llama-3.2-3B-Instruct"   # requires HF token/access
ADAPTER_DIR = "/content/coursebot-lora-single"     # <-- change if needed
MAX_SEQ_LEN = 1024                                 # T4-safe; raise only if you must

# If the model is gated, make sure you've logged in to HF and pass your token when needed:
# from huggingface_hub import login; login(token="HF_TOKEN")

base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = BASE_MODEL,
    max_seq_length = MAX_SEQ_LEN,
    dtype          = None,
    load_in_4bit   = True,
    # token="HF_TOKEN",  # uncomment & set if you need gated access
)

# Turn base into a LoRA target and attach your trained adapters (no merge)
base_model = FastLanguageModel.get_peft_model(
    base_model,
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
tokenizer.pad_token = tokenizer.eos_token

# Load your saved adapters
peft_model = PeftModel.from_pretrained(base_model, ADAPTER_DIR, is_trainable=False)

# --- Same inference helpers as above ---
TEMP = 0.5
TOP_P = 0.9
REPEAT_PENALTY = 1.15
MAX_NEW_TOKENS = 160

SYSTEM = ("You are CourseBot for WRPHTC. You can answer general questions briefly, "
          "but when learning is requested, recommend only courses in this catalog. "
          "Ask for a short clarification if needed. Keep answers under 4 sentences.")

FEWSHOT = """
[USER]
hello
[/USER]
[ASSISTANT]
Hi! Iâ€™m CourseBot. What public health topics interest you (e.g., emergency preparedness, nutrition, leadership)?
"""

def build_prompt(user_text: str) -> str:
    return (f"<s>[SYSTEM]\n{SYSTEM}\n[/SYSTEM]\n"
            f"{FEWSHOT}\n[USER]\n{user_text}\n[/USER]\n[ASSISTANT]\n")

def generate_answer(model, tokenizer, user_text: str):
    prompt = build_prompt(user_text)
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        start = time.time()
        out = model.generate(
            **inputs,
            do_sample=True,
            temperature=TEMP,
            top_p=TOP_P,
            repetition_penalty=REPEAT_PENALTY,
            max_new_tokens=MAX_NEW_TOKENS,
        )
        elapsed = time.time() - start
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    answer = text.split("[ASSISTANT]")[-1].strip()
    return answer, elapsed

def quick_tests(model, tokenizer, questions):
    for q in questions:
        ans, t = generate_answer(model, tokenizer, q)
        print(f"\nQ: {q}\nA: {ans}\n(time: {t:.2f}s)")

test_questions = [
    "hello",
    "Iâ€™m interested in disaster recoveryâ€”what courses should I take?",
    "Are these courses self-paced?",
    "Recommend courses for global health organizations.",
    "Compare two courses on emergency preparedness and leadership."
]
quick_tests(peft_model, tokenizer, test_questions)


==((====))==  Unsloth 2025.9.11: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Q: hello
A: Hello again! If you're looking to learn more about public health, I have some course recommendations: 

* Public Health Fundamentals 
* Emergency Preparedness and Response
* Nutrition Science for Public Health

Would you like me to elaborate on any of these?
(time: 2.96s)

Q: Iâ€™m interested in disaster recoveryâ€”what courses should I take?
A: For disaster recovery and management, consider taking:

* HSC 301: Emergency Management and Response
* HSC 401: Disaster Planning and Coordination
* HSC 402: Public Health Preparedne

In [3]:
!pip -q install huggingface_hub>=0.24 transformers>=4.41 peft==0.11.1
from huggingface_hub import login
from getpass import getpass

hf_token = getpass("Enter your Hugging Face token (read scope):")
login(token=hf_token)


Enter your Hugging Face token (read scope):Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


In [6]:
import os, torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL  = "meta-llama/Llama-3.2-3B-Instruct"     # <- change if different
ADAPTER_DIR = "/content/coursebot-lora-single"       # <- your saved LoRA adapters
MERGED_DIR  = "/content/coursebot-merged-single"     # <- output HF dir

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, token=hf_token)

# Load base in fp16 on CPU to avoid T4 OOM during merge
base_fp16 = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="cpu",
    token=hf_token,
)

# Attach adapters and merge
peft_model = PeftModel.from_pretrained(base_fp16, ADAPTER_DIR, is_trainable=False)
merged = peft_model.merge_and_unload()   # now a standard fp16 model

# Save Hugging Face format
os.makedirs(MERGED_DIR, exist_ok=True)
merged.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)
print("âœ… Merged model saved to:", MERGED_DIR)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

âœ… Merged model saved to: /content/coursebot-merged-single


In [7]:
# Export to save work
from google.colab import files
!zip -r coursebot_artifacts.zip /content/coursebot-lora-single /content/coursebot-merged-single
files.download("coursebot_artifacts.zip")

  adding: content/coursebot-lora-single/ (stored 0%)
  adding: content/coursebot-lora-single/tokenizer.json (deflated 85%)
  adding: content/coursebot-lora-single/tokenizer_config.json (deflated 96%)
  adding: content/coursebot-lora-single/adapter_model.safetensors (deflated 8%)
  adding: content/coursebot-lora-single/chat_template.jinja (deflated 71%)
  adding: content/coursebot-lora-single/special_tokens_map.json (deflated 63%)
  adding: content/coursebot-lora-single/README.md (deflated 66%)
  adding: content/coursebot-lora-single/adapter_config.json (deflated 54%)
  adding: content/coursebot-lora-single/.ipynb_checkpoints/ (stored 0%)
  adding: content/coursebot-merged-single/ (stored 0%)
  adding: content/coursebot-merged-single/tokenizer.json (deflated 85%)
  adding: content/coursebot-merged-single/config.json (deflated 52%)
  adding: content/coursebot-merged-single/tokenizer_config.json (deflated 96%)
  adding: content/coursebot-merged-single/model-00002-of-00002.safetensors (def

FileNotFoundError: Cannot find file: coursebot_artifacts.zip

In [35]:

import os
print("Files in llama.cpp:", os.listdir("/content/llama.cpp")[:20])
print("Has convert_hf_to_gguf.py:", os.path.isfile("/content/llama.cpp/convert-hf-to-gguf.py"))

Files in llama.cpp: ['SECURITY.md', 'models', '.gitignore', 'scripts', 'CMakePresets.json', 'tests', 'media', 'LICENSE', 'flake.nix', 'poetry.lock', 'mypy.ini', '.pre-commit-config.yaml', 'pyproject.toml', '.dockerignore', '.github', 'pocs', '.clang-format', 'pyrightconfig.json', 'Makefile', '.flake8']
Has convert_hf_to_gguf.py: False


In [39]:

# Get llama.cpp tools
# !git -q clone https://github.com/ggerganov/llama.cpp
!pip -q install --upgrade pip
!pip -q install "mistral_common>=1.3.0" "safetensors>=0.4.5" "sentencepiece>=0.1.99" \
                "tokenizers>=0.15" "transformers>=4.41" "huggingface_hub>=0.24" \
                "numpy>=1.24" "tqdm>=4.66" "protobuf>=4.25"

HF_DIR   = "/content/coursebot-merged-single"   # merged HF folder
GGUF_OUT = "/content/coursebot-q4_k_m.gguf"     # output file

!python /content/llama.cpp/convert_hf_to_gguf.py {HF_DIR} --outfile {GGUF_OUT} --outtype f16

import os
print("âœ… GGUF saved at:", GGUF_OUT, "Size:", os.path.getsize(GGUF_OUT)/1e6, "MB")


INFO:hf-to-gguf:Loading model: coursebot-merged-single
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.at

In [40]:
from google.colab import files
files.download(GGUF_OUT)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

unknown option: -q
usage: git [--version] [--help] [-C <path>] [-c <name>=<value>]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p | --paginate | -P | --no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
           [--super-prefix=<path>] [--config-env=<name>=<envvar>]
           <command> [<args>]
[Errno 2] No such file or directory: 'llama.cpp'
/content/llama.cpp
python3: can't open file '/content/llama.cpp/convert-hf-to-gguf.py': [Errno 2] No such file or directory


FileNotFoundError: [Errno 2] No such file or directory: '/content/coursebot-q4_k_m.gguf'