In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ---------------------------
# 0. Setup & Imports
# ---------------------------

!pip install -U bitsandbytes --quiet

from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
import torch

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# ---------------------------
# 1. Ask user for project path
# ---------------------------

project_path = input("‚û°Ô∏è Enter the full path to the project folder (ex: /content/drive/MyDrive/MyProject) : ").strip()
relative_path = Path(project_path)
model_path = str(relative_path / "my-qwen-model")

‚û°Ô∏è Enter the full path to the project folder (ex: /content/drive/MyDrive/MyProject) : /content/drive/MyDrive/Deep Learning Project


In [None]:
# ---------------------------
# 2. 4-bit quantization config
# ---------------------------

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)


In [None]:
# ---------------------------
# 3. Load tokenizer & model
# ---------------------------

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # ensure padding consistency

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# ---------------------------
# 4. Fix generation_config for Qwen
# ---------------------------

gen_config = GenerationConfig(
    do_sample=False,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    eos_token_id=tokenizer.eos_token_id,
)

model.generation_config = gen_config

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=102

In [None]:
# ---------------------------
# 5. Inference function
# ---------------------------

def ask_question(question):
    prompt = (
        "<|im_start|>user\n"
        f"{question}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=gen_config.do_sample,
            repetition_penalty=gen_config.repetition_penalty,
            no_repeat_ngram_size=gen_config.no_repeat_ngram_size,
            eos_token_id=gen_config.eos_token_id,
        )


    # Decode full text (keep special tokens)
    text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Extract only the assistant answer
    if "<|im_start|>assistant" in text:
        answer = text.split("<|im_start|>assistant")[1]
        answer = answer.split("<|im_end|>")[0].strip()
    else:
        answer = text.strip()

    answer = answer.replace("<think>", "").replace("</think>", "").strip()

    return answer


In [None]:
# ---------------------------
# 6. Ask user for their question
# ---------------------------
user_question = input("\n‚ùì Enter your question: ").strip()


‚ùì Enter your question: What are the symptoms of diabete ?


In [None]:
# ---------------------------
# 7. Compute & print answer
# ---------------------------
answer = ask_question(user_question)

print("\n==============================")
print("üß† MODEL RESPONSE")
print("==============================")
print(f"Q: {user_question}")
print(f"A: {answer}")
print("==============================\n")

Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.



üß† MODEL RESPONSE
Q: What are the symptoms of diabete ?
A: The signs and symptoms of diabetes can vary from person to person. Some people may not have any problems at all, while others will experience a range of health issues.
                
Key Signs & Symptoms
                
- Fatigue  - Dry mouth  - Nausea or stomach discomfort  - Weight gain  - Blurred vision  - High blood sugar levels in urine    - Increased thirst   - Frequent urination     - Slow healing of wounds    - Skin changes such as dryness, peeling skin, or rashes    - Hair loss    - Tingling sensations on certain areas (like feet)    - Hearing loss    Other Health Risks

