In [None]:
#### When running inference on Qwen3-0.6B with 4-bit quantization, each example
#### takes about 65 seconds to run. To speed up inference, this code merges the adapter into the base model.
#### Merging reduced the time from 65 seconds per prompt to 40 seconds.
#### After further investigation, I can see that the time taken is primarily from prefilling (long prompt).

# !pip install bitsandbytes     ## Needed when training on Colab
import os
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- Configuration ---
# Identifier of the base checkpoint, location of the fine-tuned adapter,
# and the directory that will receive the merged model.
BASE_MODEL_PATH = "Qwen/Qwen3-0.6B"
ADAPTER_PATH = "qwen_qlora_podcast"
MERGED_MODEL_OUTPUT_PATH = "qwen_qlora_podcast_merged"

# Hugging Face token: the environment variable wins; otherwise fall back to
# the first line of the local secrets file, and to None if that is missing.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    try:
        with open("../../../secrets/hf_token.txt", "r") as token_file:
            HF_TOKEN = token_file.readline().strip()
    except FileNotFoundError:
        HF_TOKEN = None
        print("‚ö†Ô∏è Hugging Face token file not found. This may cause issues.")

def merge_qlora_model():
    """Merge the QLoRA adapter into the base model and save a standalone checkpoint.

    Reads the module-level ``BASE_MODEL_PATH``, ``ADAPTER_PATH``,
    ``MERGED_MODEL_OUTPUT_PATH`` and ``HF_TOKEN``. The base model is loaded in
    bfloat16 (not 4-bit): merging LoRA deltas into bitsandbytes 4-bit layers
    re-quantizes the result (PEFT warns about rounding errors) and leaves the
    saved checkpoint quantized, whereas the inference cell loads the merged
    model as plain bfloat16.

    Side effects:
        Writes the merged model weights and the tokenizer files into
        ``MERGED_MODEL_OUTPUT_PATH`` (created if missing) and prints progress.
    """
    print(f"Base model: {BASE_MODEL_PATH}")
    print(f"Adapter: {ADAPTER_PATH}")
    print(f"Output directory: {MERGED_MODEL_OUTPUT_PATH}")

    # 1. Load the base model un-quantized so the merge happens in bfloat16.
    print("\n1. Loading base model in bfloat16...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        # `dtype` replaces the deprecated `torch_dtype` keyword (this
        # notebook's own output shows the deprecation warning).
        dtype=torch.bfloat16,
        trust_remote_code=True,
        token=HF_TOKEN,
        device_map="cpu",  # Load on CPU to avoid VRAM issues during merge
    )

    # 2. Load the PEFT model (applying the adapter)
    print("\n2. Loading PEFT model (adapter)...")
    peft_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

    # 3. Fold the adapter weights into the base weights and drop the PEFT wrappers.
    print("\n3. Merging adapter into the base model...")
    merged_model = peft_model.merge_and_unload()
    print("‚úÖ Merge complete.")

    # 4. Save the merged model and tokenizer
    print(f"\n4. Saving merged model to {MERGED_MODEL_OUTPUT_PATH}...")
    os.makedirs(MERGED_MODEL_OUTPUT_PATH, exist_ok=True)
    merged_model.save_pretrained(MERGED_MODEL_OUTPUT_PATH)

    # The tokenizer is taken from the adapter directory so the merged
    # checkpoint can be loaded on its own.
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH, trust_remote_code=True, token=HF_TOKEN)
    tokenizer.save_pretrained(MERGED_MODEL_OUTPUT_PATH)

    print(f"\nüéâ Merged model saved successfully to {MERGED_MODEL_OUTPUT_PATH}")
    print("You can now load this model directly for fast inference.")

if __name__ == "__main__":
    merge_qlora_model()


‚ö†Ô∏è Hugging Face token file not found. This may cause issues.
Base model: Qwen/Qwen3-0.6B
Adapter: qwen_qlora_podcast
Output directory: qwen_qlora_podcast_merged

1. Loading base model in 4-bit...


model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]


2. Loading PEFT model (adapter)...

3. Merging adapter into the base model...




‚úÖ Merge complete.

4. Saving merged model to qwen_qlora_podcast_merged...

üéâ Merged model saved successfully to qwen_qlora_podcast_merged
You can now load this model directly for fast inference.


In [None]:
### Observation
### Due to the large prompt, each example takes about 40 seconds to run

# !pip install bitsandbytes     ## Needed when training on Colab
import os
import pandas as pd
import torch
from tqdm.auto import tqdm

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

def _read_secret(env_var, fallback_path):
    """Return a secret from the environment, else from a file, else None.

    Args:
        env_var: Name of the environment variable to check first.
        fallback_path: File whose first line (whitespace-stripped) is used
            when the environment variable is unset or empty.

    Returns:
        The secret string, or None when neither source provides a value.
    """
    value = os.environ.get(env_var)
    if value:
        return value
    # os.environ.get never raises, so the fallback has to be an explicit
    # branch rather than an `except` handler.
    try:
        with open(fallback_path, "r") as secret_file:
            value = secret_file.readline().strip()
    except FileNotFoundError:
        print(f"Warning: {env_var} is not set and {fallback_path} was not found.")
        return None
    return value or None


GEMINI_SERVICE_ACCOUNT_PATH = os.environ.get("GEMINI_SERVICE_ACCOUNT_PATH", "../../../secrets/gemini-service-account.json")
GOOGLE_CLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT", "newsjuice-123456")
GOOGLE_CLOUD_REGION = "us-central1"
QWEN_MODEL_PATH = "Qwen/Qwen3-0.6B"
# NOTE(review): these two are stored as strings; convert with int()/float()
# before passing them to generation code.
QWEN_MAX_NEW_TOKENS = "512"
QWEN_TEMPERATURE = "0.7"
PODCAST_LOG_CSV = "podcast_results.csv"
WANDB_PROJECT = "newsjuice-finetune"

# Secrets: environment variable first, then the local secrets file, then None.
WANDB_API_KEY = _read_secret("WANDB_API_KEY", "../../../secrets/wandb_api_key.txt")
HF_TOKEN = _read_secret("HF_TOKEN", "../../../secrets/hf_token.txt")
def _infer_compute_dtype():
    """Pick a torch dtype based on the available hardware.

    Returns:
        ``torch.float32`` when CUDA is unavailable; otherwise
        ``torch.bfloat16`` if the device's compute-capability major version
        is at least 8, and ``torch.float16`` in every other case (including
        when the capability query fails).
    """
    if not torch.cuda.is_available():
        return torch.float32

    try:
        major, _minor = torch.cuda.get_device_capability()
        supports_bf16 = bool(major >= 8)
    except Exception:
        # Best effort: an unreadable capability is treated as "no bfloat16".
        supports_bf16 = False

    return torch.bfloat16 if supports_bf16 else torch.float16


def _build_podcast_prompt(chunk_text) -> str:
    """Render the instruction / input / response prompt for one chunk of articles.

    The template text, including its embedded indentation and spelling, is
    intentionally left untouched: it is meant to match the format used during
    fine-tuning, so editing it would change what the model sees.
    """
    return (
        f"""### Instruction:\n
        You are a news podcast host. Based on the following relevant news articles, create an engaging podcast-style script to the user's question.
        The script must be no longer than 300 words under any circumstance. Make sure you dont go over the spesified word limit You should only include the text of the script. Do not include any of your thoughts or any sound effects.

        Please create a podcast-style response that:
        1. Starts with a warm, engaging introduction
        2. Directly addresses the user's question using information from the articles
        3. Weaves together insights from the relevant news articles
        4. Maintains a conversational, podcast-like tone
        5. Ends with a thoughtful conclusion that stays within the 300-word limit

        If the articles don't contain enough information to fully answer the question, acknowledge this and provide what insights you can while being transparent about limitations.

        Format your response as if you're speaking directly to the listener in a podcast episode.
            ### Input:\n{str(chunk_text).strip()}\n\n
            ### Response:\n"""
    )


def _truncate_to_token_budget(tokenizer, text: str, budget: int) -> str:
    """Return ``text`` shortened to about ``budget`` tokens (unchanged if it already fits)."""
    if budget <= 0:
        return ""
    token_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(token_ids) <= budget:
        return text
    return tokenizer.decode(token_ids[:budget], skip_special_tokens=True)


def run_predictions_on_finetuned_model(
    merged_model_path: str = "qwen_qlora_podcast_merged",
    csv_path: str = PODCAST_LOG_CSV,
    output_csv_path: str = None,
    max_new_tokens: int = 512,
    temperature: float = 0.7,
    max_input_tokens: int = 1024,
):
    """Generate a podcast script for every row of a CSV with the merged model.

    Args:
        merged_model_path: Directory holding the merged model and tokenizer.
        csv_path: Input CSV; must contain a ``chunk_text`` column.
        output_csv_path: Where to write the results. When falsy, ``csv_path``
            is overwritten with the extra column added.
        max_new_tokens: Generation length limit per example.
        temperature: Sampling temperature; ``0`` (or ``None``) switches to
            greedy decoding.
        max_input_tokens: Approximate token limit for the full prompt. Only
            the article text is shortened to fit, so the instruction and the
            trailing ``### Response:`` marker are never cut off.

    Returns:
        The path of the CSV that was written.

    Raises:
        FileNotFoundError: If ``csv_path`` or ``merged_model_path`` is missing.
        ValueError: If the CSV has no ``chunk_text`` column.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    if not os.path.exists(merged_model_path):
        raise FileNotFoundError(f"Merged model path not found: {merged_model_path}")

    print("Evaluating Merged Fine-tuned Qwen Model")

    # Load the CSV
    df = pd.read_csv(csv_path)
    if "chunk_text" not in df.columns:
        raise ValueError("CSV must contain 'chunk_text' column")

    print(f"Found {len(df)} examples to evaluate")

    # Load the merged model directly in bfloat16 for fast inference
    print(f"\nüöÄ Loading merged model from {merged_model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        merged_model_path,
        token=HF_TOKEN,
        trust_remote_code=True,
        # `dtype` replaces the deprecated `torch_dtype` keyword (see the
        # deprecation warning in this cell's output).
        dtype=torch.bfloat16,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    model.eval()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(merged_model_path, token=HF_TOKEN, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Merged model and tokenizer loaded successfully")
    print("Starting inference...")

    # Only forward `temperature` when sampling; with greedy decoding it is
    # unused and only triggers generation-config warnings.
    do_sample = temperature is not None and temperature > 0
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        generation_kwargs["temperature"] = temperature

    # Tokens consumed by the fixed template; the rest of the budget goes to
    # the article text. Truncating the whole prompt from the right would
    # remove the "### Response:" marker for long inputs.
    template_tokens = len(tokenizer(_build_podcast_prompt(""))["input_ids"])
    chunk_budget = max(max_input_tokens - template_tokens, 0)

    # Generate predictions for each example (the tqdm bar is the progress log).
    finetuned_outputs = []
    for chunk_text in tqdm(df["chunk_text"], total=len(df), desc="Inferencing", unit="example"):
        if pd.isna(chunk_text) or str(chunk_text).strip() == "":
            finetuned_outputs.append("")
            continue

        chunk = _truncate_to_token_budget(tokenizer, str(chunk_text).strip(), chunk_budget)
        prompt = _build_podcast_prompt(chunk)

        # Put the inputs on whatever device the model actually lives on.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # The output sequence starts with the prompt tokens; decode only what
        # was generated after them instead of searching the decoded string.
        new_token_ids = outputs[0][prompt_length:]
        response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        finetuned_outputs.append(response)

    # Add new column to dataframe
    df["finetuned Qwen 0.6"] = finetuned_outputs

    # Save to CSV
    output_path = output_csv_path if output_csv_path else csv_path
    df.to_csv(output_path, index=False)

    print("Evaluation completed!")

    return output_path


# Run the evaluation with every argument left at its default.
run_predictions_on_finetuned_model()

Evaluating Merged Fine-tuned Qwen Model


`torch_dtype` is deprecated! Use `dtype` instead!


Found 2000 examples to evaluate

üöÄ Loading merged model from qwen_qlora_podcast_merged...
Model and adapter loaded successfully
Starting inference...


Inferencing:   0%|          | 0/2000 [00:00<?, ?example/s]

‚úÖ Processed 1/2000 examples
‚úÖ Processed 2/2000 examples
‚úÖ Processed 3/2000 examples
‚úÖ Processed 4/2000 examples
‚úÖ Processed 5/2000 examples
‚úÖ Processed 6/2000 examples
‚úÖ Processed 7/2000 examples
‚úÖ Processed 8/2000 examples
‚úÖ Processed 9/2000 examples
‚úÖ Processed 10/2000 examples
‚úÖ Processed 11/2000 examples
‚úÖ Processed 12/2000 examples
‚úÖ Processed 13/2000 examples
‚úÖ Processed 14/2000 examples
‚úÖ Processed 15/2000 examples
‚úÖ Processed 16/2000 examples
‚úÖ Processed 17/2000 examples
‚úÖ Processed 18/2000 examples
‚úÖ Processed 19/2000 examples
‚úÖ Processed 20/2000 examples
‚úÖ Processed 21/2000 examples
‚úÖ Processed 22/2000 examples
‚úÖ Processed 23/2000 examples
‚úÖ Processed 24/2000 examples
‚úÖ Processed 25/2000 examples
‚úÖ Processed 26/2000 examples
‚úÖ Processed 27/2000 examples
‚úÖ Processed 28/2000 examples
‚úÖ Processed 29/2000 examples
‚úÖ Processed 30/2000 examples
‚úÖ Processed 31/2000 examples
‚úÖ Processed 32/2000 examples
‚úÖ Processed 33/