In [1]:
import torch
!pip install gradio
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token

# Set paths to your saved models
# Replace these paths with the actual paths to your saved SFT and DPO models
sft_model_path = "/kaggle/input/model-5/other/default/1/tinyllama-qa-exp-lowtemp"  # Example: "./tinyllama-qa-exp-lowtemp"
dpo_model_path = "/kaggle/input/dpo-model/other/default/1/dpo_trial5"  # Example: "./dpo_trial3"

# Load models
# Note: If your SFT and DPO models are saved asa adapters, use the following instead:
# from peft import PeftModel
# base_model_for_adapters = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
# sft_model = PeftModel.from_pretrained(base_model_for_adapters, sft_model_path)
# dpo_model = PeftModel.from_pretrained(base_model_for_adapters, dpo_model_path)
base_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
sft_model = AutoModelForCausalLM.from_pretrained(sft_model_path, torch_dtype=torch.float16, device_map="auto")
dpo_model = AutoModelForCausalLM.from_pretrained(dpo_model_path, torch_dtype=torch.float16, device_map="auto")

# Define generate answer function
def generate_answer(question, model_choice):
    """Generate an answer to ``question`` with the selected model.

    The question is wrapped in the prompt ``"Question: {question} Answer:"`` and
    the chosen model samples up to 50 new tokens. Only the newly generated
    tokens are decoded, so the prompt (and any "Answer:" text inside the
    question or the generation itself) cannot corrupt the returned answer.

    Args:
        question: Question text entered by the user.
        model_choice: Which model to query: "Base", "SFT" or "DPO".

    Returns:
        The generated continuation with surrounding whitespace stripped.
        Returns "Invalid model choice." when ``model_choice`` is not one of the
        three supported values, and "Please enter a question." when
        ``question`` is None or blank.
    """
    # Validate inputs before touching any model so bad input returns immediately.
    if model_choice not in ("Base", "SFT", "DPO"):
        return "Invalid model choice."
    if question is None or not str(question).strip():
        return "Please enter a question."

    if model_choice == "Base":
        model = base_model
    elif model_choice == "SFT":
        model = sft_model
    else:
        model = dpo_model

    prompt = f"Question: {question} Answer:"

    # Put the inputs on whichever device the chosen model's weights live on.
    model_device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt").to(model_device)
    prompt_length = encoded["input_ids"].shape[-1]

    # Autocast is only meaningful on CUDA; disable it explicitly elsewhere
    # instead of requesting CUDA autocast for a model that sits on the CPU.
    use_cuda_autocast = model_device.type == "cuda"
    with torch.no_grad(), torch.autocast(
        device_type="cuda", dtype=torch.float16, enabled=use_cuda_autocast
    ):
        output = model.generate(
            input_ids=encoded["input_ids"],
            # Passed explicitly: pad and EOS share an id, so the mask cannot be
            # inferred reliably from the input ids alone.
            attention_mask=encoded["attention_mask"],
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens; keep only what
    # follows them rather than searching the decoded text for "Answer:".
    new_token_ids = output[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

# Sample questions surfaced as one-click examples in the UI
example_questions = [
    "What is the capital city of Japan?",
    "Who wrote the novel 'Pride and Prejudice'?",
    "What is the chemical symbol for gold?",
    "In which year did the Titanic sink?",
    "What is the largest mammal on Earth?",
    "Who painted the Mona Lisa?",
    "What is the main source of energy for Earth's climate system?",
    "What is the longest river in the world?",
    "Who discovered penicillin?",
    "What is the primary language spoken in Brazil?",
]

# Each example row supplies both inputs: the question text and the "DPO" model choice
example_rows = [[sample_question, "DPO"] for sample_question in example_questions]

# Input widgets, in the same order as generate_answer's parameters
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")
model_selector = gr.Radio(["Base", "SFT", "DPO"], label="Select Model", value="DPO")

# Assemble the Gradio interface around generate_answer
iface = gr.Interface(
    fn=generate_answer,
    inputs=[question_box, model_selector],
    outputs="text",
    title="TinyLlama Question Answering",
    description="Ask a question and select a model (Base, SFT, or DPO) to get an answer. The DPO model is the most fine-tuned, followed by SFT, with Base being the original model.",
    examples=example_rows,
)

# Start serving the interface
iface.launch()

Collecting gradio
  Downloading gradio-5.33.2-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.3 (from gradio)
  Downloading gradio_client-1.10.3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Co

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

2025-06-12 17:05:29.516062: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749747929.751740      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749747929.822929      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

* Running on local URL:  http://127.0.0.1:7860
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://9b658ad882fdf93641.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


