In [None]:
!pip install fastapi uvicorn nest_asyncio pyngrok
!pip install -U bitsandbytes accelerate transformers peft datasets

In [2]:
%%writefile app.py

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch, gc
# Run a garbage collection pass and empty the CUDA cache before loading.
gc.collect()
torch.cuda.empty_cache()

# ====== Load model ======
model_id = "/kaggle/input/eia_model_v1/pytorch/default/1/exmodel"

# 4-bit NF4 quantization with double quantization; fp32 CPU offload is
# enabled so that device_map="auto" may place modules on the CPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=True,
)

# Per-device memory budget: the T4 GPU only has 16GB, so cap GPU 0 at 12GiB.
max_memory = {0: "12GiB", "cpu": "48GiB"}

# Keyword arguments shared by the model load below.
_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "max_memory": max_memory,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)

# The tokenizer is loaded from the same path as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

def generate_code(prompt, max_new_tokens=200):
    """Run the module-level model on ``prompt`` and return the decoded text.

    Args:
        prompt: Text that is tokenized and passed to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first sequence returned by ``model.generate``, decoded by the
        tokenizer with ``skip_special_tokens=True``.
    """
    # Tokenize to PyTorch tensors and move them to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)

    # Sampling is switched off and two beams are requested.
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "do_sample": False,
        "num_beams": 2,
    }
    sequences = model.generate(**encoded, **generation_args)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# ====== API ======
# FastAPI application object; "Excel AI Assistant API" is passed as its title.
app = FastAPI(title="Excel AI Assistant API")

class Instruction(BaseModel):
    # Request body model for POST /generate: one required string field.
    # The value is inserted into the "### Instruction:" section of the prompt.
    instruction: str

def _strip_prompt(prompt, generated):
    """Return ``generated`` without the echoed prompt.

    The decoded output of a decoder-only model starts with the prompt that
    was fed in. The exact prefix is removed when present; otherwise the text
    after the first "### Response:" marker is used (the tokenizer round trip
    may not reproduce the prompt byte-for-byte). If neither is found the text
    is returned unchanged.
    """
    if generated.startswith(prompt):
        return generated[len(prompt):]
    _, marker, tail = generated.partition("### Response:")
    return tail.lstrip() if marker else generated


@app.post("/generate")
def generate(instruction: Instruction):
    """Handle POST /generate: build the prompt, run the model, return JSON.

    Args:
        instruction: Request body whose ``instruction`` field is placed in the
            "### Instruction:" section of the prompt template.

    Returns:
        A dict with the original ``instruction`` and the model's ``response``.
        The response contains only the generated continuation, not the prompt
        template that was sent to the model.
    """
    prompt = f"### Instruction:\n{instruction.instruction}\n\n### Response:\n"
    result = generate_code(prompt)
    # generate_code returns prompt + continuation; drop the prompt so the
    # "response" field does not repeat the instruction already returned above.
    return {
        "instruction": instruction.instruction,
        "response": _strip_prompt(prompt, result),
    }


Writing app.py


In [None]:
import os
from pathlib import Path

import nest_asyncio
from pyngrok import ngrok


def _load_ngrok_token(env_var="NGROK_AUTHTOKEN", token_file="AuthToken.txt"):
    """Return the ngrok auth token without hard-coding it in the notebook.

    The environment variable ``env_var`` is checked first; if it is unset or
    empty, the contents of ``token_file`` are used.

    Raises:
        RuntimeError: If no non-empty token is found in either place.
    """
    token = os.environ.get(env_var, "").strip()
    if not token:
        token_path = Path(token_file)
        if token_path.is_file():
            token = token_path.read_text(encoding="utf-8").strip()
    if not token:
        raise RuntimeError(
            f"ngrok auth token not found: set {env_var} or create {token_file}"
        )
    return token


# Apply the nest_asyncio patch before the tunnel is opened.
nest_asyncio.apply()

# Read the auth token from the environment or AuthToken.txt; a token committed
# in a notebook is leaked to everyone who can see the notebook.
ngrok.set_auth_token(_load_ngrok_token())

# Open a tunnel to local port 8000 and show its public URL.
public_url = ngrok.connect(8000)
print("Public URL:", public_url)

Public URL: NgrokTunnel: "https://f8bd6e11ef07.ngrok-free.app" -> "http://localhost:8000"           


In [None]:
# Serve the `app` object from app.py on all interfaces, port 8000
# (the same port the ngrok tunnel is connected to).
# NOTE(review): --reload starts a file watcher and restarts the server when
# watched files change, which would re-run the model load in app.py — confirm
# this is wanted outside of development.
!uvicorn app:app --host 0.0.0.0 --port 8000 --reload

[32mINFO[0m:     Will watch for changes in these directories: ['/kaggle/working']
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m123[0m] using [36m[1mStatReload[0m
config.json: 100%|█████████████████████████████| 646/646 [00:00<00:00, 4.81MB/s]
2025-09-19 06:47:45.368772: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758264465.561938     125 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758264465.623507     125 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
model.safetensors.index.json: 25.1kB [00:00, 96.6MB/s]
Fetching 2 files:   0%|            