In [None]:
!pip install -q unsloth accelerate datasets peft bitsandbytes trl
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch


# Identifier of the fine-tuned checkpoint handed to from_pretrained below.
model_name = "ypavanr/llama3-taxbot-lora"


# Load the model together with its tokenizer through Unsloth, using 4-bit
# weights (load_in_4bit) and float16 as the requested dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4096,
    dtype=torch.float16,
    load_in_4bit=True,
)


FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Plain-text chat template: system, user and assistant turns are introduced by
# the literal markers <|system|>, <|user|> and <|assistant|>.
prompt = "<|system|>\nYou are a helpful tax advisor AI.\n<|user|>\nlist all the deductions\n<|assistant|>\n"


inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# Number of prompt tokens; generate() returns the prompt followed by the new
# tokens, so this is where the completion starts.
prompt_length = inputs["input_ids"].shape[1]


# do_sample=True is set explicitly so temperature/top_p are guaranteed to be
# used instead of depending on the checkpoint's default generation config.
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)


# Decode only the generated continuation. Slicing off the prompt tokens avoids
# having to locate the reply by searching the decoded text for "<|assistant|>",
# which printed the whole prompt whenever the marker was missing.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
# Keep just the first assistant turn: stop at the next role marker if the model
# keeps going and starts another turn.
response = response.split("<|assistant|>")[0].split("<|user|>")[0].strip()
print(response)


In [None]:
!pip install fastapi uvicorn pyngrok transformers accelerate bitsandbytes peft torch --quiet
!pip install nest_asyncio --quiet


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import uvicorn
from pyngrok import ngrok
import nest_asyncio
from fastapi.middleware.cors import CORSMiddleware

# Register the ngrok credential used by the tunnel opened later in this cell.
# NOTE(review): NGROK_AUTH_TOKEN is not defined in the visible code — it has to
# be assigned before this line runs; confirm where it comes from.
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Patch asyncio to allow nested event loops — presumably needed because the
# notebook kernel already runs one when uvicorn starts.
nest_asyncio.apply()

# The API application; routes are attached to it with decorators.
app = FastAPI()

# CORS is fully open: every origin, HTTP method and request header is allowed.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Request body schema for POST /generate: a JSON object with one string field.
class PromptRequest(BaseModel):
    prompt: str  # user's question; generate() inserts it into the <|user|> turn

@app.post("/generate")
def generate(request: PromptRequest):
    """Answer a tax question with the model loaded in this notebook.

    The user's text is wrapped in the <|system|>/<|user|>/<|assistant|> chat
    template, completed by the module-level ``model`` and decoded with the
    module-level ``tokenizer``.

    Args:
        request: Parsed request body; ``request.prompt`` is the user's text.

    Returns:
        A dict ``{"response": <assistant reply>}`` holding only the text the
        model generated for the first assistant turn.
    """
    full_prompt = f"<|system|>\nYou are a helpful tax advisor AI.\n<|user|>\n{request.prompt}\n<|assistant|>\n"
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
    # generate() returns the prompt tokens followed by the new ones; remember
    # where the completion begins.
    prompt_length = inputs["input_ids"].shape[1]

    # do_sample=True is explicit so temperature/top_p are guaranteed to apply
    # rather than depending on the checkpoint's default generation config.
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # Decode only the continuation. Splitting the full decoded text on
    # "<|assistant|>" is unsafe here: a user prompt containing that marker
    # would make the endpoint return part of the request instead of the reply.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep just the first assistant turn if the model goes on to start another.
    completion = completion.split("<|assistant|>")[0].split("<|user|>")[0].strip()
    return {"response": completion}


# Open an ngrok tunnel for local port 8000 and show what ngrok returned so the
# API can be reached from outside this machine. This is done before starting
# the server because uvicorn.run is expected to block while serving.
public_url = ngrok.connect(8000)
print(f"🔗 Public URL: {public_url}")


# Serve the FastAPI app on port 8000 — the same port passed to ngrok.connect.
uvicorn.run(app, port=8000)                     