In [None]:
!pip install pyngrok

In [None]:
ngrok_token =          # Write your ngrok token here
!ngrok config add-authtoken {ngrok_token}

In [None]:
%%writefile app.py
from fastapi import FastAPI, Request
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata

# --- Configuration ---------------------------------------------------------
# Hugging Face token forwarded to from_pretrained (None = no explicit token).
hf_token = None
# Hub identifier of the causal-LM checkpoint to serve.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# --- Tokenizer / model loading ---------------------------------------------
_cuda_ok = torch.cuda.is_available()
# float16 weights when a GPU is present, float32 otherwise.
_weights_dtype = torch.float16 if _cuda_ok else torch.float32
# Keyword arguments shared by both from_pretrained calls.
_hub_kwargs = dict(trust_remote_code=True, token=hf_token)

tokenizer = AutoTokenizer.from_pretrained(model_name, **_hub_kwargs)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=_weights_dtype,
    **_hub_kwargs,
)
model.eval()  # switch the module to evaluation mode before serving
if _cuda_ok:
    model.to("cuda")

# Define input schema
class Prompt(BaseModel):
    """Request body accepted by the POST /generate endpoint."""
    text: str  # prompt string; handed to the tokenizer unchanged

# Create FastAPI app
app = FastAPI()


@app.post("/generate")
async def generate_text(prompt: Prompt):
    """Generate text from ``prompt.text`` using the module-level model.

    Args:
        prompt: Parsed request body; ``prompt.text`` is tokenized as-is
            (no chat template is applied).

    Returns:
        A dict ``{"response": <str>}`` holding the decoded first output
        sequence with special tokens stripped. The complete sequence returned
        by ``generate`` is decoded (for a decoder-only model this normally
        includes the prompt tokens), capped at 100 newly generated tokens.
    """
    # Keep the whole tokenizer output (input_ids *and* attention_mask) and move
    # it to wherever the model lives, rather than re-checking CUDA here.
    inputs = tokenizer(prompt.text, return_tensors="pt").to(model.device)

    # NOTE: generate() is a synchronous call inside an async handler, so the
    # event loop is blocked while it runs and requests are served one at a time.
    with torch.no_grad():
        # Passing attention_mask explicitly avoids generate() having to infer it.
        output = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"response": response}


In [None]:
from pyngrok import ngrok
import uvicorn
import threading
import time


import socket

APP_PORT = 8000
# Upper bound on how long to wait for app.py to finish importing (model download + load).
STARTUP_TIMEOUT_S = 900


# Run FastAPI app in a separate thread
def run_app():
    """Serve ``app:app`` with uvicorn; blocks the calling thread until the server stops."""
    uvicorn.run("app:app", host="0.0.0.0", port=APP_PORT)


def wait_for_server(server_thread, port, timeout_s, poll_interval_s=2.0):
    """Block until something accepts TCP connections on localhost:``port``.

    Args:
        server_thread: Thread running the server; if it dies we stop waiting.
        port: Local TCP port to probe.
        timeout_s: Maximum number of seconds to wait.
        poll_interval_s: Pause between connection attempts.

    Raises:
        RuntimeError: The server thread exited before the port opened.
        TimeoutError: The port did not open within ``timeout_s`` seconds.
    """
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if not server_thread.is_alive():
            raise RuntimeError(
                "The server thread exited before it started listening; "
                "check the output above for import or startup errors."
            )
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return
        except OSError:
            # Not listening yet (presumably app.py is still loading the model).
            time.sleep(poll_interval_s)
    raise TimeoutError(f"Server did not start listening on port {port} within {timeout_s}s")


# Daemon thread so a stuck server cannot keep the kernel process alive on shutdown.
thread = threading.Thread(target=run_app, daemon=True)
thread.start()

# Wait until the server is actually reachable instead of sleeping a fixed 180 s,
# so the ngrok cell below never tunnels to a port nobody is listening on.
wait_for_server(thread, APP_PORT, STARTUP_TIMEOUT_S)
print(f"Server is listening on port {APP_PORT}")

In [None]:
# Open an ngrok tunnel to the local server port used by the FastAPI app.
# `public_url` stays bound to the tunnel object returned by ngrok.connect.
public_url = ngrok.connect(8000)
# Print the URL string itself rather than the tunnel object's repr.
print(f"Public URL: {public_url.public_url}")