<a href="https://colab.research.google.com/github/taufiq-ai/EXAONE-3.5-2.4b-Pretrained-Finetuning-Quantization/blob/main/Inference_by_ngrok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
# Flask & Tunneling
# pyngrok supplies the `ngrok` client used to open the public tunnel and
# flask-cors supplies `CORS` for the Flask app.
# NOTE(review): flask-ngrok is not imported by any cell visible in this
# notebook (pyngrok is used instead) - likely safe to drop; confirm.
! pip install pyngrok --upgrade
! pip install flask-ngrok --upgrade
! pip install flask-cors

In [None]:
# Transformers Model
# torch + transformers load and run the model, huggingface_hub provides
# `login`, and structlog provides the logger used by the serving code.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve to
# different releases than the ones this notebook was last run with.
! pip install torch transformers huggingface_hub tiktoken structlog

# Import

In [10]:
# Tunnel
from flask import Flask, request, jsonify
from flask_cors import CORS
import torch
from pyngrok import ngrok
# Stop any ngrok process left over from an earlier run so that re-running the
# notebook can open the tunnel again (presumed intent - confirm).
ngrok.kill()

# Models
import warnings
import tiktoken
import structlog
from bs4 import BeautifulSoup
from huggingface_hub import login
from torch import bfloat16
from transformers import AutoModelForCausalLM, AutoTokenizer
# NOTE(review): the names `torch`, `tiktoken` and `BeautifulSoup` are not
# referenced by any cell visible in this notebook - confirm before removing.
# Hide RuntimeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)


# Setup API Keys
# Both tokens are read from Colab's user secrets rather than being written
# into the notebook.
from google.colab import userdata
NGROK_TOKEN = userdata.get('NGROK_TOKEN')
HF_TOKEN = userdata.get('HF_TOKEN')
login(HF_TOKEN)
# Shell escape: IPython substitutes $NGROK_TOKEN with the Python variable
# above before running the ngrok CLI, which saves the token to its config file.
! ngrok config add-authtoken $NGROK_TOKEN

# logging
# Module-level structured logger shared by the inference and serving functions.
logger = structlog.get_logger(__name__)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


# Model

In [11]:
def setup_model(
    model_name_or_local_path:str="LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
    device="auto",
):
    """Load the tokenizer and the causal language model for one checkpoint.

    Args:
        model_name_or_local_path: Identifier handed to both ``from_pretrained``
            calls (a hub model name or a local directory).
        device: Forwarded as ``device_map`` when loading the model.

    Returns:
        tuple: ``(model, tokenizer)`` - note the model comes first.
    """
    source = model_name_or_local_path

    # The tokenizer is loaded first, then the model from the same source.
    tokenizer = AutoTokenizer.from_pretrained(source)

    # Weights are requested in bfloat16; trust_remote_code=True is passed so
    # that code shipped with the checkpoint may be used to build the model.
    load_options = {
        "torch_dtype": bfloat16,
        "trust_remote_code": True,
        "device_map": device,
    }
    model = AutoModelForCausalLM.from_pretrained(source, **load_options)

    return model, tokenizer


def infer_model(prompt, tokenizer, model, max_tokens=200, device="cuda"):
    """Run one chat completion and return the raw and the extracted reply.

    Args:
        prompt: Either a plain user message (``str`` or a ``str`` subclass),
            which is wrapped in a default system + user conversation, or an
            already-built list of chat messages that is handed to the chat
            template unchanged.
        tokenizer: Object providing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        model: Object providing ``generate``.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.
        device: Device the input ids are moved to before generation.
            NOTE(review): this is independent of where the model was placed
            when it was loaded - confirm the two always match.

    Returns:
        tuple: ``(completion, content)``. ``completion`` is the decoded full
        output sequence; ``content`` is the text after the last
        ``[|assistant|]`` marker, cut at the first ``[|endofturn|]``.

    Raises:
        ValueError: If ``prompt`` is ``None`` (e.g. the request carried no
            prompt), instead of failing later inside the chat template.
    """
    if prompt is None:
        raise ValueError("prompt is required: pass a string or a list of chat messages")

    # isinstance (rather than an exact type comparison) so that str subclasses
    # are treated as plain user messages too.
    if isinstance(prompt, str):
        messages = [
            {"role": "system",
             "content": "You are a helpful e-commerce customer support chatbot."},
            {"role": "user", "content": prompt}
        ]
    else:
        messages = prompt

    logger.info("Inference Started", messages=messages)

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    # Sampling is disabled (do_sample=False).
    output = model.generate(
        input_ids.to(device),
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_tokens,
        do_sample=False,
    )

    # The decoded text is split on the chat markers to isolate the final
    # assistant turn.
    completion = tokenizer.decode(output[0])
    content = completion.split("[|assistant|]")[-1].split("[|endofturn|]")[0]
    logger.info("Generation Done", content=content)
    return completion, content

# Run Model & Tunneling

In [None]:
# Load the default checkpoint once; the request handler defined below uses
# these module-level `model` and `tokenizer` objects.
model, tokenizer = setup_model()

In [14]:
def handle_inferecne(payload:dict, device="cuda"):
    """Run inference for one request payload and return the reply text.

    The function name is misspelled but kept as-is because callers use it.

    Args:
        payload: Request body. ``prompt`` is forwarded to ``infer_model``;
            ``max_tokens`` is optional and falls back to 200 when it is
            missing or ``None``.
        device: Forwarded to ``infer_model``.

    Returns:
        The extracted assistant reply (second element of ``infer_model``'s
        result).

    Raises:
        ValueError, TypeError: If ``max_tokens`` cannot be converted to an
            integer.
    """
    prompt = payload.get('prompt')

    # Honour the caller's limit. Previously the value was read and logged but
    # a hard-coded 200 was always passed to the model.
    max_tokens = payload.get('max_tokens')
    max_tokens = 200 if max_tokens is None else int(max_tokens)

    logger.info("Inference Starting", prompt=prompt, max_tokens=max_tokens)
    _, content = infer_model(prompt, tokenizer, model, max_tokens=max_tokens, device=device)
    return content

def run_app(port=5000, domain="closely-vital-puma.ngrok-free.app"):
    """Serve the inference endpoint through Flask and expose it via ngrok.

    Registers ``POST /gpu-inference``, opens an ngrok tunnel to the local
    port, prints the public URL and then runs the Flask server. The call
    returns only when the server stops; the tunnel is closed at that point so
    a later call can open it again.

    Args:
        port: Local port used for both the Flask server and the tunnel target.
        domain: Static ngrok domain to request. Pass ``None`` (or an empty
            string) to omit the option from the tunnel request.
    """
    app = Flask(__name__)
    CORS(app)  # enable CORS handling for the app's routes

    @app.route('/gpu-inference', methods=['POST'])
    def flask_inference():
        try:
            payload = request.json
            logger.info("Received inference request", payload=payload)
            result = handle_inferecne(payload=payload)

            # Proper response formatting
            return jsonify({
                "status": "success",
                "result": result
            }), 200
        except Exception as e:
            # Record the traceback server-side; the client only receives the
            # message text.
            logger.exception("Inference request failed")
            return jsonify({
                "status": "error",
                "message": str(e)
            }), 500

    # Start ngrok; the domain option is only sent when one is configured.
    tunnel_options = {"addr": port}
    if domain:
        tunnel_options["domain"] = domain
    public_url = ngrok.connect(**tunnel_options)
    print(f" * Public URL: {public_url}")

    try:
        app.run(port=port)
    finally:
        # Best-effort cleanup: a failure to close the tunnel is logged and
        # must not mask whatever stopped the server.
        try:
            ngrok.disconnect(public_url.public_url)
        except Exception:
            logger.exception("Failed to close ngrok tunnel")

In [15]:
# Opens the tunnel and starts the Flask server. The cell keeps running and
# streams request logs until it is interrupted.
run_app()

 * Public URL: NgrokTunnel: "https://closely-vital-puma.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


2025-01-09 23:47:36 [info     ] Received inference request     payload={'prompt': 'Hi Exa! How are you?', 'max_tokens': 200}
2025-01-09 23:47:36 [info     ] Inference Starting             max_tokens=200 prompt=Hi Exa! How are you?
2025-01-09 23:47:36 [info     ] Inference Started              messages=[{'role': 'system', 'content': 'You are a helpful e-commerce customer support chatbot.'}, {'role': 'user', 'content': 'Hi Exa! How are you?'}]
2025-01-09 23:47:41 [info     ] Generation Done                content=Hello! Thank you for asking. As an AI assistant, I don't experience feelings like humans do, but I'm here and ready to help you with any questions or concerns you might have about products, orders, or anything else related to our services! How can I assist you today?


INFO:werkzeug:127.0.0.1 - - [09/Jan/2025 23:47:41] "POST /gpu-inference HTTP/1.1" 200 -


2025-01-09 23:49:02 [info     ] Received inference request     payload={'prompt': 'Hi Exa! Is LD is a japanese tobaco brand?', 'max_tokens': 200}
2025-01-09 23:49:02 [info     ] Inference Starting             max_tokens=200 prompt=Hi Exa! Is LD is a japanese tobaco brand?
2025-01-09 23:49:02 [info     ] Inference Started              messages=[{'role': 'system', 'content': 'You are a helpful e-commerce customer support chatbot.'}, {'role': 'user', 'content': 'Hi Exa! Is LD is a japanese tobaco brand?'}]
2025-01-09 23:49:11 [info     ] Generation Done                content=Hello! You're asking about LD, which could refer to different products depending on the context, especially in the tobacco industry where brand names can vary widely across regions. If you're referring to a specific tobacco brand named "LD," it's important to clarify which country or region you're interested in, as tobacco brands often have localized names or variations.

For example:
- In Japan, tobacco brands might

INFO:werkzeug:127.0.0.1 - - [09/Jan/2025 23:49:11] "POST /gpu-inference HTTP/1.1" 200 -


# How does inference work with the public endpoint?


In [None]:
# Example client call against the public endpoint.
# NOTE(review): the serving cell stays busy while the server is up, so this
# presumably has to be run from a different notebook or machine - confirm.
import requests
headers = {'Content-Type': 'application/json'}
endpoint = "https://closely-vital-puma.ngrok-free.app/"+"gpu-inference"
# The same JSON fields the server reads from the request body.
payload = {"prompt":"Hi Exa! How are you?", "max_tokens":200}
response = requests.post( endpoint, json=payload, headers=headers, timeout=30)
# On success the body is {"status": "success", "result": <reply text>};
# on failure it is {"status": "error", "message": <error text>}.
response.json()