In [None]:
# Install the notebook's runtime dependencies into the current environment.
# Most packages are pinned to exact versions; pyngrok, flash-attn, bitsandbytes
# and nest_asyncio are left unpinned, so their versions may differ between runs.
# NOTE(review): `%pip` (instead of `!pip`) targets the running kernel's environment.
!pip install pyngrok transformers==4.40.0 sentencepiece==0.1.99 colorlog==6.8.2 uvicorn==0.29.0 pydantic==2.9.2 torch==2.1.2 flash-attn fastapi==0.111.0 torchvision==0.16.2 accelerate==0.27.2 bitsandbytes Pillow==10.1.0 nest_asyncio

In [None]:
import logging
import os

from colorlog import ColoredFormatter


class SingletonMeta(type):
    """Metaclass that hands out one shared instance per class.

    The first call to a class using this metaclass constructs the instance
    with the supplied arguments. Every later call returns that same object
    and ignores whatever arguments it was given.
    """

    # Maps each class to the single instance created for it.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]

        instance = super().__call__(*args, **kwargs)
        registry[cls] = instance
        return instance


class Logger(metaclass=SingletonMeta):
    """Singleton wrapper that configures a colorized ``logging`` logger.

    The wrapped logger is ``logging.getLogger(__name__)`` with one stream
    handler formatted by ``colorlog.ColoredFormatter``. A custom ``TIMER``
    level (numeric value 25, between INFO and WARNING) is registered and
    exposed on the wrapped logger as ``logger.timer(...)``.

    Because of ``SingletonMeta``, only the first ``Logger(level)`` call runs
    ``__init__``; later calls return the existing instance and their ``level``
    argument is ignored.
    """

    # Numeric value of the custom TIMER level.
    TIMER_LEVEL = 25

    def __init__(self, level=logging.INFO):
        """Configure the wrapped logger.

        Args:
            level: Threshold passed to ``setLevel`` on the wrapped logger.
        """
        logging.addLevelName(self.TIMER_LEVEL, "TIMER")

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(level)

        # logging.getLogger() returns the same object for the same name, so
        # re-running the defining notebook cell (which rebuilds this class and
        # its singleton registry) would otherwise stack a second handler on the
        # logger and print every record twice.
        already_configured = any(
            isinstance(existing.formatter, ColoredFormatter)
            for existing in self.logger.handlers
        )
        if not already_configured:
            handler = logging.StreamHandler()

            # Color formatting
            log_colors = {
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
                "TIMER": "blue",
            }

            formatter = ColoredFormatter(
                "%(log_color)s%(levelname)-8s%(reset)s \033[37m%(message)s\033[0m \033[37m[%(module)s/%(funcName)s/line %(lineno)d]\033[0m",
                datefmt=None,
                reset=True,
                log_colors=log_colors,
            )

            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        # Expose the custom level as a convenience method on the logger itself.
        self.logger.timer = self.timer

    def get_logger(self):
        """Return the configured ``logging.Logger``."""
        return self.logger

    def timer(self, message, *args, **kwargs):
        """Log ``message`` at the custom TIMER level.

        The record is emitted whenever the logger is enabled for TIMER (25),
        e.g. at the default INFO threshold. Previously it was gated on DEBUG
        being enabled, which silently dropped TIMER records at INFO.

        Args:
            message: Log message (may contain ``%``-style placeholders).
            *args: Values for the placeholders in ``message``.
            **kwargs: Extra keyword arguments forwarded to ``Logger.log``.
        """
        # Skip this wrapper frame so %(funcName)s / %(lineno)d in the format
        # string point at the code that called timer(), not at timer() itself.
        kwargs["stacklevel"] = kwargs.get("stacklevel", 1) + 1
        self.logger.log(self.TIMER_LEVEL, message, *args, **kwargs)

# Resolve the log threshold from the LOGGER_LEVEL environment variable
# (case-insensitive, surrounding whitespace ignored); default is INFO.
logger_level = os.getenv("LOGGER_LEVEL", "INFO").strip().upper()
log_level = getattr(logging, logger_level, logging.INFO)
# getattr() can also match module attributes that are not levels (for example
# "BASIC_FORMAT" is a string), which setLevel() would reject. Fall back to INFO
# unless the lookup produced an integer level.
if not isinstance(log_level, int):
    log_level = logging.INFO
# Shared logger used by the rest of the notebook.
logger = Logger(log_level).get_logger()

In [None]:
from PIL import Image
import base64
from io import BytesIO

import logging
import os



def decode_base64_to_image(image_base64: str) -> Image.Image:
    """Decode a base64-encoded image into an RGB ``PIL.Image``.

    Args:
        image_base64: Base64 text of the encoded image file. A data URL
            (``data:<mediatype>;base64,<payload>``) is also accepted; only the
            part after the first comma is decoded.

    Returns:
        The decoded image converted to RGB mode.

    Raises:
        Exception: Whatever ``base64.b64decode`` or Pillow raises for invalid
            input; the error is logged and re-raised unchanged.
    """
    try:
        payload = image_base64.strip()
        # Browsers and many clients send data URLs; the header before the comma
        # is not base64 and would corrupt the decoded bytes if left in place.
        if payload.startswith("data:"):
            payload = payload.split(",", 1)[-1]

        image_data = base64.b64decode(payload)
        # convert() returns a new, fully loaded image, so the source image can
        # be closed as soon as the conversion is done.
        with Image.open(BytesIO(image_data)) as source_image:
            image = source_image.convert("RGB")
        logger.info("Image decoded successfully")
        return image
    except Exception as e:
        # Report through the shared logger, like the other error paths in this notebook.
        logger.error(f"Error decoding base64 image: {str(e)}")
        raise


In [None]:
# Store the ngrok auth token in the local ngrok configuration.
# NOTE(review): `<missing token>` is a placeholder and must be replaced with a real
# token before this cell can work; avoid saving the real token in the notebook.
!ngrok config add-authtoken <missing token>

In [None]:
from transformers import AutoModel, AutoTokenizer
import logging
import os



class MiniCPM_V_2_6_Int4:
    """Wrapper that loads a MiniCPM-V checkpoint and answers questions about an image.

    Call :meth:`load` once before :meth:`infer`.
    """

    def __init__(self, model_name: str = "openbmb/MiniCPM-V-2_6-int4"):
        """Create an unloaded wrapper.

        Args:
            model_name: Hugging Face model identifier passed to
                ``from_pretrained`` by :meth:`load`.
        """
        self.model_name = model_name
        # Populated by load(); None means "not loaded yet".
        self.model = None
        self.tokenizer = None

    def load(self):
        """Load the model and tokenizer and switch the model to eval mode.

        Raises:
            Exception: Any error raised while loading; it is logged and
                re-raised unchanged. On failure the instance stays unloaded.
        """
        try:
            # NOTE: trust_remote_code=True runs Python code shipped with the
            # model repository; only use it with repositories you trust.
            model = AutoModel.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            model.eval()
        except Exception as e:
            logger.error(f"Failed to load model {self.model_name}: {str(e)}")
            raise

        # Publish both attributes only after everything succeeded, so a failed
        # load never leaves a model without its tokenizer.
        self.model = model
        self.tokenizer = tokenizer
        logger.info(f"Model {self.model_name} loaded successfully.")

    def infer(self, base64_image: str, question: str):
        """Ask ``question`` about the image encoded in ``base64_image``.

        Args:
            base64_image: Base64 text of the image, decoded with
                ``decode_base64_to_image``.
            question: Text sent to the model together with the image.

        Returns:
            Whatever ``self.model.chat`` returns (presumably the answer text;
            confirm against the model's documentation).

        Raises:
            RuntimeError: If :meth:`load` has not completed successfully.
            Exception: Any decoding or model error; logged and re-raised.
        """
        try:
            if self.model is None or self.tokenizer is None:
                raise RuntimeError(
                    "Model is not loaded; call load() before infer()."
                )
            image = decode_base64_to_image(base64_image)
            # The image travels inside the message content, which is why the
            # separate `image` argument of chat() is passed as None.
            msgs = [{"role": "user", "content": [image, question]}]
            result = self.model.chat(image=None, msgs=msgs, tokenizer=self.tokenizer)
            logger.info("Inference completed successfully.")
            return result
        except Exception as e:
            logger.error(f"Inference failed: {str(e)}")
            raise


In [None]:
import logging
import os
import threading

import nest_asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel
from pyngrok import ngrok


# Application object that the route decorators below register against.
app = FastAPI()
# Single shared model wrapper used by the /infer route. load() is called right
# away, so running this cell performs the full model load before the routes
# below are defined.
model_instance = MiniCPM_V_2_6_Int4()
model_instance.load()


# Request body accepted by POST /infer.
class MultimodalRequest(BaseModel):
    question: str  # text forwarded to the model as the question
    base64_image: str  # base64-encoded image forwarded to model_instance.infer()


# Response body built by the /infer handler.
class MultimodalResponse(BaseModel):
    prediction: str  # value returned by model_instance.infer() for the request


# Liveness probe: records the call in the log and answers with a fixed
# JSON status payload.
@app.get("/health_check")
async def health_check():
    logger.info("Health check called.")
    status_payload = dict(status="Healthy")
    return status_payload


# Only one inference runs at a time. The handler used to call the model directly
# on the event loop, which serialized requests as a side effect; this lock keeps
# that one-at-a-time behavior now that the call runs in a worker thread.
_inference_lock = threading.Lock()


def _run_inference_locked(base64_image: str, question: str):
    """Run ``model_instance.infer`` while holding the inference lock."""
    with _inference_lock:
        return model_instance.infer(base64_image, question)


@app.post("/infer")
async def infer(infer_request: MultimodalRequest):
    """Answer a question about a base64-encoded image.

    Returns a ``MultimodalResponse`` with the model's prediction. Any failure
    is reported as HTTP 500 with the error message as ``detail``.
    """
    try:
        logger.info("Received inference request.")
        # Model inference is blocking; running it in the threadpool keeps the
        # event loop free so other requests (e.g. /health_check) are still
        # answered while a prediction is being computed.
        prediction = await run_in_threadpool(
            _run_inference_locked,
            infer_request.base64_image,
            infer_request.question,
        )
        logger.info("Returning inference result.")
        return MultimodalResponse(prediction=prediction)
    except Exception as e:
        logger.error(f"Error during inference request: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e


# Port shared by the ngrok tunnel and the Uvicorn server; the two must match.
SERVER_PORT = 9200

# Connect to ngrok on Colab: open a public tunnel to the local server port.
ngrok_tunnel = ngrok.connect(str(SERVER_PORT))
print("Public URL:", ngrok_tunnel.public_url)

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()


if __name__ == "__main__":
    logger.info("Starting the Uvicorn server...")
    # uvicorn.run() blocks until the server shuts down, so the line after it
    # runs at shutdown, not at startup.
    uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT)
    logger.info("Server has stopped.")
