## GLM-4.1V Vibe Test

In [1]:
!pip install -U -q git+https://github.com/huggingface/transformers.git
!pip install flask pyngrok transformers accelerate torch

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


Import the dependencies, then load the GLM-4.1V-9B-Thinking processor and model.

In [2]:
from transformers import AutoProcessor, Glm4vForConditionalGeneration
import torch
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from pyngrok import ngrok
import os
from PIL import Image
import re


# Hub identifier of the checkpoint; the processor and the model are both loaded from it.
MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"

# The processor is used later for chat templating and for decoding generated ids.
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)

# Load the weights as bfloat16 and let device_map="auto" decide their placement.
model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
def infer(messages, max_new_tokens=8192):
    """Generate a reply for a chat conversation and return its extracted answer.

    Args:
        messages: Chat messages handed to ``processor.apply_chat_template``.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        The result of ``extract_answer`` applied to the decoded completion.
    """
    model_inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(model.device)

    sequences = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Slice off the prompt so that only the newly generated tokens are decoded.
    prompt_length = model_inputs["input_ids"].shape[1]
    completion_ids = sequences[0][prompt_length:]
    completion = processor.decode(completion_ids, skip_special_tokens=True)
    return extract_answer(completion)



def extract_answer(text):
    """Pull the content of the first ``<answer>...</answer>`` section out of ``text``.

    Args:
        text: Decoded model output. May be ``None`` or empty.

    Returns:
        The stripped content of the first closed ``<answer>`` section. If the
        opening tag is present but never closed (for example, generation was cut
        off by the token budget), the stripped remainder after the tag is
        returned when it is non-empty. In every other case the fallback message
        ``"Unsupported file format or OCR failed"`` is returned.
    """
    fallback = "Unsupported file format or OCR failed"
    if not text:
        # None or empty output: nothing to search (re.search would raise on None).
        return fallback

    closed = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if closed:
        return closed.group(1).strip()

    # No closed section; salvage a truncated answer if the opening tag exists.
    _, opening_tag, remainder = text.partition("<answer>")
    remainder = remainder.strip()
    if opening_tag and remainder:
        return remainder
    return fallback

In [4]:
# Never commit the ngrok auth token to the notebook: read it from the NGROK_AUTHTOKEN
# environment variable, or prompt for it without echoing when the variable is unset.
# NOTE: a token was previously hardcoded in this cell; treat it as leaked and revoke it
# in the ngrok dashboard.
from getpass import getpass

ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
if not ngrok_auth_token:
    raise RuntimeError("An ngrok auth token is required to open the public tunnel.")
ngrok.set_auth_token(ngrok_auth_token)

In [5]:
# Folder reserved for uploads; created up front if it does not exist yet.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Flask application object that the route handlers are registered on.
app = Flask(__name__)

In [None]:
# Instruction sent with every uploaded image (adjacent literals are concatenated).
DEFAULT_PROMPT = (
    "Please summarize the dialogue and events in this anime image clearly and informatively. "
    "Be concise and accurate, using up to 25 words."
)


In [8]:
@app.route("/extract-text", methods=["POST"])
def extract_text():
    """Summarize an uploaded image with the model and return the result as JSON.

    Expects a multipart POST with the image in the ``file`` form field.

    Returns:
        ``({"text": summary}, 200)`` on success;
        ``({"error": ...}, 400)`` when the file part is missing, has no filename,
        or cannot be decoded as an image;
        ``({"error": ...}, 500)`` when inference raises.
    """
    print("🔥 /extract-text route hit!")
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']
    if not file.filename:
        # Covers both an empty filename and a missing (None) one.
        return jsonify({"error": "No selected file"}), 400

    # Decode directly from the upload stream instead of saving to disk first:
    # nothing accumulates on disk, uploads sharing a filename cannot overwrite
    # each other, and filenames that sanitize to "" no longer cause a 500.
    try:
        with Image.open(file.stream) as uploaded:
            # convert() returns a fully loaded copy, so it outlives the `with`.
            image = uploaded.convert("RGB")
    except Exception:
        return jsonify({"error": "Unsupported file format or unreadable image"}), 400

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": DEFAULT_PROMPT},
                {"type": "image", "image": image}
            ]
        }
    ]

    try:
        summary = infer(messages)
        return jsonify({"text": summary})
    except Exception as e:
        print("❌ Exception:", e)
        return jsonify({"error": "Unsupported file format or OCR failed"}), 500


def run_colab_api(port=5000):
    """Open an ngrok tunnel to the Flask app and serve it until the server stops.

    Args:
        port: Local port used for both the ngrok tunnel and the Flask server.
    """
    public_url = ngrok.connect(port)
    print("Public URL:", public_url)
    try:
        # Blocks here while the server is running.
        app.run(port=port)
    finally:
        # Close the tunnel so re-running the cell does not accumulate open tunnels.
        ngrok.disconnect(public_url.public_url)

In [9]:
# Open the tunnel and start the Flask server; the cell keeps running while the server is up.
run_colab_api()

Public URL: NgrokTunnel: "https://933d076b1b72.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


🔥 /extract-text route hit!


INFO:werkzeug:127.0.0.1 - - [13/Jul/2025 00:12:19] "POST /extract-text HTTP/1.1" 200 -
