<a href="https://colab.research.google.com/github/zmengjie/ML-Web-Interface/blob/main/mistral_llm_api_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fastapi uvicorn nest_asyncio ctransformers



In [2]:
!pip install sentence-transformers faiss-cpu




In [5]:
# 🔎 STEP 2–6: Local RAG with SentenceTransformer + FAISS

!pip install sentence-transformers faiss-cpu --quiet

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 🧠 Step 1: Local knowledge base — one short passage per list entry.
# Positions in this list are the ids stored in the FAISS index built below,
# so `docs` must not be reordered without rebuilding the index.
docs = [
    "Newton's Method is an iterative algorithm used to find the roots of a real-valued function using its derivative.",
    "The update rule is x_{n+1} = x_n - f(x_n)/f'(x_n).",
    "Backtracking line search adjusts the step size to satisfy the Armijo condition.",
    "Gradient descent finds the local minimum by following the negative gradient of a function.",
    "The Armijo condition ensures sufficient decrease in loss during optimization.",
    # Add more chunks from .txt/.md files if needed
]

# 🧠 Step 2: Embed every passage and load the vectors into an inner-product index.
# The embeddings are normalized at encode time, and the index scores by inner product.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = embedder.encode(docs, normalize_embeddings=True)
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(doc_embeddings)

# 🔍 Step 3: Define retrieval function
def retrieve_chunks(query, top_k=3):
    """Return the passages from ``docs`` that best match ``query``.

    The query is embedded with the module-level ``embedder`` (normalized, like the
    indexed passages) and searched against the module-level FAISS ``index``.

    Args:
        query: Free-text question to look up.
        top_k: Maximum number of passages to return. Values larger than the
            number of indexed passages are clamped; values <= 0 yield ``[]``.

    Returns:
        A list of at most ``top_k`` strings from ``docs``, best match first.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the missing
    # slots with id -1, and docs[-1] would silently return the last passage.
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], normalize_embeddings=True)
    _scores, indices = index.search(np.asarray(query_embedding, dtype=np.float32), k)

    # Defensive filter: skip any padding id that still comes back.
    return [docs[i] for i in indices[0] if i >= 0]

# 🧠 Step 4: RAG-enhanced prompt template
def format_prompt_with_rag(user_query):
    """Assemble the model prompt for ``user_query``.

    The prompt has three parts: a fixed assistant-role line, a ``Background:``
    section containing the passages returned by ``retrieve_chunks`` (one per
    line), and the whitespace-stripped question followed by ``Answer:``.

    Args:
        user_query: The raw question text.

    Returns:
        The complete prompt string.
    """
    background = "\n".join(retrieve_chunks(user_query))
    question = user_query.strip()

    role_section = "You are a helpful assistant specializing in mathematics and optimization.\n\n"
    background_section = f"Background:\n{background}\n\n"
    question_section = f"Question: {question}\n\nAnswer:"
    return role_section + background_section + question_section


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# 🚀 STEP 1: Load Mistral-7B-Instruct Model
from ctransformers import AutoModelForCausalLM

# Model selection, gathered in one place so it is easy to swap.
MODEL_REPO = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"  # Use Q5_1 if GPU supports
MODEL_TYPE = "mistral"
GPU_LAYERS = 40  # Tune for Colab T4 (40) or A100 (60+)

# `model` is called directly by the /generate endpoint to produce text.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type=MODEL_TYPE,
    gpu_layers=GPU_LAYERS,
)

# 🚀 STEP 2: FastAPI + Background Uvicorn Setup
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import nest_asyncio
import threading
import re

# Apply the nest_asyncio patch before the server is started from this notebook.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop that uvicorn would otherwise conflict with — confirm.
nest_asyncio.apply()
# ASGI application object: the /generate route below is registered on it and
# run_api() hands it to uvicorn.
app = FastAPI()

# === Input schema ===
class Prompt(BaseModel):
    """Request body accepted by the ``/generate`` endpoint."""

    # Free-text user question; the endpoint passes it to format_prompt().
    query: str

# === Utility: prompt builder used by the endpoint ===
def format_prompt(user_query):
    """Return the final model prompt for ``user_query``.

    Thin indirection over ``format_prompt_with_rag`` so the endpoint does not
    depend on which prompt template is in use.
    """
    rag_prompt = format_prompt_with_rag(user_query)
    return rag_prompt

# === Endpoint for generating answers ===
# Generation budgets and the "too short" threshold used by the endpoint.
_FIRST_PASS_MAX_TOKENS = 2048
_RETRY_MAX_TOKENS = 1600
_MIN_ANSWER_WORDS = 40
# If the answer's final *word* is one of these, the model stopped mid-clause.
_DANGLING_WORDS = frozenset({"and", "or", "with", "to", "from", "in", "of"})


def _looks_truncated(text):
    """Return True when ``text`` is very short or stops on a comma/connective."""
    words = text.split()
    if len(words) < _MIN_ANSWER_WORDS:
        return True
    # Compare the whole last word, not a raw suffix: a suffix test would also
    # fire on ordinary words such as "vector", "into", "begin" or "proof" and
    # trigger a needless second generation.
    return text.endswith(",") or words[-1] in _DANGLING_WORDS


def _trim_to_last_sentence(text):
    """Cut ``text`` after its last '.', '?' or '!' (unchanged if it has none)."""
    if text.endswith(('.', '?', '!')):
        return text
    return re.sub(r'([.?!])[^.?!]*$', r'\1', text)


@app.post("/generate")
def generate(prompt: Prompt):
    """Answer ``prompt.query`` with the local model.

    The prompt is built with ``format_prompt``, generated once, regenerated
    once if the first answer looks truncated, then trimmed to its last
    complete sentence.

    Returns:
        ``{"response": <text>}``. Failures are also reported through the
        ``response`` field rather than raised, so clients always receive the
        same JSON shape.
    """
    try:
        final_prompt = format_prompt(prompt.query)

        answer = model(final_prompt, max_new_tokens=_FIRST_PASS_MAX_TOKENS).strip()

        # Retry if clearly too short or ends poorly
        if _looks_truncated(answer):
            answer = model(final_prompt, max_new_tokens=_RETRY_MAX_TOKENS).strip()

        return {"response": _trim_to_last_sentence(answer)}

    except Exception as e:
        # Deliberate best-effort: surface the error text to the client.
        return {"response": f"❌ Error generating response: {str(e)}"}

# === Background server thread ===
def run_api():
    """Serve ``app`` with uvicorn on all interfaces, port 7860.

    Intended as the target of the background thread started below, so the
    notebook stays usable while the API is up.
    """
    uvicorn.run(app, port=7860, host="0.0.0.0")


# Keep a handle on the thread so it can be inspected later (e.g. is_alive()).
api_thread = threading.Thread(target=run_api)
api_thread.start()



Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# SECURITY: an ngrok authtoken is a credential and must never be committed in a notebook.
# The token that used to be hardcoded on this line is exposed and should be revoked in
# the ngrok dashboard. Provide a fresh one through the NGROK_AUTHTOKEN environment
# variable, or type it at the hidden prompt below.
import os
import subprocess
from getpass import getpass

ngrok_authtoken = (os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")).strip()
if not ngrok_authtoken:
    raise ValueError("An ngrok authtoken is required (set NGROK_AUTHTOKEN or enter it at the prompt).")

# Pass the token as a separate argv element (no shell), so it is never interpolated
# into a command string.
ngrok_result = subprocess.run(
    ["ngrok", "config", "add-authtoken", ngrok_authtoken],
    check=True,
    capture_output=True,
    text=True,
)
print(ngrok_result.stdout.strip())


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# Install the localtunnel CLI (`lt`) globally via npm.
!npm install -g localtunnel

# Launch tunnel (choose any available subdomain or let it randomize).
# The port must match the one uvicorn listens on in run_api() (7860).
# `your-llm-app-name` is a placeholder: with it the public URL is
# https://your-llm-app-name.loca.lt — replace it with your own subdomain.
!lt --port 7860 --subdomain your-llm-app-name


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K
changed 22 packages in 4s
[1G[0K⠙[1G[0K
[1G[0K⠙[1G[0K3 packages are looking for funding
[1G[0K⠙[1G[0K  run `npm fund` for details
[1G[0K⠙[1G[0Kyour url is: https://your-llm-app-name.loca.lt
INFO:     35.197.92.111:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.197.92.111:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.197.92.111:0 - "POST /generate HTTP/1.1" 200 OK
INFO:     35.197.92.111:0 - "POST /generate HTTP/1.1" 200 OK
