In [1]:
%%capture
# Clone the IndicTrans2 repository into the current directory; %%capture hides the command output.
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [2]:
%%capture
# Switch into the huggingface_interface folder of the cloned repo (absolute path under /content).
%cd /content/IndicTrans2/huggingface_interface

In [3]:
%%capture
# Core Python dependencies. The version specifier MUST be quoted: unquoted, the
# shell treats `>` as output redirection, so pip installs an unconstrained
# `transformers` and writes its output to a file named `=4.33.2`.
!python3 -m pip install nltk sacremoses pandas regex mock "transformers>=4.33.2" mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

# Install IndicTransToolkit from source in editable mode, then return to the parent directory.
!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

In [4]:
# Dependencies for the PDF-upload and chatbot cells. `peft` and `gradio` are
# imported by the chatbot cell, so they are installed here as well instead of
# relying on the runtime image happening to ship them.
!pip install pymongo sentence-transformers torch transformers langchain_community pymupdf tools peft gradio --quiet


**After installing the above packages, restart the runtime and then run the code below.**

# PDF Upload

In [3]:
# Make sure PyMuPDF is installed in this runtime before it is imported below.
!pip install --quiet pymupdf

import pymupdf
import torch
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from google.colab import files

# === MongoDB ===
# The connection string is a secret and must not live in the notebook. It is
# read from the MONGO_URI environment variable, falling back to a hidden prompt.
# NOTE(review): the password that used to be hardcoded here has been exposed
# wherever this notebook was shared and should be rotated.
import os
from getpass import getpass

mongo_uri = os.environ.get("MONGO_URI") or getpass("MongoDB connection URI: ")
client = MongoClient(mongo_uri)
temp_coll = client["msme_schemes_db"]["uploaded_pdf_temp"]

# === Embedding Model ===
# Runs on the GPU when one is available, otherwise on the CPU.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")

# === Upload PDF ===
# files.upload() returns a mapping keyed by uploaded file name; the first key is
# used as the PDF path. An empty mapping (cancelled dialog) is reported clearly
# instead of surfacing as a bare StopIteration.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF.")
pdf_path = next(iter(uploaded))

# === Chunk Function ===
def chunk_text(text, chunk_size=350, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk is emitted that is
    entirely contained in the previous one.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_size`` made the window stand still or
            move backwards, so the loop never terminated.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # This window already covers the last word; a further window would
        # only repeat words from the overlap region.
        if start + chunk_size >= len(tokens):
            break
    return chunks

# === Process PDF ===
# Extract the text of every page, chunk it, embed the chunks and replace the
# contents of the temporary MongoDB collection with the new document.
try:
    # The context manager closes the PDF handle even if extraction fails.
    with pymupdf.open(pdf_path) as doc:
        full_text = "\n".join(page.get_text().strip() for page in doc)

    # Decide on the chunks rather than on `full_text`: for a multi-page scanned
    # PDF the joined text is just newlines, which is truthy and previously
    # slipped through, wiping the collection and storing zero chunks.
    chunks = chunk_text(full_text)
    if not chunks:
        print("❌ No extractable text found. PDF might be scanned.")
    else:
        # One batched encode call instead of one model call per chunk.
        embeddings = embed_model.encode(chunks)
        doc_chunks = [
            {"chunk_id": i, "chunk_text": c, "embedding": e.tolist()}
            for i, (c, e) in enumerate(zip(chunks, embeddings))
        ]
        # Only one uploaded PDF is kept at a time: clear the collection first.
        temp_coll.delete_many({})
        temp_coll.insert_one({"source": "user_uploaded", "rag_chunks": doc_chunks})
        print(f"✅ Stored {len(doc_chunks)} chunks in MongoDB.")
except Exception as e:
    # Best-effort cell: report the failure instead of raising a traceback.
    print(f"❌ PDF processing failed: {e}")


Saving T-PRIDE Scheme.pdf to T-PRIDE Scheme (2).pdf
✅ Stored 93 chunks in MongoDB.


In [None]:
import gradio as gr
import torch, re
from pymongo import MongoClient
from datetime import datetime
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, pipeline,
    AutoModelForSeq2SeqLM
)
from sentence_transformers import SentenceTransformer, util
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from IndicTransToolkit.processor import IndicProcessor
from peft import PeftModel

# === MongoDB Setup ===
# The connection string is a secret and must not live in the notebook. It is
# read from the MONGO_URI environment variable, falling back to a hidden prompt.
# NOTE(review): the password that used to be hardcoded here has been exposed
# wherever this notebook was shared and should be rotated.
import os
from getpass import getpass

mongo_uri = os.environ.get("MONGO_URI") or getpass("MongoDB connection URI: ")
client = MongoClient(mongo_uri)
db = client["msme_schemes_db"]
udyam_coll = db["udyam_profiles"]            # enterprise profiles, looked up by Udyam_ID
schemes_chunk_coll = db["schemes_chunks_only"]  # per-scheme chunk embeddings used for matching
schemes_info_coll = db["schemes_embedded"]      # per-scheme detail lists (eligibility, benefits, ...)
query_logs_coll = db["query_logs"]
temp_coll = db["uploaded_pdf_temp"]          # chunks of the most recently uploaded PDF

# === LLM + Embeddings ===
# Half precision when CUDA is available, full precision otherwise; the sentence
# embedder follows the same CUDA/CPU choice.
_cuda_ok = torch.cuda.is_available()
_model_dtype = torch.float16 if _cuda_ok else torch.float32
_ADAPTER_REPO = "Vipplav/gemma-finetuned-faq"

tokenizer = AutoTokenizer.from_pretrained(_ADAPTER_REPO, use_fast=True)

# Load the base Gemma checkpoint, then wrap it with the weights from the adapter repo.
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    device_map="auto",
    torch_dtype=_model_dtype,
)
model = PeftModel.from_pretrained(
    base_model,
    _ADAPTER_REPO,
    device_map="auto",
    torch_dtype=_model_dtype,
)

# Deterministic decoding (no sampling), capped at 150 newly generated tokens.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    do_sample=False,
)
llm = HuggingFacePipeline(pipeline=generator)

embed_model = SentenceTransformer(
    "BAAI/bge-small-en-v1.5",
    device="cuda" if _cuda_ok else "cpu",
)

# === IndicTrans2 Setup ===
# English -> Indic translation model and its pre/post-processor, placed on the
# GPU when available and switched to evaluation mode.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ip = IndicProcessor(inference=True)

_INDIC_CHECKPOINT = "ai4bharat/indictrans2-en-indic-1B"
translator_tokenizer = AutoTokenizer.from_pretrained(_INDIC_CHECKPOINT, trust_remote_code=True)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(_INDIC_CHECKPOINT, trust_remote_code=True)
translator_model = translator_model.to(DEVICE).eval()

def translate_to_telugu(text):
    """Translate English ``text`` to Telugu with the IndicTrans2 model.

    Blank or whitespace-only input is not sent to the model; a warning string
    is returned instead. Otherwise the text is pre-processed, tokenized, decoded
    with 5-beam search (``max_length=256``) and post-processed, and the single
    translated string is returned.
    """
    if text.strip() == "":
        return "⚠️ Nothing to translate."

    prepared = ip.preprocess_batch([text], src_lang="eng_Latn", tgt_lang="tel_Telu")
    encoded = translator_tokenizer(prepared, return_tensors="pt", padding=True).to(DEVICE)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated = translator_model.generate(**encoded, max_length=256, num_beams=5)

    translations = ip.postprocess_batch(
        translator_tokenizer.batch_decode(generated, skip_special_tokens=True),
        lang="tel_Telu",
    )
    return translations[0]

# === Prompt Template ===
# Prompt asking the LLM to condense an enterprise profile into a one-line
# search query. The `{profile_summary}` placeholder is filled in by
# `generate_search_query` via `rephrase_template.format(...)`.
rephrase_template = PromptTemplate.from_template("""
You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
Only return the query. Avoid comments.
Enterprise Profile:
{profile_summary}
""")

# === MSME Utilities ===
def normalize_udyam(uid):
    """Canonicalise a Udyam ID: trim surrounding whitespace, upper-case it, and drop space characters."""
    trimmed_upper = uid.strip().upper()
    # Splitting on the literal space and re-joining removes every " " character
    # (other interior whitespace such as tabs is left untouched).
    return "".join(trimmed_upper.split(" "))
def is_valid_udyam(uid):
    """Return True only if ``uid`` is exactly of the form ``UDYAM-XX-00-000000``.

    The expected shape is the literal ``UDYAM-``, two upper-case letters, two
    digits, and a final group of six or seven digits, separated by hyphens.

    ``re.fullmatch`` is used rather than ``re.match`` with a ``$`` anchor,
    because ``$`` also matches just before a trailing newline, so an ID
    followed by a line break used to be accepted as valid.
    """
    return re.fullmatch(r"UDYAM-[A-Z]{2}-\d{2}-\d{6,7}", uid) is not None
def get_profile_by_uid(uid):
    """Look up an enterprise profile by Udyam ID.

    The ID is normalised first; if it does not pass format validation no query
    is made and ``None`` is returned. Otherwise the result of the
    ``udyam_coll`` lookup (with ``_id`` excluded) is returned as-is.
    """
    canonical_id = normalize_udyam(uid)
    if not is_valid_udyam(canonical_id):
        return None
    return udyam_coll.find_one({"Udyam_ID": canonical_id}, {"_id": 0})

def summarize_profile(p):
    """Render an enterprise profile mapping as a three-sentence English summary.

    ``p`` must provide the keys read below; the investment and turnover values
    are formatted with thousands separators and the organisation type is
    lower-cased.
    """
    name = p['Enterprise Name']
    state = p['State']
    activity = p['Major Activity']
    gender = p['Gender']
    enterprise_type = p['Enterprise Type']
    organisation = p['Organisation Type'].lower()
    employees = p['Employment']
    investment = f"{p['Investment Cost (In Rs.)']:,}"
    turnover = f"{p['Net Turnover (In Rs.)']:,}"

    sentences = [
        f"The user represents an enterprise named '{name}', based in {state}, "
        f"operating in the {activity} sector.",
        f"They identify as {gender}, run a {enterprise_type} sized {organisation} organization.",
        f"The enterprise has {employees} employees, with an investment of ₹{investment} "
        f"and a turnover of ₹{turnover}.",
    ]
    return " ".join(sentences)

def generate_search_query(profile):
    """Ask the LLM for a one-line scheme search query describing ``profile``.

    Returns:
        ``(query, summary)``: the first line of the stripped LLM output (itself
        stripped), and the profile summary that was inserted into the prompt.
    """
    summary = summarize_profile(profile)
    prompt = rephrase_template.format(profile_summary=summary)
    raw_output = llm.invoke(prompt)
    first_line = raw_output.strip().split("\n")[0]
    return first_line.strip(), summary

def get_top_matching_schemes(q, top_k=5):
    """Rank schemes by cosine similarity between ``q`` and their stored chunk embeddings.

    Every chunk that carries an ``embedding`` is scored against the query
    embedding. Scores are sorted from highest to lowest and each scheme is
    reported once, at the position of its best-scoring chunk.

    Returns:
        Up to ``top_k`` dicts with keys ``score``, ``scheme_id`` and
        ``scheme_name``, best match first.
    """
    query_vec = embed_model.encode(q, convert_to_tensor=True)

    scored = []
    cursor = schemes_chunk_coll.find({"rag_chunks": {"$exists": True}})
    for scheme_doc in cursor:
        for chunk in scheme_doc["rag_chunks"]:
            if "embedding" not in chunk:
                continue
            chunk_vec = torch.tensor(chunk["embedding"]).to(query_vec.device)
            similarity = util.cos_sim(query_vec, chunk_vec)[0][0].item()
            scored.append((similarity, scheme_doc["scheme_id"], scheme_doc["scheme_name"]))

    # Highest similarity first; the sort is stable, so ties keep insertion order.
    scored.sort(key=lambda item: item[0], reverse=True)

    picked_ids = set()
    ranked = []
    for similarity, scheme_id, scheme_name in scored:
        if scheme_id not in picked_ids:
            picked_ids.add(scheme_id)
            ranked.append({"score": similarity, "scheme_id": scheme_id, "scheme_name": scheme_name})
        if len(ranked) == top_k:
            break
    return ranked

def fetch_scheme_field_llm(scheme_id, query):
    """Answer a follow-up question about one scheme by summarising a stored section.

    The section is chosen by the first keyword (in the order listed below) that
    occurs in the lower-cased ``query``. At most the first five items of that
    section are sent to the LLM for summarisation.

    Args:
        scheme_id: Identifier used to look the scheme up in ``schemes_info_coll``.
        query: The user's free-text question.

    Returns:
        The stripped LLM summary, or a hint listing the supported topics when
        no keyword matches, the scheme is not found, or the section is absent.
    """
    # Keyword -> document field. "docs" and "document" are listed explicitly:
    # the chatbot invites users to ask about "docs", and neither word contains
    # the substring "documents", so both previously fell through to the hint.
    fmap = {
        "eligibility": "eligibility_list",
        "benefits": "key_benefits_list",
        "assistance": "assistance_list",
        "apply": "how_to_apply_list",
        "documents": "required_documents_list",
        "document": "required_documents_list",
        "docs": "required_documents_list",
    }
    lowered = query.lower()
    key = next((field for keyword, field in fmap.items() if keyword in lowered), None)
    doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
    if key and doc and key in doc:
        # str() keeps the join from failing if a list item is not a string.
        text = "\n".join(str(item) for item in doc[key][:5])
        p = (
            f"Summarize for business owners:\nScheme: {doc['scheme_name']}\n"
            f"Section: {key.replace('_list','').title()}\n\n{text}"
        )
        return llm.invoke(p).strip()
    return "❌ Ask eligibility, benefits, how to apply, or documents."

# === State ===
# Conversation state for the MSME chatbot; module-level, so it is shared by
# every caller of chatbot().
chat_state = {
    "stage": 0,          # current step of the conversation flow
    "profile": {},       # enterprise profile (loaded or entered manually)
    "scheme_id": None,   # scheme that follow-up questions refer to
    "last_bot_msg": "",  # most recent chatbot reply (used for translation)
    "summary": "",       # text summary of the profile
}
# Most recent PDF Q&A message (used for translation).
pdf_state = {"last_pdf_msg": ""}

# === MSME Chatbot Logic (store last_bot_msg for all replies) ===
def chatbot(msg, history):
    """Advance the scheme-finder conversation by one turn and return the reply.

    The flow is driven by ``chat_state["stage"]``:

    * 0 - greet the user and ask for a Udyam ID or ``manual``.
    * 1 - load a profile by Udyam ID, or switch to manual entry.
    * 2 - collect the profile fields one message at a time.
    * 3 - on a message containing "scheme", recommend matching schemes.
    * 4 - answer follow-up questions about the top recommended scheme.

    Every reply is also stored in ``chat_state["last_bot_msg"]``.

    Args:
        msg: The user's message.
        history: Chat history passed by the chat UI; not used here.

    Returns:
        The reply text.

    NOTE(review): ``chat_state`` is a module-level dict, so all users of the
    launched app share one conversation - confirm this is acceptable.
    """
    # Normalise once so stray surrounding whitespace never affects matching.
    text = (msg or "").strip()
    lowered = text.lower()
    stage = chat_state["stage"]

    if stage == 0:
        response = "👋 Enter Udyam ID or type 'manual'."
        chat_state["stage"] = 1

    elif stage == 1:
        if lowered.startswith("udyam-"):
            profile = get_profile_by_uid(text)
            if profile:
                summary = summarize_profile(profile)
                response = f"✅ Profile loaded:\n{summary}\nType 'show related schemes'."
                chat_state.update({"profile": profile, "stage": 3, "summary": summary})
            else:
                response = "❌ Invalid Udyam ID. Try again or type 'manual'."
        elif "manual" in lowered:
            response = "📝 What's your enterprise name?"
            chat_state["stage"] = 2
        else:
            response = "Please enter a valid Udyam ID or 'manual'."

    elif stage == 2:
        fields = [
            "Enterprise Name","Gender","Enterprise Type","Organisation Type",
            "Major Activity","State","Investment Cost (In Rs.)",
            "Net Turnover (In Rs.)","Employment"
        ]
        profile = chat_state["profile"]
        # The number of answers collected so far selects the field being filled.
        idx = len(profile)
        key = fields[idx]

        value = None
        if not text:
            # Blank answer: ask for the same field again.
            response = f"{key}?"
        elif any(x in key for x in ["Cost","Turnover","Employment"]):
            # Numeric fields: accept thousands separators, and re-ask instead
            # of crashing the handler when the answer is not a whole number.
            try:
                value = int(text.replace(",", ""))
            except ValueError:
                response = f"⚠️ Please enter a whole number for {key}."
        else:
            value = text

        if value is not None:
            profile[key] = value
            if len(profile) == len(fields):
                summary = summarize_profile(profile)
                response = f"✅ Profile saved:\n{summary}\nType 'show related schemes'."
                chat_state.update({"stage": 3, "summary": summary})
            else:
                response = f"{fields[idx+1]}?"

    elif stage == 3:
        if "scheme" in lowered:
            query, _ = generate_search_query(chat_state["profile"])
            results = get_top_matching_schemes(query)
            if not results:
                response = "⚠️ No schemes matched."
            else:
                response = "📈 Recommended Schemes:\n" + "\n".join(
                    f"{i+1}. {r['scheme_name']} (Score: {round(r['score'],4)})"
                    for i, r in enumerate(results)
                ) + "\nAsk about eligibility, docs, or apply."
                # Follow-up questions refer to the best-scoring scheme.
                chat_state.update({"scheme_id": results[0]["scheme_id"], "stage": 4})
        else:
            # The state is fine here; guide the user instead of reporting an error.
            response = "Type 'show related schemes' to see matching schemes."

    elif stage == 4:
        response = fetch_scheme_field_llm(chat_state["scheme_id"], text)

    else:
        response = "⚠️ Unexpected state. Please restart."

    # store and return
    chat_state["last_bot_msg"] = response
    return response

# === Translate Last Scheme Response ===
def translate_last_response():
    """Return the Telugu translation of the chatbot's most recent reply."""
    # chatbot() records every reply, so this is non-empty once it has run at least once.
    latest_reply = chat_state["last_bot_msg"]
    return translate_to_telugu(latest_reply)

# === PDF Q&A ===
def query_pdf(question):
    """Answer ``question`` from the chunks of the uploaded PDF stored in MongoDB.

    The question is embedded, every stored chunk embedding is scored against it
    by cosine similarity, and the three best chunks are given to the LLM as
    context. Whatever is returned (answer or warning) is also saved in
    ``pdf_state["last_pdf_msg"]``.

    Args:
        question: The user's question about the uploaded PDF.

    Returns:
        The text following the last ``"Answer:"`` marker in the LLM output, or
        a warning string when the question is blank or nothing can be searched.
    """
    # Blank input: do not spend an embedding and an LLM call on an empty question.
    if not question or not question.strip():
        pdf_state["last_pdf_msg"] = "⚠️ Please enter a question."
        return pdf_state["last_pdf_msg"]

    doc = temp_coll.find_one({"source": "user_uploaded"})
    if not doc or "rag_chunks" not in doc:
        pdf_state["last_pdf_msg"] = "⚠️ No PDF chunks found."
        return pdf_state["last_pdf_msg"]

    # 1) Embed the question
    qv = embed_model.encode(question, convert_to_tensor=True)

    # 2) Score each chunk
    scored = []
    for c in doc["rag_chunks"]:
        if "embedding" in c:
            score = util.cos_sim(qv, torch.tensor(c["embedding"]).to(qv.device))[0][0].item()
            scored.append((score, c["chunk_text"]))

    if not scored:
        pdf_state["last_pdf_msg"] = "⚠️ No embeddings to compare."
        return pdf_state["last_pdf_msg"]

    # 3) Pick top-3 chunks
    top = sorted(scored, key=lambda x: x[0], reverse=True)[:3]
    context = "\n---\n".join([t[1] for t in top])

    # 4) New prompt: instruct model NOT to repeat context
    prompt = f"""You are a knowledgeable assistant. Use the following context to answer the question **briefly**.
**Do not** include the context in your answer—only output the answer itself.

Context:
{context}

Question: {question}

Answer:"""

    # 5) Invoke LLM and extract answer
    # Keep only the text after the last "Answer:" marker in the output.
    full = llm.invoke(prompt)
    answer = full.split("Answer:")[-1].strip()

    # 6) Save and return
    pdf_state["last_pdf_msg"] = answer
    return answer


def translate_pdf_response():
    """Return the Telugu translation of the latest PDF answer, or a warning if there is none yet."""
    last_answer = pdf_state["last_pdf_msg"]
    if not last_answer:
        return "⚠️ Nothing to translate."
    return translate_to_telugu(last_answer)

# === Gradio UI ===
# Components are created in the order below inside one Blocks container:
# the chat interface first, then the PDF question-answering widgets.
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 MSME Scheme Assistant")
    # chatbot() handles every chat message.
    gr.ChatInterface(fn=chatbot, title="💬 MSME Chatbot")

    # Button that translates the most recent chatbot reply into Telugu.
    tbtn=gr.Button("🌐 Translate Last Scheme Reply")
    tout=gr.Textbox(label="🗣️ Telugu Translation",lines=3)
    tbtn.click(fn=translate_last_response, outputs=tout)

    gr.Markdown("## 📄 Chat with Uploaded PDF")
    # The PDF widgets are created hidden (visible=False); the
    # "Enable PDF Chat" button reveals them.
    pdf_btn=gr.Button("📄 Enable PDF Chat")
    pdf_input=gr.Textbox(label="Ask PDF question",visible=False)
    pdf_ask=gr.Button("Ask",visible=False)
    pdf_out=gr.Textbox(label="📜 PDF Answer",lines=6,visible=False)
    pdf_trans_btn=gr.Button("🌐 Translate PDF Answer",visible=False)
    pdf_trans_out=gr.Textbox(label="🗣️ Telugu PDF Translation",lines=3,visible=False)

    def show_pdf_ui():
        """Return one visibility update for each of the five hidden PDF components."""
        return [gr.update(visible=True)]*5

    # The outputs list must contain exactly the five components updated by show_pdf_ui.
    pdf_btn.click(fn=show_pdf_ui,
                  outputs=[pdf_input,pdf_ask,pdf_out,pdf_trans_btn,pdf_trans_out])
    pdf_ask.click(fn=query_pdf,inputs=pdf_input,outputs=pdf_out)
    pdf_trans_btn.click(fn=translate_pdf_response,outputs=pdf_trans_out)

# debug=True: per the Colab notice in the cell output, the cell keeps running so errors and logs stay visible.
demo.launch(debug=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1a0e9f8bfd6c5b2e36.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
