In [None]:
import os

from pymongo.mongo_client import MongoClient

# Connection string for the MongoDB cluster. Supply the real URI through the
# MONGODB_URI environment variable so credentials never live in the notebook
# (the same way the Pinecone and Google keys are read); the placeholder
# literal below is used only when that variable is not set.
uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://<user>:<password>@espi.bebf5dd.mongodb.net/?retryWrites=true&w=majority&appName=Espi",
)

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort check: report the failure instead of aborting the cell.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [2]:
# Handles to the database and collection used by the rest of the notebook;
# build_context_from_matches looks documents up in `collection` by `_id`.
db=client["mytestdb"]
collection=db["mytestcollection"]

In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from collections import defaultdict
from bson.objectid import ObjectId
from langchain.schema import SystemMessage, HumanMessage, AIMessage
import re

In [None]:
# Read the Pinecone API key from the environment (KeyError if it is not set).
PINECONE_API_KEY=os.environ['PINECONE_API_KEY']
from pinecone import Pinecone

# Pinecone client and a handle to the "rag-chat-bot" index that get_result queries.
pc=Pinecone(api_key=PINECONE_API_KEY)
index=pc.Index("rag-chat-bot")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Google API key handed to the chat model below; read from the environment
# (KeyError if it is not set).
GOOGLE_API_KEY=os.environ['GOOGLE_API_KEY']

In [9]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model that get_result uses to encode queries.
# NOTE(review): presumably the same model that produced the vectors stored in
# the Pinecone index — confirm, since query and index embeddings must match.
embedding_model=SentenceTransformer('thenlper/gte-large')

In [None]:
# Chat model shared by query refinement, keyword extraction and answering.
llm=ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=GOOGLE_API_KEY)

In [7]:
# Per-user log of answered questions: ask_question appends one dict per call
# with "question", "answer" and "keywords" keys.
response_metadata: dict[str, list] = defaultdict(list)

In [None]:
def get_result(query, similar_result=5):
    """Embed ``query`` and fetch its nearest neighbours from the Pinecone index.

    Args:
        query: Text to encode with ``embedding_model``.
        similar_result: Number of matches to request (sent as ``top_k``).

    Returns:
        Whatever ``index.query`` returns; metadata is requested, raw vector
        values are not.
    """
    # The encoder's output is converted to a plain list before being sent.
    query_vector = embedding_model.encode(query).tolist()
    return index.query(
        vector=query_vector,
        top_k=similar_result,
        include_metadata=True,
        include_values=False,
    )

def build_context_from_matches(matches: list) -> str:
    """Assemble the prompt context from the documents behind a list of matches.

    Each match's ``id`` is converted to an ``ObjectId`` and looked up in the
    MongoDB ``collection``. Matches whose document cannot be found are skipped
    rather than failing on ``None``.

    Args:
        matches: Match records; each must support ``match["id"]``.

    Returns:
        One ``Content: ...\\nKeywords: ...`` section per document found, in
        match order and separated by blank lines; ``""`` when nothing is found.
    """
    sections = []
    for match in matches:
        doc = collection.find_one({"_id": ObjectId(match["id"])})
        if doc is None:
            # find_one yields None when no document has this _id; skip it so a
            # single stale id does not abort the whole answer.
            continue
        content = doc.get("content", "")
        # `or []` also covers a stored null; str() keeps join from failing on
        # non-string keyword entries.
        keywords = doc.get("keywords") or []
        keyword_text = ", ".join(map(str, keywords))
        sections.append(f"Content: {content}\nKeywords: {keyword_text}")
    return "\n\n".join(sections)

def refine_query_with_llm(history: list[HumanMessage|AIMessage], question: str) -> str:
    """Rewrite ``question`` as a standalone search query using recent chat history.

    Args:
        history: Earlier conversation messages; only the last four are shown
            to the model.
        question: The latest user question.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # HumanMessage entries are labelled "User", everything else "Assistant".
    recent_turns = [
        f"User: {msg.content}" if isinstance(msg, HumanMessage) else f"Assistant: {msg.content}"
        for msg in history[-4:]
    ]
    convo = "\n".join(recent_turns)
    # Adjacent string literals are concatenated verbatim, so every sentence
    # needs its own trailing space; without it the prompt read
    # "chat history,formulate" and "chat history.Do NOT".
    prompt_text = (
        "Given a chat history and the latest user question which might reference context in the chat history, "
        "formulate a standalone question which can be understood without the chat history. "
        "Do NOT answer the question, just reformulate it if needed and otherwise return it as is.\n\n"
        f"Chat History:\n{convo}\nUser: {question}\n\nSearch query:"
    )
    response = llm.invoke([HumanMessage(content=prompt_text)])
    return response.content.strip()

def extract_keywords(text: str) -> list[str]:
    """Derive keywords for an answer.

    If ``text`` contains markdown links (``[label](url)``), the link labels are
    returned and no model call is made. Otherwise the chat model is asked for a
    comma-separated keyword list, which is split and cleaned.

    Args:
        text: The answer text to extract keywords from.

    Returns:
        Link labels when present, else the non-empty, whitespace-stripped
        items of the model's comma-separated reply.
    """
    link_labels = re.findall(r"\[([^]]+)\]\([^)]*\)", text)
    if link_labels:
        return link_labels
    prompt = "Extract 3–5 keywords (comma-separated) from this answer:\n\n" + text
    kw_resp = llm.invoke([HumanMessage(content=prompt)]).content
    # Drop blanks: an empty reply, a trailing comma or ", ," would otherwise
    # produce "" entries in the keyword list.
    stripped = (part.strip() for part in kw_resp.split(","))
    return [kw for kw in stripped if kw]

In [None]:
# In-memory chat history per user id; ask_question appends the user's question
# and the model's reply after every turn. A missing user starts with [].
user_histories: dict[str, list[HumanMessage|AIMessage]]=defaultdict(list)

In [None]:
# System prompt that ask_question places first in every message list sent to
# the model. The triple-quoted text is sent as-is, including the leading
# indentation on each line.
SYSTEM=SystemMessage(
    content=(
        '''You are an assistant to answer questions about an AI Internship program.
           Basically your job is to make familiar the AI interns with the internship programs and solve their doubts regarding the internship schedule, training documents, programs, certifications and faq.
           Use the following pieces of retrieved context and the keywords with the given chat history to answer the question.
           Provide the context links wherever necessary from the context.
           If you don't know the answer, say that you don't know.
        '''
))

In [None]:
def ask_question(user_id: str, question: str) -> str:
    """Answer ``question`` for ``user_id`` from retrieved context plus chat history.

    The question is first rewritten into a standalone search query, the top
    five matches for that query are turned into a context string, and the
    model is asked with the system prompt, the user's prior history and the
    context-augmented question. The turn is then recorded in
    ``user_histories`` and ``response_metadata``.

    Args:
        user_id: Key identifying whose history and metadata to use and update.
        question: The user's question, as typed.

    Returns:
        The text content of the model's answer.
    """
    chat_log = user_histories[user_id]

    # Retrieval runs on the history-aware rewrite; the model itself still sees
    # the question in its original wording.
    search_query = refine_query_with_llm(chat_log, question)
    matches = get_result(search_query, similar_result=5)["matches"]
    context = build_context_from_matches(matches)

    prompt = HumanMessage(content=f"Context:\n{context}\n\nQuestion: {question}")
    ai_msg: AIMessage = llm.invoke([SYSTEM] + chat_log + [prompt])
    answer = ai_msg.content

    keywords = extract_keywords(answer)

    # History keeps the bare question, not the context-laden prompt.
    chat_log.extend([HumanMessage(content=question), ai_msg])
    response_metadata[user_id].append(
        {"question": question, "answer": answer, "keywords": keywords}
    )

    return answer


In [14]:
# Demo: first question asked under this user id in the notebook.
user="sumanta"
print(ask_question(user, "For how many months is the internship program planned?"))

The internship program is planned for 3 months.


In [15]:
# Follow-up for the same user id, so ask_question also sends the earlier turn.
# NOTE(review): "plannars" is presumably a typo for "planners"; left untouched
# because the string is sent to the model verbatim.
print(ask_question(user, "Can you provide me the detailed tabular weekly plannars for this internship program?"))

Certainly! Here are the detailed weekly planners based on the provided information.

Please note there are two types of weekly plans: general intern expectations and a bootcamp schedule. All interns are encouraged to attend mentor office hours, but not the group training sessions for PMs in the bootcamp.

### Weekly Intern Expectations (for Designers, Data Scientists & Engineers)

**Week 1:**
*   **All Interns:**
    *   Watch onboarding and learning videos based on your role (Designer or Engineer).
        *   [Engineers’ Training Playlist](https://www.youtube.com/playlist?list=PLr7J0CdhrYUvc-pfs9j6sdrjV6iYW-V2q)
        *   [Designers’ Training Playlist](https://www.youtube.com/playlist?list=PLr7J0CdhrYUts5UWKgndNWTbf7z54Errl)
    *   Make sure to attend Office Hours to speak with mentors to discuss your progress.
*   **Engineers:**
    *   Choose one of the two products provided in Discord: Discord Rag Chatbot or Job Tracker Agent(s).
    *   Join the Discord channel to discuss your