In [None]:
# =============================
# 🧠 1. Install dependencies
# =============================
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt
# groq is installed in a separate step; presumably it is not listed in
# requirements.txt -- TODO confirm, and consider moving it there with a pinned version.
%pip install groq


In [None]:
# =============================
# ⚙️ 2. Import libraries
# =============================
import os
from tqdm import tqdm
import torch
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_core.embeddings import Embeddings
from qdrant_client import QdrantClient

from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from sentence_transformers import SentenceTransformer

# Report whether PyTorch can see a CUDA device and, if it can, name it.
_cuda_visible = torch.cuda.is_available()
print("CUDA available:", _cuda_visible)
if _cuda_visible:
    # Device index 0 is queried, matching the single-GPU setup this notebook targets.
    print("GPU name:", torch.cuda.get_device_name(0))

In [None]:
# =============================
# 🔐 3. Configure credentials
# =============================
# Kaggle does not keep .env files, so placeholder values are supplied here.
# A value that is ALREADY present in the environment (exported in the shell,
# injected by the platform, loaded from a local .env file, ...) takes
# precedence and is never overwritten by a placeholder.
# 👉 Replace the placeholders below with your actual credentials if you do not
#    provide them through the environment.
PLACEHOLDER_CREDENTIALS = {
    "QDRANT_URL": "https://your-instance.qdrant.tech",
    "QDRANT_API_KEY": "your_qdrant_api_key",
    "GROQ_API_KEY": "your_groq_api_key",
}

for _cred_name, _cred_placeholder in PLACEHOLDER_CREDENTIALS.items():
    # setdefault only writes when the variable is missing, so real secrets
    # that were configured outside the notebook survive a re-run of this cell.
    os.environ.setdefault(_cred_name, _cred_placeholder)

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Catches variables that exist but are empty.
assert QDRANT_URL and QDRANT_API_KEY and GROQ_API_KEY, "‚ùå Missing credentials!"

# The placeholders are non-empty and therefore pass the assert above; flag
# them here so a forgotten edit is noticed before the upload/LLM cells run.
_still_placeholder = [
    _cred_name
    for _cred_name, _cred_placeholder in PLACEHOLDER_CREDENTIALS.items()
    if os.environ[_cred_name] == _cred_placeholder
]
if _still_placeholder:
    print("WARNING: placeholder credentials still in use for:", ", ".join(_still_placeholder))

In [None]:
# =============================
# 📂 4. Load your PDFs
# =============================
pdf_folder = "/kaggle/input/pdf-argo"  # <-- change to your dataset path
all_docs = []

# os.listdir returns entries in arbitrary order, so the listing is sorted to
# load documents in the same order on every run. The extension is compared
# case-insensitively so files such as "REPORT.PDF" are not silently skipped.
pdf_file_names = sorted(
    name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
)

for file_name in pdf_file_names:
    loader = PyPDFLoader(os.path.join(pdf_folder, file_name))
    docs = loader.load()
    for d in docs:
        # Record only the file name (not the full path) as the source.
        d.metadata["source"] = file_name
    all_docs.extend(docs)

# Each loaded document is counted as one page in the summary below.
print(f"‚úÖ Loaded {len(all_docs)} pages from {pdf_folder}")

In [None]:
# =============================
# ✂️ 5. Split into text chunks
# =============================
# Chunks of size 1000 that overlap their neighbour by 200
# (sizes are measured by the splitter's length function).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# Split every loaded document; the result feeds the embedding/upload step.
chunks = splitter.split_documents(all_docs)

print(f"‚úÖ Created {len(chunks)} text chunks.")


In [None]:
# =============================
# ⚡ 6. Setup GPU Embedding Model (BGE-M3)
# =============================
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the BAAI/bge-m3 sentence-embedding model onto the selected device.
embedder = SentenceTransformer("BAAI/bge-m3", device=device)

print("‚úÖ Embedding dimension:", embedder.get_sentence_embedding_dimension())

# LangChain wrapper
class LangChainBGE(Embeddings):
    """Adapter exposing a sentence-embedding model via LangChain's ``Embeddings``.

    Every public method forwards to ``model.encode`` and converts the returned
    array to plain Python lists with ``.tolist()``.
    """

    def __init__(self, model):
        """Keep a reference to the wrapped model (must provide ``encode``)."""
        self.model = model

    def _encode_batch(self, texts):
        """Encode ``texts`` with the progress bar disabled and return lists."""
        vectors = self.model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
        )
        return vectors.tolist()

    def __call__(self, texts):
        """Allow the wrapper to be used as a plain embedding callable."""
        return self._encode_batch(texts)

    def embed_documents(self, texts):
        """Embed a collection of texts."""
        return self._encode_batch(texts)

    def embed_query(self, text):
        """Embed a single query text."""
        vector = self.model.encode(text, convert_to_numpy=True)
        return vector.tolist()


embedding_model = LangChainBGE(embedder)

In [None]:
# =============================
# 💾 7. Upload to Qdrant Cloud
# =============================
# Generous request timeout (seconds) so large uploads do not time out.
UPLOAD_TIMEOUT_S = 43200.0

# This client is reused by the retriever cell further down.
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=UPLOAD_TIMEOUT_S,
)

collection_name = "argo_papers_real"

print(f"üì¶ Uploading {len(chunks)} chunks to Qdrant Cloud collection '{collection_name}'...")
batch_size = 500
vector_store = None
for i in tqdm(range(0, len(chunks), batch_size)):
    batch = chunks[i:i+batch_size]
    if vector_store is None:
        # First batch: from_documents connects using url/api_key (it does NOT
        # use `client` above), so the timeout has to be passed explicitly or
        # the library default would apply. It also sets up the collection.
        vector_store = Qdrant.from_documents(
            documents=batch,
            embedding=embedding_model,
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            collection_name=collection_name,
            timeout=UPLOAD_TIMEOUT_S,
        )
    else:
        # Later batches reuse the same store/connection instead of building a
        # new client (and re-checking the collection) for every batch.
        vector_store.add_documents(batch)

if vector_store is None:
    # No batches were produced, so do not claim a successful upload.
    print("No chunks to upload; nothing was sent to Qdrant Cloud.")
else:
    print("‚úÖ All chunks uploaded successfully to Qdrant Cloud.")

In [None]:
# =============================
# 🧠 8. Build Retriever + LLM (LCEL)
# =============================
# Groq-hosted chat model: low temperature, replies capped at 512 tokens.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",  # current supported model
    groq_api_key=GROQ_API_KEY,
    max_tokens=512,
    temperature=0.1,
)

# Open the uploaded collection as a vector store (queries are embedded with
# the same embedding model), then expose it as a retriever with k=5.
_vector_index = Qdrant(
    embeddings=embedding_model,
    collection_name=collection_name,
    client=client,
)
retriever = _vector_index.as_retriever(search_kwargs={"k": 5})

# Prompt text; the {context} and {question} slots are filled in by the chain.
_RAG_PROMPT_TEMPLATE = """
You are a scientific assistant. Use the retrieved context to answer the question precisely and concisely.

Context:
{context}

Question:
{question}
                                          
Answer:
"""
prompt = ChatPromptTemplate.from_template(_RAG_PROMPT_TEMPLATE)


def _join_page_contents(docs):
    """Concatenate the page_content of each document, separated by a blank line."""
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL pipeline: the incoming question is passed through unchanged and is also
# sent to the retriever, whose documents are flattened into the context string.
rag_chain = (
    {
        "context": retriever | _join_page_contents,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# =============================
# 💬 9. Ask a question
# =============================
question = "How have Argo floats improved deep ocean observation?"

# The chain takes the raw question string and returns the answer text.
answer = rag_chain.invoke(question)

# Show the question and its answer, one labelled line each.
for _label, _text in (("Q:", question), ("A:", answer)):
    print(_label, _text)