In [2]:
%pip install -r requirements.txt
%pip install groq

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

pdf_folder = "./data/pdfs"

all_documents = []
for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(pdf_folder, file))
        documents = loader.load()
        all_documents.extend(documents)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
texts = text_splitter.split_documents(all_documents)

print(f"Loaded {len(all_documents)} documents.")
print(f"Split into {len(texts)} chunks.")


Loaded 58 documents.
Split into 292 chunks.


In [4]:
from pypdf import PdfReader

def extract_metadata(pdf_path):
    reader = PdfReader(pdf_path)
    info = reader.metadata or {}
    metadata = {
        "title": info.get("/Title", os.path.basename(pdf_path)),
        "authors": info.get("/Author", "Unknown"),
        "year": info.get("/CreationDate", "Unknown")[2:6] if info.get("/CreationDate") else "Unknown",
        "journal": info.get("/Subject", "Unknown"),
        "DOI": info.get("/Keywords", "N/A"),
    }
    return metadata

metadata_list = []
for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        meta = extract_metadata(os.path.join(pdf_folder, file))
        metadata_list.append(meta)

print(f"Extracted metadata for {len(metadata_list)} PDFs.")
print(metadata_list[:2])


Extracted metadata for 4 PDFs.
[{'title': 'Anthropogenic carbon pathways towards the North Atlantic interior revealed by Argo-O2, neural networks and back-calculations', 'authors': 'Rémy Asselot', 'year': '2024', 'journal': 'Nature Communications, doi:10.1038/s41467-024-46074-5', 'DOI': ''}, {'title': 'Impact_of_ocean_in-situ_observations_on_ECMWF_sub-seasonal_forecasts.pdf', 'authors': 'Unknown', 'year': 'Unknown', 'journal': 'Unknown', 'DOI': 'ocean in-situ observational impact, Argo observations, ocean observing system experiment, sub-seasonal forecasts, coupled prediction, ocean reanalysis, ocean initialization, observing system impact'}]


In [5]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from dotenv import load_dotenv
from langchain_core.embeddings import Embeddings

# Load environment variables
load_dotenv()

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

if not QDRANT_URL or not QDRANT_API_KEY:
    raise ValueError("❌ Please set QDRANT_URL and QDRANT_API_KEY in your .env file.")

# Initialize Qdrant Cloud client
qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Load BGE-M3 multilingual embedding model
model_name = "BAAI/bge-m3"
embedder = SentenceTransformer(model_name)
print("Embedding dimension:", embedder.get_sentence_embedding_dimension())

class LangChainBGE(Embeddings):
    def __init__(self, model):
        self.model = model

    def __call__(self, texts):
        """Allows Qdrant to call this directly."""
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_documents(self, texts):
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        return self.model.encode(text, convert_to_numpy=True).tolist()

embedding_model = LangChainBGE(embedder)

# Create vector store in Qdrant Cloud
vector_store = Qdrant.from_documents(
    documents=texts,
    embedding=embedding_model,
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    collection_name="argo_papers",
    timeout=600.0
)
test_texts = ["Ocean currents are vital for climate regulation."]
print(embedding_model(test_texts)[:1])  # should print embedding vector
print(embedding_model.embed_query("test"))  # should also work
print("✅ Vector store created with multilingual embeddings on Qdrant Cloud.")


Embedding dimension: 1024


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
Batches: 100%|██████████| 2/2 [01:25<00:00, 42.96s/it]
Batches: 100%|██████████| 2/2 [01:03<00:00, 31.75s/it]
Batches: 100%|██████████| 2/2 [01:24<00:00, 42.06s/it]
Batches: 100%|██████████| 2/2 [01:03<00:00, 31.85s/it]
Batches: 100%|██████████| 2/2 [00:28<00:00, 14.04s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.61it/s]


[[-0.02155338227748871, 0.0012976379366591573, -0.053410258144140244, -0.011415092274546623, -0.03711840882897377, -0.0571223720908165, -0.014707711525261402, -0.034606385976076126, 0.0027110001537948847, 0.018590224906802177, 0.008445465005934238, -0.045655641704797745, -0.0142004219815135, -0.01098041981458664, -0.037509702146053314, -0.02951253205537796, 0.05812929570674896, 0.04718567430973053, 0.001218492747284472, -0.0447726771235466, -0.02696315012872219, -0.03046538680791855, -0.03156543895602226, -0.0059911641292274, -0.003996940795332193, 0.041935887187719345, 0.009384426288306713, -0.003861492034047842, 0.01640026830136776, -0.023822389543056488, 0.010142583400011063, 0.07377247512340546, -0.0052836802788078785, 0.008657789789140224, -0.026846138760447502, 0.01041070744395256, -0.008035484701395035, -0.021378537639975548, -0.038209185004234314, 0.004749163519591093, -0.04990075156092644, -0.06725063174962997, 0.018013780936598778, -0.013289708644151688, 0.001200728234834969,

In [6]:
query = "Recent developments in Argo profiling floats for ocean temperature measurement"
results = vector_store.similarity_search(query, k=5)

for idx, doc in enumerate(results, 1):
    print(f"\nResult {idx}:")
    print(doc.metadata)
    print(doc.page_content[:400], "...")



Result 1:
{'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'keywords': 'ocean in-situ observational impact, Argo observations, ocean observing system experiment, sub-seasonal forecasts, coupled prediction, ocean reanalysis, ocean initialization, observing system impact', 'source': './data/pdfs\\Impact_of_ocean_in-situ_observations_on_ECMWF_sub-seasonal_forecasts.pdf', 'total_pages': 9, 'page': 7, 'page_label': '8', '_id': '4bcd0f8d-87da-4a5c-abdc-1916571cd6c7', '_collection_name': 'argo_papers'}
15, e2022MS003044. doi:10.1029/2022MS003044
Fujii, Y., Ré m y ,E . ,Z u o ,H . ,O k e ,P . ,H a l l i w e l l ,G . ,G a s p a r i n ,F . ,e ta l .( 2 0 1 9 ) .
Observing system evaluation based on ocean data assimilation and prediction
systems: on-going challenges and a future vision for designing and supporting
ocean observational networks. Front. Mar. Sci. 6. doi: 10.3389/fmars.
2019.00417
Goul ...

Result 2:
{'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'keywor

In [11]:
from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# ✅ Check for Groq API key
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("❌ GROQ_API_KEY not found in .env file")

# ✅ Initialize Groq LLM
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model="llama-3.3-70b-versatile",   # or "mixtral-8x7b-32768"
    temperature=0.1,
    max_tokens=512
)

# ✅ Build Retriever (assuming you already have a vector store)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Prompt template for retrieval-augmented generation
prompt = ChatPromptTemplate.from_template("""
You are an expert research assistant. Use the following retrieved context to answer the question accurately and explain in depth.

Context:
{context}

Question:
{question}

Answer:
""")

# ✅ Construct the LCEL chain (Retriever → Prompt → LLM → Parser)
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever | (lambda docs: "\n\n".join(d.page_content for d in docs)),
     "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# ✅ Run the chain
question = "What are argo floats?"
answer = rag_chain.invoke(question)

print("Q:", question)
print("A:", answer)



Q: What are argo floats?
A: Argo floats are autonomous underwater vehicles that measure various oceanographic parameters, such as temperature, salinity, and other biogeochemical properties. They are equipped with sensors that collect data as they drift through the ocean, typically between the surface and a maximum depth of 2000 decibars (dbar). The data collected by Argo floats is used to improve our understanding of ocean dynamics, biogeochemical cycles, and climate change.

There are different types of Argo floats, including:

1. **Biogeochemical-Argo (BGC-Argo) floats**: These floats are equipped with additional sensors to measure biogeochemical parameters such as pH, nitrate, and oxygen levels.
2. **Argo-O2 floats**: These floats are equipped with oxygen sensors to measure dissolved oxygen levels in the ocean.

Argo floats typically operate in the following way:

1. They descend to a maximum depth, collecting data as they go.
2. They remain at the maximum depth for a period of time