In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
import duckdb
import pandas as pd
import tqdm as notebook_tqdm
import os,sys


# Notebook to build a RAG pipeline for UTMB data
The idea is to build an Agent system that can answer questions about UTMB races. For this we will use an hybrid approach:
1. If the questions is answerable from the db then we will use an LLM to generate a query and query the db. 
    example of question: "What race offers a 50km run in June?" --> Easily queriable. 
2. If it is a more vague question, we will use a RAG approach. 
    example of question: "What is the most challenging race? " --> Not easily queriable.

The agent will then decide which approach to use based on the question. A potential 3rd approach could be to use a combination of both, query the db to filter the resutls and then use the RAG approach to answer the question.
    example of question: "What is the most challenging 50km race that I can do over the summer?" --> This could be answered by first querying the db to get all the 50k runs in summer and then using the RAG approach to answer the question.

In [58]:
model_name = 'intfloat/e5-small-v2'
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"})
db = duckdb.connect("../data_test/utmb_db.duckdb")

In [59]:
db.sql("select id from UTMB")

┌────────────┐
│     id     │
│   int64    │
├────────────┤
│          1 │
│          2 │
│          3 │
│          4 │
│          5 │
│          6 │
│          7 │
│          8 │
│          9 │
│         10 │
│          · │
│          · │
│          · │
│         40 │
│         41 │
│         42 │
│         43 │
│         44 │
│         45 │
│         46 │
│         47 │
│         48 │
│         49 │
├────────────┤
│  49 rows   │
│ (20 shown) │
└────────────┘

In [60]:
db.sql("DESCRIBE all tables")

┌──────────┬─────────┬─────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [61]:
data = db.sql("SELECT * FROM UTMB").df()

In [62]:
data.head()

Unnamed: 0,id,name,date_confirmed,country,city,image,link,distance_5,distance_10,distance_15,...,start_day,end_day,month,year,duration,latitude,longitude,description,embeddings,metadata
0,1,Nice Côte d'Azur,True,France,Nice(06),https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/nice-cote-d...,False,False,False,...,26.0,28.0,9.0,2025.0,2.0,43.700936,7.268391,passage: Nice Côte d'Azur takes place in Nice(...,"[-0.06962153, 0.042183343, 0.055064175, 0.0448...","{ ""name"": ""Nice Côte d'Azur"" }"
1,2,Kaçkar,True,Republic of Türkiye,Ayder,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/kackar-by-u...,False,False,False,...,27.0,27.0,9.0,2025.0,1.0,40.95252,41.102051,"passage: Kaçkar takes place in Ayder, Republic...","[-0.04515943, 0.010894476, 0.03974204, 0.05890...","{ ""name"": ""Kaçkar"" }"
2,3,KAT100 Austria,True,Austria,Kitzbühel,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/kat100-utmb,False,True,False,...,7.0,9.0,8.0,2025.0,2.0,47.446359,12.391147,passage: KAT100 Austria takes place in Kitzbüh...,"[-0.052351978, 0.03653495, 0.035622302, 0.0325...","{ ""name"": ""KAT100 Austria"" }"
3,4,UTMB®,True,France,Chamonix(74),https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/utmb,False,False,True,...,25.0,29.0,8.0,2025.0,4.0,33.521298,-117.701083,"passage: UTMB® takes place in Chamonix(74), Fr...","[-0.041295115, 0.037600648, 0.04706601, 0.0074...","{ ""name"": ""UTMB®"" }"
4,5,Julian Alps Trail Run,False,Slovenia,Kranjska Gora,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/julian-alps...,False,True,True,...,,,9.0,2025.0,,46.485132,13.784396,passage: Julian Alps Trail Run takes place in ...,"[-0.079987764, 0.03809658, -0.018317556, 0.074...","{ ""name"": ""Julian Alps Trail Run"" }"


In [63]:
data.columns

Index(['id', 'name', 'date_confirmed', 'country', 'city', 'image', 'link',
       'distance_5', 'distance_10', 'distance_15', 'distance_20',
       'distance_25', 'distance_30', 'distance_35', 'distance_40',
       'distance_45', 'distance_50', 'distance_55', 'distance_60',
       'distance_65', 'distance_70', 'distance_75', 'distance_80',
       'distance_85', 'distance_90', 'distance_95', 'distance_100',
       'distance_105', 'distance_110', 'distance_115', 'distance_120',
       'distance_125', 'distance_130', 'distance_135', 'distance_140',
       'distance_145', 'distance_150', 'distance_155', 'distance_160',
       'distance_165', 'distance_170', 'distance_175', 'distance_250',
       'distance_300', 'style_Altitude', 'style_Asateam', 'style_Autonomy',
       'style_Capitals/largecities', 'style_Cascade', 'style_Castles',
       'style_Children&JuniorsCourse', 'style_CitytoCity', 'style_Cliffs',
       'style_Cross-border', 'style_Culture&History', 'style_Duo',
       'style_For

## RAG Pipeline 

Our documents are emebedded and stored in a DuckDB database. We will use the `langchain_community.vectorstores.duckdb` to query the embeddings.
The embeddings were generated using `intfloat/e5-small-v2`. So we will keep this. A specificity of that model is that "passage" and "query" have to be added to the text we are embedding. It has been done for the documents, but not for the queries. So we will add it to the queries before embedding them.

In [2]:
model_name = 'intfloat/e5-large-v2'
embeddings_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"})
vector_store = LangChainDuckDB(connection=db, table_name="UTMB", embedding=embeddings_model,vector_key='embeddings')

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'LangChainDuckDB' is not defined

In [65]:
question = "query: a scenic but tough race in switzerland"

embedded = embeddings_model.embed_query(question)


  return forward_call(*args, **kwargs)


In [66]:
a = db.execute("""SELECT *,array_distance(embeddings,cast(? AS FLOAT[384])) AS distance  FROM UTMB ORDER BY distance DESC LIMIT 5""",parameters=[embedded]).fetchdf()

In [67]:
a

Unnamed: 0,id,name,date_confirmed,country,city,image,link,distance_5,distance_10,distance_15,...,end_day,month,year,duration,latitude,longitude,description,embeddings,metadata,distance
0,26,Trail Running Festival Desert Rats,False,United States,Fruita(CO),https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/trail-runni...,False,True,False,...,,4.0,2026.0,,39.15887,-108.728988,passage: Trail Running Festival Desert Rats ta...,"[-0.017592195, -0.009952028, 0.07496054, 0.043...","{ ""name"": ""Trail Running Festival Desert Rats"" }",0.71048
1,31,Quindío Trail Colombia,False,Colombia,Buena Vista,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/quindio-tra...,False,False,True,...,,5.0,2026.0,,42.744581,-95.112752,passage: Quindío Trail Colombia takes place in...,"[-0.097181804, 0.011473633, 0.06027981, -0.001...","{ ""name"": ""Quindío Trail Colombia"" }",0.710415
2,49,Quito Trail Ecuador,False,Ecuador,Quito,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/quito-trail...,True,False,True,...,,8.0,2026.0,,-0.220164,-78.512327,passage: Quito Trail Ecuador takes place in Qu...,"[-0.06586352, 0.018734813, 0.058759533, 0.0160...","{ ""name"": ""Quito Trail Ecuador"" }",0.706034
3,6,Paraty Brazil,False,Brazil,Paraty,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/paraty-braz...,False,False,False,...,,9.0,2025.0,,-23.141541,-44.76108,"passage: Paraty Brazil takes place in Paraty, ...","[-0.07758545, 0.054810338, 0.058911953, 0.0320...","{ ""name"": ""Paraty Brazil"" }",0.705857
4,20,Hoka Chiang Mai Thaïland,False,Thailand,Chiang Mai,https://res.cloudinary.com/kavval/image/upload...,https://www.finishers.com/en/event/hoka-chiang...,False,True,False,...,,12.0,2025.0,,18.788278,98.98588,passage: Hoka Chiang Mai Thaïland takes place ...,"[-0.048644, 0.035381272, 0.034256756, 0.050821...","{ ""name"": ""Hoka Chiang Mai Thaïland"" }",0.704122


In [76]:
db.sql(f"""select * from UTMB where id == 5 """)


┌───────┬───────────────────────┬────────────────┬──────────┬───────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────────────────────────────┬────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬────────────────┬───────────────┬────────────────┬────────────────────────────┬───────────────┬───────────────┬──────────────────────────────┬───────────

In [55]:
db.close()

## START with a text only RAG approach, we are embedding here so that we can try multiple models.

In [2]:
from sentence_transformers import SentenceTransformer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.mps.empty_cache()

In [4]:
qwen_3_emb =HuggingFaceEmbeddings(model="Qwen/Qwen3-Embedding-0.6B", encode_kwargs={'batch_size': 8},show_progress=True)

In [5]:
text_df = pd.read_csv("../data_test/utmb_text.csv")

In [6]:
text_df['embeddings'] = qwen_3_emb.embed_documents(text_df['description'].to_list())

Batches: 100%|██████████| 4/4 [00:08<00:00,  2.03s/it]


In [7]:
text_df.head()

Unnamed: 0,id,name,distance,description,embeddings
0,1,Nice Côte d'Azur,20,On the trails along the Nice coastline\nEmbark...,"[0.04840404540300369, 0.05893983691930771, -0...."
1,2,Nice Côte d'Azur,55,An Azure epic\nStarting at the mythical Col d'...,"[0.06165490671992302, 0.06873076409101486, -0...."
2,3,Nice Côte d'Azur,105,The discovery of Nice County\nFrom the mountai...,"[-0.04237787425518036, 0.04705025255680084, -0..."
3,4,Nice Côte d'Azur,160,From the Mercantour to the French Riviera\nAn ...,"[-0.03055475279688835, 0.06577242165803909, -0..."
4,5,Kaçkar,20,KAÇKAR 20K\nYou’ll find yourself running throu...,"[0.03057355247437954, 0.0002405015693511814, -..."


In [8]:
query = "What race is along the sea?"

In [35]:
query_emb = qwen_3_emb.embed_query(query)

In [9]:
from langchain_chroma import Chroma

In [13]:
vector_store = Chroma(
    collection_name="UTMB_text",
    embedding_function=qwen_3_emb,
    persist_directory="../data_test/chroma_utmb_db", 
)

In [24]:
from langchain_community.document_loaders.csv_loader import CSVLoader


loader = CSVLoader(file_path='../data_test/utmb_text.csv')
data = loader.load()

In [25]:
data

[Document(metadata={'source': '../data_test/utmb_text.csv', 'row': 0}, page_content="id: 1\nname: Nice Côte d'Azur\ndistance: 20\ndescription: On the trails along the Nice coastline\nEmbark on an adventure of 22km with 700m elevation gain through a course between land and sea, starting from the port of Saint-Jean-Cap-Ferrat, along the trails of the French Riviera.\n\nSet off on a nearly complete tour of the peninsula, starting from the port of Saint-Jean-Cap-Ferrat, before climbing towards the Chapelle St Grat. From there, head towards the Plateau St Michel, offering a breathtaking view towards Cap Ferrat and then to Col 4 Chemins. After a beautiful passage along the Mediterranean trails, you'll descend back towards the coast. A final short climb up the Château hill will reward you with a view of the Bay of Angels before you cross the finish line on the famous Promenade des Anglais.\n\nThis semi-urban course features 50% of its route along the seaside, promising you an extraordinary ex

In [17]:
_ = vector_store.add_documents(documents=data)

Batches: 100%|██████████| 4/4 [00:33<00:00,  8.41s/it]


In [18]:
a = vector_store.similarity_search(query, k=5)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]


In [19]:
a

[Document(id='9ca98526-8145-4cd6-9c93-725cfc5d6702', metadata={'source': '../data_test/utmb_text.csv', 'row': 3}, page_content="id: 4\nname: Nice Côte d'Azur\ndistance: 160\ndescription: From the Mercantour to the French Riviera\nAn iconic distance, the 100-mile race starts from Auron and quickly reaches the highest point of the course at more than 2,650 meters above sea level: the Rabuons and its unique landscapes along the Chemin de l’Energie. After winding through chestnut forests and following the meandering Tinée river, you will walk along the wonders of the Mercantour massif and then cross beautiful ridges at the halfway point at more than 2,000m above sea level. Then, head south through more Mediterranean landscapes. A last effort to cross the 4 mounts: Mont Leuze, Mont Vinaigrier, Mont Alban and Mont Boron, and at the bend of a steep staircase, you will find yourself by the sea, a magical landscape so different from anything you have experienced. The final stretch along the coa

In [26]:
vector_store.similarity_search_with_score("what race offers the longest distance ?", k=5)

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.16s/it]


[(Document(id='8235d36f-d511-4119-bdbd-4a4d74d702b9', metadata={'row': 17, 'source': '../data_test/utmb_text.csv'}, page_content='id: 18\nname: UTMB®\ndistance: 175\ndescription: THE ULTIMATE REFERENCE IN TRAIL RUNNING\nCreated in 2003, the UTMB is "the most mythical and prestigious trail running race in the world" with approximately 170 kilometers and 10,000 meters of positive elevation gain around the Mont-Blanc through Italy, Switzerland and France. A race like no other, it transcends the sport, and has established itself as an iconic trail race, renowned worldwide.\n\nNo matter the time it takes to finish, whether done in under 20 hours or more than 46, the elite runners as well as amateur runners share the same adventure on the same exceptional course, carried by the stunning beauty of the landscapes and a common goal: to cross the finish line in Chamonix. Much more than a race, it is an introspective adventure which transforms every person that takes on the challenge.'),
  0.9889

In [29]:
! pip install -qU "langchain[google-genai]"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [53]:

from ast import List
from langchain_google_genai import ChatGoogleGenerativeAI
import getpass
import os
from dotenv import load_dotenv
load_dotenv()


llm = ChatGoogleGenerativeAI(model='gemma-3n-e2b-it',api_key=os.environ.get("GOOGLE_GENAI_API_KEY"))


In [55]:
llm.invoke("Write me a Hayku about LangChain")

AIMessage(content='Chains of thought ignite,\nLLMs now work as a team,\nKnowledge flows so free. ', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemma-3n-e2b-it', 'safety_ratings': []}, id='run--5a0a9e09-5f85-4197-bfcf-f8038c05d5e7-0', usage_metadata={'input_tokens': 9, 'output_tokens': 21, 'total_tokens': 30, 'input_token_details': {'cache_read': 0}})

In [68]:
from langchain_core.prompts import ChatPromptTemplate


In [69]:
prompt= ChatPromptTemplate.from_messages(
            [
                
                ("human",""" You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
                Question: {question} 
                Context: {context} 
                Answer:"""),
            ]
        )

In [70]:
from typing import TypedDict, List
from langchain_core.documents import Document
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [71]:
# Define application steps
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

In [72]:
def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [73]:
from langgraph.graph import START, StateGraph

In [74]:
graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
response = graph.invoke({"question": "What is race is close to the Sea? What distances does it offer?"})
print(response["answer"])

Batches: 100%|██████████| 1/1 [00:08<00:00,  8.80s/it]


Race is close to the Sea is 160km. The Nice Côte d'Azur race offers a 100-mile race that ends on the Promenade des Anglais. Another race, the "Azure epic", finishes at the foot of Mont Leuze, with a view overlooking the harbour of Villefranche/mer and Cap Ferrat.


In [78]:
response

{'question': 'What is race is close to the Sea? What distances does it offer?',
 'context': [Document(id='9ca98526-8145-4cd6-9c93-725cfc5d6702', metadata={'row': 3, 'source': '../data_test/utmb_text.csv'}, page_content="id: 4\nname: Nice Côte d'Azur\ndistance: 160\ndescription: From the Mercantour to the French Riviera\nAn iconic distance, the 100-mile race starts from Auron and quickly reaches the highest point of the course at more than 2,650 meters above sea level: the Rabuons and its unique landscapes along the Chemin de l’Energie. After winding through chestnut forests and following the meandering Tinée river, you will walk along the wonders of the Mercantour massif and then cross beautiful ridges at the halfway point at more than 2,000m above sea level. Then, head south through more Mediterranean landscapes. A last effort to cross the 4 mounts: Mont Leuze, Mont Vinaigrier, Mont Alban and Mont Boron, and at the bend of a steep staircase, you will find yourself by the sea, a magica