In [9]:
from langchain_huggingface import HuggingFaceEmbeddings
import duckdb
import pandas as pd
import tqdm as notebook_tqdm
import os,sys


In [2]:
def get_offered_X(row: pd.Series, prefix: str, prefix_sep: str = "_") -> str:
    cols_name = prefix + prefix_sep
    if prefix_sep in prefix:
        cols_name = prefix
    # Filter columns that start with 'X'
    distance_columns = [col for col in row.index if col.startswith(cols_name)]

    # Extract columns where the flag is True
    offered_distances = [
        col.replace(cols_name, "") for col in distance_columns if row[col]
    ]

    # Join the distances into a comma-separated string
    return ", ".join(offered_distances)

In [3]:
def load_data_to_db(data: pd.DataFrame) -> None:
    """
    load the dataFrame to a duckdb instance
    """
    emebeddings_size = len(data["embeddings"].iloc[0])
    print(f"Embedding size: {emebeddings_size}")
    conn = duckdb.connect("../data_test/utmb_db.duckdb")
    duck_tables = conn.sql("show all tables").df()
    if "UTMB" in duck_tables["name"].values:
        conn.sql("DROP TABLE UTMB")
    conn.sql("""
             INSTALL vss;
             LOAD vss;
             set hnsw_enable_experimental_persistence = true;
             CREATE TABLE UTMB AS SELECT * EXCLUDE (embeddings),
    CAST(embeddings AS FLOAT[384]) AS embeddings FROM data;""")
    conn.sql("""LOAD vss; CREATE INDEX cos_idx ON UTMB USING HNSW (embeddings)
                WITH (metric = 'cosine');""")
    return print("data successfully saved to duckDB")

In [10]:
model_name = 'intfloat/e5-small-v2'
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"})
db = duckdb.connect("../data_test/utmb_db.duckdb")

In [11]:
db.sql("show all tables").df()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,utmb_db,main,UTMB,"[name, date_confirmed, country, city, image, l...","[VARCHAR, BOOLEAN, VARCHAR, VARCHAR, VARCHAR, ...",False


In [12]:
data = db.sql("SELECT * FROM UTMB").df()

In [15]:
question = "Which race offers the longest distance run?"
embedded_question = embeddings.embed_query(question)


In [18]:
db.sql(f"""SELECT description from UTMB
ORDER BY array_distance(embeddings, cast({embedded_question} as FLOAT[384])) DESC
LIMIT 3;""")

┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                                                                                                                                                                       description                                                                                                                                                                                       │
│                                                                                                                                                                                         varchar                                               

In [99]:
db.sql("""SELECT * FROM UTMB limit 2""")

┌───────────────────┬────────────────┬─────────────────────┬──────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────┬────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬────────────────┬───────────────┬────────────────┬────────────────────────────┬───────────────┬───────────────┬────────────────────

In [100]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 84 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   name                          49 non-null     object 
 1   date_confirmed                49 non-null     bool   
 2   country                       49 non-null     object 
 3   city                          49 non-null     object 
 4   image                         49 non-null     object 
 5   link                          49 non-null     object 
 6   distance_5                    49 non-null     bool   
 7   distance_10                   49 non-null     bool   
 8   distance_15                   49 non-null     bool   
 9   distance_20                   49 non-null     bool   
 10  distance_25                   49 non-null     bool   
 11  distance_30                   49 non-null     bool   
 12  distance_35                   49 non-null     bool   
 13  distanc

In [101]:
data.name.iloc[0]

"Nice Côte d'Azur "

In [7]:
from torch import _chunk_cat


chunks: list = list(range(data.shape[0]))
print("Creating chunks of text data for RAG readiness...")
print("length:", data.shape[0])
for i in range(data.shape[0]):
    disciplines = get_offered_X(row=data.iloc[i], prefix='discipline')
    distances = get_offered_X(row=data.iloc[i], prefix='distance')
    styles = get_offered_X(row=data.iloc[i], prefix='style')
    print(data.iloc[i].start_day, data.iloc[i].date_confirmed, data.iloc[0].name)
    chunk = f"""{data.name.iloc[i]} takes place in {data.city.iloc[i]}, {data.country.iloc[i]} on {f'{int(data.start_day.iloc[i])}/' if data.date_confirmed.iloc[i] ==True else ''}{int(data.month.iloc[i])}/{int(data.year.iloc[i])}.
            The different distances offered are {distances} km, the disciplines are {disciplines} and the styles are {styles}.
            The event is {'multidays' if data.multidays.iloc[i] else 'single day'} and lasts {data.duration.iloc[i]} {'days' if data.multidays.iloc[i] else 'day'}."""
    print(chunk)
    chunks[i] = chunk


NameError: name 'data' is not defined

In [103]:
data["description"] = chunks
data['description']

0     Nice Côte d'Azur  takes place in Nice(06), Fra...
1     Kaçkar  takes place in Ayder, Republic of Türk...
2     KAT100 Austria  takes place in Kitzbühel, Aust...
3     UTMB® takes place in Chamonix(74), France on 2...
4     Julian Alps Trail Run  takes place in Kranjska...
5     Paraty Brazil  takes place in Paraty, Brazil o...
6      Grindstone Running Festival  takes place in M...
7     Wildstrubel by UTMB ® takes place in Crans-Mon...
8     Ultra Trail Whistler  takes place in Whistler,...
9     Chihuahua Mexico  takes place in Batopilas, Me...
10    Kodiak Ultra Marathons  takes place in Big Bea...
11    TransJeju  takes place in Jeju City, South Kor...
12    Ultra-Trail Ninghai  takes place in Ninghai, C...
13    Mallorca  takes place in Soller, Spain on 31/1...
14    Kullamannen  takes place in Båstad, Sweden on ...
15    Puerto Vallarta México  takes place in Puerto ...
16    TransLantau  takes place in Hong-Kong, Hong Ko...
17    Malaysia Ulra Trail  takes place in Taipin

In [104]:
print("Embedding the chunks of text data...")
embeddings_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2", model_kwargs={'device': 'mps'},
encode_kwargs={'batch_size': 8})
print(embeddings_model)
embeddings = embeddings_model.embed_documents(chunks)
data["embeddings"] = embeddings

Embedding the chunks of text data...
model_name='intfloat/e5-small-v2' cache_folder=None model_kwargs={'device': 'mps'} encode_kwargs={'batch_size': 8} query_encode_kwargs={} multi_process=False show_progress=False


  return forward_call(*args, **kwargs)


In [106]:
duck_tables = db.sql("show all tables").df()
db.sql("""
            INSTALL vss;
            LOAD vss;
            set hnsw_enable_experimental_persistence = true;
            CREATE TABLE UTMB AS SELECT * EXCLUDE (embeddings),
CAST(embeddings AS FLOAT[384]) AS embeddings FROM data;""")
db.sql("""LOAD vss; CREATE INDEX cos_idx ON UTMB USING HNSW (embeddings)
            WITH (metric = 'cosine');""")

CatalogException: Catalog Error: Table with name "UTMB" already exists!

In [8]:
db.close()