In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `lib` package used below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os

# Read the OpenAI credential from the environment so it is never stored in the notebook.
try:
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
except KeyError as exc:
    # Same exception type as before, but with an actionable message instead of
    # just the bare variable name.
    raise KeyError(
        "OPENAI_API_KEY is not set; export it in the environment before starting the kernel"
    ) from exc

In [4]:
from loguru import logger
from langchain.callbacks import FileCallbackHandler
from datetime import date

# One log file per calendar day; both loguru and the LangChain callback handler
# are pointed at the same path.
today = date.today()
logfile = f"logs/full_pipeline_{today.year}_{today.month}_{today.day}.log"

# Re-running this cell must not stack another sink on the same file, otherwise
# every message is written once per re-run. Drop the sink registered by a
# previous execution of this cell before adding a fresh one.
_previous_sink_id = globals().get("_logfile_sink_id")
if _previous_sink_id is not None:
    try:
        logger.remove(_previous_sink_id)
    except ValueError:
        # The sink was already removed elsewhere (e.g. a bare logger.remove());
        # there is nothing left to undo.
        pass

# format="{message}" writes only the message text to the file (no timestamp or
# level); the sink id is kept so the next run of this cell can remove it.
_logfile_sink_id = logger.add(logfile, format="{message}", colorize=False, enqueue=True)

# Callback handler passed to the chains below via their `handler` argument.
handler = FileCallbackHandler(logfile)

In [16]:
from lib import create_cube_selection_chain, fetch_cubes_descriptions, parse_all_cubes, fetch_cube_sample, fetch_dimensions_triplets

cube_selection_settings = {
    "temperature": 0.5,
    "top_p": 0.5
}
cubes = fetch_cubes_descriptions()
#dimensions = fetch_dimensions()
cube_selection_chain = create_cube_selection_chain(api_key=OPENAI_API_KEY, handler=handler, **cube_selection_settings)

questions = [
"sum of emission of CO2 for industry between year 2009 and 2011",
"get average of emission of Methane for transport between years 2007 and 2005",
"What percentage of emission was from N2O and CH4 compared to total emission?",
"what bathing stations are in switzerland?",
"what swiss bathing stations had poor water quality in 2024?",
"what is maximum contamination in lead in soil in 2022?",
"what's for lunch"]

question=questions[-2]


cube_selection_response = await cube_selection_chain.ainvoke({
    "cubes": cubes,
    "question": question,
})
cube_selection_response = cube_selection_response['text']

logger.info("========== CUBES RESPONSE ================")
logger.info(f"{cube_selection_response}")

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
[32m2025-01-15 17:39:05.581[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mThe cube that best answers the question about maximum contamination in lead in soil in 2022 is:

**Cube ID:** <https://environment.ld.admin.ch/foen/ubd006601/3>

**Justification:** This cube is titled "Heavy Metal Soil Contamination" and contains measurement data of heavy metal concentrations, which would include lead, recorded in the top 20 cm of the soil. While the specific year 2022 is not mentioned, it is reasonable to assume that the data collected in this cube would provide relevant information regarding lead contamination in soil.

**Available Topics in the Cubes:**
- Forest fire danger
- Forest fire prevention measures
- NFI: Change
- Traffic noise pollution
- Air pollution along the A2 and A13 - annual averages - NO and NO2
- Bathing water qualit

In [14]:
# Display the question currently bound in the kernel.
# NOTE(review): the saved output of this cell does not match questions[-2] from
# the selection cell, so it was presumably produced by an earlier kernel state —
# re-run after the selection cell to confirm.
question

'what is average contamination in plomb in soil in 2022?'

In [6]:
# Parse the cubes referenced in the model's selection response.
selected_cubes = parse_all_cubes(cube_selection_response)

# The response may contain no parsable cube (the saved run of this cell failed
# with "list index out of range"). Fail with an explanatory message instead;
# the exception type is kept as IndexError so behaviour stays compatible.
if not selected_cubes:
    raise IndexError(
        "No cube could be parsed from the cube selection response: "
        f"{cube_selection_response!r}"
    )

# The rest of the pipeline works on the first parsed cube only.
selected_cube = selected_cubes[0]

logger.info("========== SELECTED FIRST CUBE ================")
logger.info(selected_cube)

IndexError: list index out of range

In [15]:
# Result of fetch_cube_sample for the selected cube; passed to the query
# generation chain below as its "cube_and_sample" input.
# NOTE(review): presumably the cube's description plus sample observations —
# confirm against lib.fetch_cube_sample.
cube_and_sample = fetch_cube_sample(selected_cube)

# Debug aid, left disabled:
# print(cube_and_sample)

In [16]:
# Result of fetch_dimensions_triplets for the selected cube; passed to the query
# generation chain below as its "dimensions_triplets" input.
# NOTE(review): presumably the triples describing the cube's dimensions —
# confirm against lib.fetch_dimensions_triplets.
dimensions_triplets = fetch_dimensions_triplets(selected_cube)

# Debug aid, left disabled:
# print(dimensions_triplets)

In [None]:
from lib import create_query_generation_chain

query_generation_settings = {
    "temperature": 0.2,
    "top_p": 0.1
}

generation_chain = create_query_generation_chain(api_key=OPENAI_API_KEY, handler=handler, **query_generation_settings)

query_generation_response = await generation_chain.ainvoke({
    "cube_and_sample": cube_and_sample,
    "dimensions_triplets": dimensions_triplets,
    "cube": selected_cube,
    "question": question,
})
query_generation_response = query_generation_response['text']

logger.info("========== QUERY GENERATION RESPONSE ================")
logger.info(f"{query_generation_response}")

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.
[32m2025-01-15 17:24:32.906[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mPREFIX cube: <https://cube.link/>
PREFIX schema: <http://schema.org/>
PREFIX qudt: <http://qudt.org/schema/qudt/>
PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT (SUM(?emission) AS ?totalEmission)
WHERE {
    <https://environment.ld.admin.ch/foen/ubd000504/8> a cube:Cube;
        cube:observationSet ?observationSet.

    ?observationSet a cube:ObservationSet;
        cube:observation ?observation.

    ?observation a cube:Observation;
        <https://environment.ld.admin.ch/foen/ubd000504/jahr> ?year;
        <https://environment.ld.admin.ch/foen/ubd000504/werte> ?emission.

    FILTER(?year >= "2

In [None]:
from lib import run_query

# Execute the generated query text and keep whatever run_query returns.
# NOTE(review): the saved output looks like a SPARQL JSON result
# ({'head': ..., 'results': {'bindings': [...]}}) — confirm against lib.run_query.
result = run_query(query_generation_response)

# Log the raw result under a banner so it is easy to find in the log file.
logger.info("=========== QUERY RESULT ============")
logger.info(result)

[32m2025-01-15 17:24:33.150[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m{'head': {'vars': ['totalEmission']}, 'results': {'bindings': [{'totalEmission': {'datatype': 'http://www.w3.org/2001/XMLSchema#decimal', 'type': 'literal', 'value': '112.144'}}]}}[0m
