In [None]:
from sentence_transformers import SentenceTransformer
import requests
import pandas as pd

from langchain_chroma import Chroma
import chromadb
from tqdm import tqdm

from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [None]:
# Host shared by the SPARQL endpoint and the CRISP IRI namespace.
_CRISP_HOST = "http://crisp.ai.wu.ac.at"

# Blazegraph SPARQL endpoint that sparql_query() sends its requests to.
CRISP_ENDPOINT = f"{_CRISP_HOST}/blazegraph/namespace/crisp/sparql"
# Base URL of the Ollama server handed to ChatOllama.
OLLAMA_ENDPOINT = "http://llama-max-ollama.ai.wu.ac.at"
# IRI prefix that sparql_query() strips from every result value.
CRISP_NAMESPACE = f"{_CRISP_HOST}/crisp/"

In [None]:
def sparql_query(query: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Run a SPARQL SELECT query against the CRISP endpoint and return the solutions as a DataFrame.

    The query is sent with an HTTP GET to ``CRISP_ENDPOINT`` and the JSON
    response is flattened into one row per solution. Every bound value has the
    ``CRISP_NAMESPACE`` prefix removed, so CRISP IRIs come back as short
    identifiers.

    Args:
        query: SPARQL query text.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        A DataFrame with one column per projected variable, in projection
        order. Variables that are unbound in a solution are ``None``. If the
        request fails or the response cannot be interpreted, the error is
        printed and an empty DataFrame is returned.
    """
    try:
        response = requests.get(
            CRISP_ENDPOINT,
            params={'query': query, 'format': 'json'},
            timeout=timeout,  # without a timeout a stalled endpoint blocks the kernel forever
        )
        # Report HTTP errors (e.g. a rejected query) as such instead of as an
        # opaque JSON decoding failure on the error page.
        response.raise_for_status()
        results = response.json()

        # Variable (column) names in the order the query projected them
        columns = [str(var) for var in results['head']['vars']]

        data = []
        for binding in results['results']['bindings']:
            row_data = {}
            for var in columns:
                # The SPARQL JSON format leaves out variables that are unbound
                # in a solution (OPTIONAL patterns), so do not index blindly.
                cell = binding.get(var)
                row_data[var] = None if cell is None else cell['value'].replace(CRISP_NAMESPACE, "")
            data.append(row_data)

        return pd.DataFrame(data, columns=columns)

    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best-effort contract: callers receive an empty frame rather than an exception.
        print(f"An error occurred while executing the SPARQL query: {e}")
        return pd.DataFrame()

In [None]:

# Name of the pretrained Sentence Transformer checkpoint used for every embedding in this notebook.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Smoke test: embed one sentence and display the resulting vector.
sample_embedding = model.encode("Why sky is blue?")
sample_embedding

In [None]:
# SPARQL query listing communities with their label and the label of their state.
# The state is reached through two crisp:locatedIn hops (community -> district -> state).
# NOTE: LIMIT 10 caps the result at ten rows, so at most ten communities are processed
# by the cells that consume this query.
community_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix crisp: <http://crisp.ai.wu.ac.at/crisp/>

SELECT ?community_id ?community_name ?state_name

WHERE {
  ?community_id a crisp:Community;
    rdfs:label ?community_name ; 
    crisp:locatedIn ?district_id .
  ?district_id crisp:locatedIn ?state_id .
  ?state_id rdfs:label ?state_name .      
} LIMIT 10
"""

# Observations of a community
def observation_query(community_id):
    """
    Build the SPARQL query that lists every observation of one community.

    ``community_id`` is appended verbatim to the CRISP namespace to form the
    feature-of-interest IRI. The returned query projects ?observation_id,
    ?property and ?value.
    """
    feature_iri = f"<http://crisp.ai.wu.ac.at/crisp/{community_id}>"
    query_text = f"""
    prefix sosa: <http://www.w3.org/ns/sosa/> 
    prefix crisp: <http://crisp.ai.wu.ac.at/crisp/>

    SELECT ?observation_id ?property ?value

    WHERE {{
      ?observation_id sosa:hasFeatureOfInterest {feature_iri} ;
          sosa:observedProperty ?property; 
          sosa:hasSimpleResult ?value . 
    }}
    """
    return query_text




In [None]:
# Create the Chroma persistent client (no arguments, so Chroma's defaults apply)
# and fetch or create the collection this notebook writes to.
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection(name="graphrag_collection")

# Parallel lists filled by the cells below; position i of each list describes the same record.
documents = []
metadatas = []
embeddings = []
ids = []


In [None]:

# Fetch the communities and queue one record per community for the vector DB
community_df = sparql_query(community_query)
print(community_df)

print("Processing communities and adding to Chroma collection...")

# community id -> community name, reused when the observations are processed
communities = {}
community_records = tqdm(
    community_df.to_dict("records"),
    total=len(community_df),
    desc="Communities Processed",
)
for record in community_records:
    community_key = record['community_id']
    community_label = record['community_name']

    # The sentence below is what gets embedded; the document string wraps it with name and type.
    description = f"{community_label}  is a community located in {record['state_name']} state in Austria"
    documents.append(f"Name: {community_label}, Type: 'Community', Description: {description}")
    metadatas.append(
        dict(
            subject=community_key,
            name=str(community_label),
            type='Community',
            description=description,
        )
    )
    embeddings.append(model.encode(description))
    ids.append(str(community_key))
    communities[community_key] = community_label


In [None]:
def observation_description(community_name, row):
    """
    Turn one observation row into an English sentence about the community.

    ``row`` must provide 'property', 'observation_id' and 'value'. The time
    period is read from the trailing '/'-separated segments of the observation
    id: ``.../<year>/<week>`` for 'weeklyHeatdaysOver30' and ``.../<year>`` for
    every other property, which is worded as a population figure.
    """
    if row['property'] != "weeklyHeatdaysOver30":
        year = row['observation_id'].rsplit('/', 1)[-1]
        return f"The population of the {community_name} community in {year} was {row['value']}"

    # Weekly heat-day observations end in <year>/<week>
    year, week = row['observation_id'].rsplit('/', 2)[-2:]
    return f"The {community_name} community experienced {row['value']} hot days during week {week} of {year}"

In [None]:
# Queue one record per observation of every collected community for the vector DB
for community_id, community_label in communities.items():
    observation_df = sparql_query(observation_query(community_id))

    print("Processing observations and adding to Chroma collection...")

    observation_rows = tqdm(
        observation_df.to_dict("records"),
        total=len(observation_df),
        desc="Observations Processed",
    )
    for row in observation_rows:
        # The generated sentence is both the embedded text and the stored description.
        description = observation_description(community_label, row)
        documents.append(f"Type: 'Observation', Description: {description}")
        metadatas.append(
            dict(
                subject=row['observation_id'],
                type='Observation',
                description=description,
            )
        )
        embeddings.append(model.encode(description))
        ids.append(str(row['observation_id']))



In [None]:

# Write the queued records to the Chroma collection.
# The accumulator lists grow every time the ingestion cells are re-run, and Chroma
# rejects a call that contains the same id twice, so keep only the last record queued per id.
unique_positions = list({record_id: position for position, record_id in enumerate(ids)}.values())

# upsert (rather than add) so that ids already stored by an earlier run of this notebook
# are refreshed; the persistent client keeps its data between kernel restarts.
# Writing in slices keeps each call small (Chroma limits the number of records per call)
# and means no call is made at all when nothing was queued.
UPSERT_BATCH_SIZE = 1000
for start in range(0, len(unique_positions), UPSERT_BATCH_SIZE):
    batch = unique_positions[start:start + UPSERT_BATCH_SIZE]
    collection.upsert(
        documents=[documents[i] for i in batch],
        metadatas=[metadatas[i] for i in batch],
        embeddings=[embeddings[i] for i in batch],
        ids=[ids[i] for i in batch],
    )

if unique_positions:
    print("Chroma collection populated successfully.")
else:
    print("No records were queued; the Chroma collection was left unchanged.")

# Initialize vector store using Chroma
vector_store = Chroma(client=persistent_client, collection_name="graphrag_collection")

# Verify the count of entries through the public collection API
# (same client and collection name as the vector store).
print(f"Total entries in the collection: {collection.count()}")


# LLM Query

In [None]:
# Chat model served from the Ollama endpoint; num_predict caps the response at 256 tokens.
llm = ChatOllama(
    base_url=OLLAMA_ENDPOINT,
    model="llama3.1",
    temperature=0.8,
    num_predict=256,
)

# Use the first collected community as the subject of the demo question.
first_community = list(communities.items())[0]
community_id, community_name = first_community

question = f"What can you tell me about hot days in {community_name}?"

In [None]:
# Baseline: ask the question without any retrieved context
system_message = (
    "system",
    "You are a helpful assistant that answers questions about community observations.",
)
human_message = ("human", "{input}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# prompt -> model -> plain string answer
chain = prompt | llm | StrOutputParser()
chain.invoke({"input": question})

In [None]:
# Embed the question with the same model used for the stored descriptions
query_embedding_vector = model.encode(question)
embedding_shape = query_embedding_vector.shape
print(embedding_shape)

In [None]:
# Retrieval limits. In the visible cells only TOP_ENTITIES is read (as `k` for the
# vector similarity search); the other constants are not referenced there.
# NOTE(review): the remaining limits look like placeholders for graph-expansion steps — confirm or remove.
TOP_ENTITIES = 10
TOP_CHUNKS = 10
TOP_COMMUNITIES = 3
TOP_OUTGOING_RELATIONSHIPS = 10
TOP_INCOMING_RELATIONSHIPS = 10

In [None]:
# Retrieve the stored records closest to the question embedding
results = vector_store.similarity_search_by_vector(
    embedding=query_embedding_vector,
    k=TOP_ENTITIES,
)

hit_metadata = [doc.metadata for doc in results]
entity_list = [meta['subject'] for meta in hit_metadata]
descriptions = [meta['description'] for meta in hit_metadata]

# Retrieved descriptions joined into one text block for the prompt
context = ". \n".join(descriptions)

In [None]:
# Query with context from embeddings
# The retrieved text is supplied as a template variable instead of being interpolated
# with an f-string: ChatPromptTemplate treats "{...}" in its message strings as
# placeholders, so braces inside the retrieved descriptions would otherwise be parsed
# as template variables and break the prompt.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that answers questions about community observations. "
            "You have the following context: {context}",
        ),
        ("human", "{input}"),
    ]
)
chain = prompt | llm | StrOutputParser()
chain.invoke(
    {
        "input": question,
        "context": context,
    }
)