In [None]:
# Zero-based column positions of each field in a row of the projects CSV.
# The names are listed in column order; unpacking from range(20) numbers them
# 0..19 and fails loudly if a name is added or removed without updating the count.
(
    ID_INDEX,
    ACRONYM_INDEX,
    STATUS_INDEX,
    TITLE_INDEX,
    START_DATE_INDEX,
    END_DATE_INDEX,
    TOTAL_COST_INDEX,
    EC_MAX_CONTRIBUTION_INDEX,
    LEGAL_BASIS_INDEX,
    TOPICS_INDEX,
    EC_SIGNATURE_DATE_INDEX,
    FRAMEWORK_PROGRAMME_INDEX,
    MASTER_CALL_INDEX,
    SUB_CALL_INDEX,
    FUNDING_SCHEME_INDEX,
    NATURE_INDEX,
    OBJECTIVE_INDEX,
    CONTENT_UPDATE_DATE_INDEX,
    RCN_INDEX,
    GRANT_DOI_INDEX,
) = range(20)

In [None]:
import csv

# Read the semicolon-delimited projects CSV and build three parallel lists,
# one entry per project row:
#   documents - title and objective joined by a newline (the text handed to the vector database)
#   metadatas - every column of the row, keyed by a readable name
#   ids       - the project id column
#
# newline='' is what the csv module asks for when given a file object, so that
# line breaks inside quoted fields are parsed as part of the field.
# NOTE(review): no encoding is passed, so the platform default is used - confirm
# the file's encoding and pass it explicitly if the notebook must run elsewhere.
with open('data/H2020_projects.csv', newline='') as file:
    lines = csv.reader(file, delimiter=";")

    # Skip the first row (the column headers). The default keeps an empty file
    # from raising StopIteration.
    next(lines, None)

    documents = []
    metadatas = []
    ids = []

    for line in lines:
        if not line:
            # csv.reader yields an empty list for a blank line; there is nothing to index.
            continue

        document = f"{line[TITLE_INDEX]}\n{line[OBJECTIVE_INDEX]}"
        metadata = {
            "item_id": line[ID_INDEX],
            "acronym": line[ACRONYM_INDEX],
            "status": line[STATUS_INDEX],
            "title": line[TITLE_INDEX],
            "start_date": line[START_DATE_INDEX],
            "end_date": line[END_DATE_INDEX],
            "total_cost": line[TOTAL_COST_INDEX],
            "ec_max_contribution": line[EC_MAX_CONTRIBUTION_INDEX],
            "legal_basis": line[LEGAL_BASIS_INDEX],
            "topics": line[TOPICS_INDEX],
            "ec_signature_date": line[EC_SIGNATURE_DATE_INDEX],
            "framework_programme": line[FRAMEWORK_PROGRAMME_INDEX],
            "master_call": line[MASTER_CALL_INDEX],
            "sub_call": line[SUB_CALL_INDEX],
            "funding_scheme": line[FUNDING_SCHEME_INDEX],
            "nature": line[NATURE_INDEX],
            "objective": line[OBJECTIVE_INDEX],
            "content_update_date": line[CONTENT_UPDATE_DATE_INDEX],
            "rcn": line[RCN_INDEX],
            "grant_doi": line[GRANT_DOI_INDEX]
        }

        documents.append(document)
        metadatas.append(metadata)
        ids.append(line[ID_INDEX])

# Sanity check: the three lists are parallel, so all three counts should match.
print(len(documents))
print(len(metadatas))
print(len(ids))

In [None]:
# Install chromadb (the vector database client imported further down).
# NOTE(review): no version is pinned, so a re-run may install a different release - consider pinning.
%pip install chromadb

# Install sentence transformers.
# This is used to convert text to vector embeddings. In other words, it converts text to a list of numbers that represents the 'meaning' of the text.
# -U upgrades the package if an older version is already installed.
%pip install -U sentence-transformers

In [None]:
# Reference: https://docs.trychroma.com/getting-started

import chromadb
from chromadb.utils import embedding_functions

# Alternative: an in-memory client, whose data is not written to disk.
# chroma_client = chromadb.Client()

# Create a client whose data is stored on disk under the relative path
# '../database/projects_vectordb', i.e. in a 'database' folder one level above
# the directory the notebook runs from.
chroma_client = chromadb.PersistentClient(
    path="../database/projects_vectordb",
)

In [None]:
# Embedding model used to turn text into vectors.
# Available model names are listed at https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

# To delete the collection and start over, uncomment the next line:
# chroma_client.delete_collection(name="projects_collection")

# Open the collection (the vector database), creating it first if it does not
# exist yet, and tell it which embedding function to use.
collection = chroma_client.get_or_create_collection(
    embedding_function=sentence_transformer_ef,
    name="projects_collection",
)

In [None]:
# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings.
#
# The rows are sent in fixed-size slices instead of one call: Chroma rejects a
# single add() that exceeds its maximum batch size, and smaller slices also
# bound how much text is embedded at once.
# NOTE(review): 1000 is a deliberately conservative slice size - tune if needed.
ADD_BATCH_SIZE = 1000

for batch_start in range(0, len(documents), ADD_BATCH_SIZE):
    batch_end = batch_start + ADD_BATCH_SIZE
    # The three lists are parallel, so the same slice keeps each id aligned
    # with its document and metadata.
    collection.add(
        documents=documents[batch_start:batch_end],
        metadatas=metadatas[batch_start:batch_end],
        ids=ids[batch_start:batch_end]
    )

In [None]:
# Search the vector database with a free-text query.
# Five results are requested, and each one should carry its document text,
# its distance to the query and its stored metadata.
results = collection.query(
    include=["documents", "distances", "metadatas"],
    n_results=5,
    query_texts=["Digital Transformation for Sustainable and Resilient Food Systems"],
)