# Store parliamentary questions submitted by parties in the German parliament in a vector store

In [1]:
# Environment switch: when True, the env-loading code below reads
# "../../.env.prod"; when False it uses python-dotenv's default .env lookup.
UPLOAD_TO_PROD = False

In [None]:
# Load env file
import os
from dotenv import load_dotenv

# Choose the env file from the UPLOAD_TO_PROD switch. Passing None makes
# load_dotenv fall back to its default .env discovery. override=True lets
# values from the file replace variables that are already set.
env_file = "../../.env.prod" if UPLOAD_TO_PROD else None
load_dotenv(env_file, override=True)
print(f"ENV={os.getenv('ENV')}")

In [None]:
# Define variables
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os
import time

# Embedding model used to vectorise the documents.
embed = OpenAIEmbeddings(
    model="text-embedding-3-large", api_key=os.getenv("OPENAI_API_KEY")
)
# Dimension passed to the index on creation; it has to match the output size
# of the embedding model above.
EMBEDDING_SIZE = 3072
INDEX_NAME = "parliamentary-questions-index"
# The Pinecone client takes the key as `api_key`. The previous
# `pinecone_api_key=` / `embedding=` keywords belong to PineconeVectorStore
# and were silently ignored here, so authentication only worked through the
# PINECONE_API_KEY environment variable fallback.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [None]:
def create_index_if_not_exists(index_name: str):
    """Create the Pinecone index ``index_name`` unless it is already listed.

    A new index is created as a serverless index (AWS, eu-west-1) with cosine
    metric and ``EMBEDDING_SIZE`` dimensions. The call blocks, polling once
    per second, until the index status reports ready. Progress is printed.

    Args:
        index_name: Name of the index to look up or create.
    """
    index_name = str(index_name)
    known_names = {info["name"] for info in pc.list_indexes()}

    # Nothing to do when the index is already there.
    if index_name in known_names:
        print(f"Index {index_name} already exists")
        return

    print(f"Creating index {index_name}")
    serverless_spec = ServerlessSpec(cloud="aws", region="eu-west-1")
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_SIZE,
        metric="cosine",
        spec=serverless_spec,
    )

    # Wait for the freshly created index to become usable.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)
    print(f"Index {index_name} created")


# Make sure the index named by INDEX_NAME exists (it is created if missing).
create_index_if_not_exists(INDEX_NAME)

In [5]:
from typing import Optional
from uuid import uuid4
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
import os
import json
import locale

from pydantic import ValidationError

from data.scripts.script_utils import (
    create_vote_metadata_for_pinecone,
    ensure_uniform_vote_object_data,
)
from src.models.vote import Vote

# Switch only the date/time category (LC_TIME) to German.
# NOTE(review): presumably needed for German date formatting/parsing in the
# vote helpers — confirm. setlocale raises locale.Error if the "de_DE" locale
# is not available on the machine.
locale.setlocale(locale.LC_TIME, "de_DE")


# Embedding model name and API keys, read from the environment at import time
# (each key is None when its variable is not set).
EMBEDDING_MODEL = "text-embedding-3-large"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


def create_parliamentary_question_documents_for_namespaces(
    vote_id: int,
) -> Optional[dict[str, Document]]:
    """Build one document per submitting party for a single vote.

    Reads ``../votes/vote_{vote_id}.json``, validates it as a ``Vote`` and
    creates a ``Document`` for every entry in ``submitting_parties``. Each
    document is stored under the namespace
    ``"{party}-parliamentary-questions"``; its content is the vote's
    ``short_description`` and its id is ``#<vote id>#<random uuid>``.

    Args:
        vote_id: Numeric id used to locate the vote JSON file.

    Returns:
        A mapping of namespace name to ``Document``, or ``None`` when the
        vote is skipped. A vote is skipped (and the reason printed) when it
        fails validation, its category is not one of SA/AA/EA, it has no
        submitting parties, or its metadata exceeds the size limit.

    Raises:
        OSError: If the vote file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If an otherwise accepted vote has no
            ``"short_description"`` key.
    """
    vote_file_path = f"../votes/vote_{vote_id}.json"

    # Read the vote JSON file
    with open(vote_file_path, "r", encoding="utf-8") as f:
        vote_data = json.load(f)

    # The keys "voting_results" / "by_party" are used as-is. The former
    # self-renames of these keys were no-ops that only crashed with KeyError
    # on incomplete files before validation could skip them.
    try:
        vote_obj = Vote(**vote_data)
    except ValidationError as e:
        print(f"Skipping vote {vote_id} due to data validation error: {e}")
        return None

    ensure_uniform_vote_object_data(vote_obj)

    if vote_obj.vote_category not in ("SA", "AA", "EA"):
        print(f"Skipping vote {vote_id} due to request type {vote_obj.vote_category}")
        return None

    if not vote_obj.submitting_parties:
        print(f"Skipping vote {vote_id} due to missing submitting parties")
        return None

    # Upper bound for the serialized metadata of one record (40 KB).
    size_limit = 40960

    namespace_to_document = {}
    for party in vote_obj.submitting_parties:
        namespace = f"{party}-parliamentary-questions"

        # Build the metadata per document so every Document owns its own dict.
        metadata = create_vote_metadata_for_pinecone(vote_obj)

        # Measure the UTF-8 size of the JSON-serialized metadata.
        # NOTE(review): LangChain's Pinecone store appears to add the page
        # content to the record metadata on upload, so the stored payload may
        # be larger than measured here — confirm whether to include it.
        metadata_size = len(json.dumps(metadata, ensure_ascii=False).encode("utf-8"))
        if metadata_size > size_limit:
            print(
                f"Skipping vote {vote_id} due to metadata size limit: size of metadata={metadata_size} bytes; limit={size_limit} bytes"
            )
            # The whole vote is dropped, not just this party's document.
            return None

        # A random suffix keeps the ids of the per-party documents distinct.
        unique_suffix = uuid4()
        namespace_to_document[namespace] = Document(
            id=f"#{vote_obj.id}#{unique_suffix}",
            page_content=vote_data["short_description"],
            metadata=metadata,
        )
    return namespace_to_document

In [None]:
# Vote ids to process; range() below excludes end_id itself.
start_id = 377
end_id = 940
# end_id = 378 # for testing purposes, set to 940 to process all votes

# Group the generated documents by namespace and count the skipped votes.
namespace_to_documents: dict[str, list[Document]] = {}
skipped_docs = 0
for vote_id in range(start_id, end_id):
    namespace_to_document = create_parliamentary_question_documents_for_namespaces(
        vote_id
    )
    if namespace_to_document is None:
        skipped_docs += 1
        continue
    for namespace, doc in namespace_to_document.items():
        namespace_to_documents.setdefault(namespace, []).append(doc)
print(f"Skipped {skipped_docs} documents")

# Embed and upload each group into its own namespace of the index.
index = pc.Index(INDEX_NAME)
embed = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=OPENAI_API_KEY)
vector_store = PineconeVectorStore(index=index, embedding=embed)
for namespace, documents in namespace_to_documents.items():
    print(f"Adding {len(documents)} documents into namespace {namespace}")
    vector_store.add_documents(documents, namespace=namespace)
    print(f"Added {len(documents)} documents into namespace {namespace}")