# Create the embeddings from the voting behavior data

...

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Target-environment switch read by the env-loading cell below:
# False -> load the default .env file, True -> load ../../.env.prod.
UPLOAD_TO_PROD = False

### Create the index if it doesn't exist

In [None]:
# Load env file
import os
from dotenv import load_dotenv
# Choose the dotenv file for the target environment. Passing None makes
# load_dotenv fall back to its default .env lookup; override=True lets values
# from the file replace variables that are already set in the process.
dotenv_path = '../../.env.prod' if UPLOAD_TO_PROD else None
load_dotenv(dotenv_path, override=True)
# Echo the active environment so an accidental prod run is visible immediately.
print(f"ENV={os.getenv('ENV')}")

In [None]:
# Define variables
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import os
import time

# Embedding model used to vectorize the vote summaries.
embed = OpenAIEmbeddings(
    model="text-embedding-3-large", api_key=os.getenv("OPENAI_API_KEY")
)
# Vector dimension passed to pc.create_index; it must match the output size of
# the embedding model configured above.
EMBEDDING_SIZE = 3072
INDEX_NAME = "justified-voting-behavior-index"
# The Pinecone client constructor expects `api_key`. The former
# `pinecone_api_key=` / `embedding=` keywords are PineconeVectorStore
# arguments, not client arguments, so the key is now passed explicitly.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [5]:
def create_index_if_not_exists(
    name_prefix: str,
    timeout_s: float = 300.0,
    poll_interval_s: float = 1.0,
):
    """Create the Pinecone index ``{name_prefix}-index`` unless it already exists.

    A new index is created as a serverless cosine index (AWS, eu-west-1) with
    ``EMBEDDING_SIZE`` dimensions, and the function blocks until Pinecone
    reports it as ready.

    Args:
        name_prefix: Index name without the ``-index`` suffix.
        timeout_s: Maximum number of seconds to wait for a newly created index
            to become ready.
        poll_interval_s: Seconds to sleep between readiness checks.

    Raises:
        TimeoutError: If the new index is not ready within ``timeout_s``
            seconds. Previously the loop polled forever and could hang the
            kernel.
    """
    index_name = f"{name_prefix}-index"
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name in existing_indexes:
        return

    print(f"Creating index {index_name}")
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_SIZE,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="eu-west-1"),
    )

    # Bounded wait: monotonic() is immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_s
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Index {index_name} was not ready after {timeout_s} seconds"
            )
        time.sleep(poll_interval_s)
    print(f"Index {index_name} created")


# Ensure the target index exists. The function appends "-index" to this
# prefix, so the result must equal INDEX_NAME, which the later cells open.
create_index_if_not_exists("justified-voting-behavior")

### Vectorize the data and insert into the index

In [6]:
from uuid import uuid4
from langchain.schema import Document
from pinecone import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import os
import json
import locale

from pydantic import ValidationError

from data.scripts.script_utils import create_vote_metadata_for_pinecone, ensure_uniform_vote_object_data
from src.models.vote import Vote

# Switch this process's date/time formatting (LC_TIME only) to German.
# NOTE(review): presumably required by date formatting inside the imported
# vote helpers — confirm. setlocale raises locale.Error if the "de_DE" locale
# is not installed on the machine running the notebook.
locale.setlocale(locale.LC_TIME, "de_DE")

# Credentials come from the environment (None when a variable is unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# Name of the OpenAI embedding model used when the documents are ingested.
EMBEDDING_MODEL = "text-embedding-3-large"


def create_vote_document(
    vote_id: int,
    votes_dir: str = "../votes",
    metadata_size_limit: int = 40960,
) -> "Document | None":
    """Build the vector-store document for one vote, or None if it is skipped.

    The vote is read from ``{votes_dir}/vote_{vote_id}.json``, validated with
    the ``Vote`` model and passed through ``ensure_uniform_vote_object_data``.
    The vote's ``short_description`` becomes the document text and the output
    of ``create_vote_metadata_for_pinecone`` becomes its metadata.

    A vote is skipped (a message is printed and None is returned) when:
      * its JSON file is missing, unreadable or not valid JSON,
      * the data fails ``Vote`` validation,
      * its ``vote_category`` is ``"B"``,
      * its UTF-8 encoded JSON metadata exceeds ``metadata_size_limit`` bytes.

    Args:
        vote_id: Numeric id used to locate the vote file.
        votes_dir: Directory that contains the ``vote_<id>.json`` files.
        metadata_size_limit: Maximum metadata size in bytes. The default of
            40960 corresponds to Pinecone's 40 KB per-vector metadata limit.

    Returns:
        The ``Document`` to index, or None if the vote was skipped.
    """
    vote_file_path = f"{votes_dir}/vote_{vote_id}.json"

    try:
        # Read the vote JSON file
        with open(vote_file_path, 'r', encoding="utf-8") as f:
            vote_data = json.load(f)
    except FileNotFoundError as e:
        print(f"Skipping vote {vote_id} due to file not found: {e}")
        return None
    except Exception as e:
        # Still best-effort, but no longer mislabelled as "file not found":
        # this branch covers permission errors, bad encodings and broken JSON.
        print(f"Skipping vote {vote_id} due to unreadable or invalid JSON file: {e}")
        return None

    try:
        vote_obj = Vote(**vote_data)
    except ValidationError as e:
        print(f"Skipping vote {vote_id} due to data validation error: {e}")
        return None

    ensure_uniform_vote_object_data(vote_obj)

    if vote_obj.vote_category == "B":
        print(f"Skipping vote {vote_id} due to vote category B")
        return None

    # Create metadata for the document
    metadata = create_vote_metadata_for_pinecone(vote_obj)
    # Measure the metadata as UTF-8 encoded JSON; ensure_ascii=False keeps
    # non-ASCII characters unescaped so they count with their real byte length.
    metadata_size = len(json.dumps(metadata, ensure_ascii=False).encode('utf-8'))
    if metadata_size > metadata_size_limit:
        print(f"Skipping vote {vote_id} due to metadata size limit: size of metadata={metadata_size} bytes; limit={metadata_size_limit} bytes")
        return None

    # Create document from vote detail.
    # NOTE(review): the random uuid suffix gives every run fresh ids, so
    # re-running the ingestion presumably adds duplicates instead of
    # overwriting existing vectors — confirm this is intended.
    uuid = uuid4()
    doc = Document(
        id=f"#{vote_obj.id}#{uuid}",
        page_content=vote_obj.short_description,
        metadata=metadata,
    )
    return doc

In [None]:
# Vote ids to ingest; range() below treats end_id as exclusive.
# For a quick test run use end_id = 378; 940 processes all votes.
start_id, end_id = 377, 940

# Build one document per vote and count the votes that had to be skipped.
documents_to_add = []
skipped_docs = 0
for vote_id in range(start_id, end_id):
    doc = create_vote_document(vote_id)
    if not doc:
        skipped_docs += 1
        continue
    documents_to_add.append(doc)
print(f"Skipped {skipped_docs} documents")

# Embed the documents and write them to the "vote_summary" namespace.
# NOTE(review): OpenAIEmbeddings here resolves to the most recent import in
# this notebook (langchain.embeddings), not the langchain_openai class
# imported in the earlier setup cell — confirm which one is intended.
index = pc.Index(INDEX_NAME)
embed = OpenAIEmbeddings(model=EMBEDDING_MODEL, api_key=OPENAI_API_KEY)
vector_store = PineconeVectorStore(index=index, embedding=embed)
print(f"Adding {len(documents_to_add)} documents to the index")
vector_store.add_documents(documents_to_add, namespace="vote_summary")
print("Documents added to the index")

## Load relevant documents from the index

In [20]:
# Embedding model for queries; it has to be the same model that was used to
# embed the indexed documents.
embed = OpenAIEmbeddings(
    model="text-embedding-3-large", api_key=os.getenv("OPENAI_API_KEY")
)

# The Pinecone client constructor expects `api_key`. The former
# `pinecone_api_key=` / `embedding=` keywords are PineconeVectorStore
# arguments, not client arguments, so the key is now passed explicitly.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(INDEX_NAME)
pinecone_vector_store = PineconeVectorStore(index=index, embedding=embed)

def query_voting_behavior(
    rag_query: str,
    n_docs: int = 10,
    score_threshold: float = 0.5,
) -> list[Document]:
    """Return the vote documents from the index that match ``rag_query``.

    The search runs against the ``vote_summary`` namespace of
    ``pinecone_vector_store`` using its relevance-score search.

    Args:
        rag_query: Free-text query to search with.
        n_docs: Value forwarded as ``k`` (number of documents requested).
        score_threshold: Value forwarded as ``score_threshold`` to the search.

    Returns:
        The matching documents, in the order returned by the store and with
        their scores dropped.
    """
    search_options = {
        "namespace": "vote_summary",
        "k": n_docs,
        "score_threshold": score_threshold,
    }
    scored_hits = pinecone_vector_store.similarity_search_with_relevance_scores(
        rag_query, **search_options
    )
    # Each hit is a (document, score) pair; callers only need the document.
    return [document for document, _score in scored_hits]

## Query the index

In [None]:
# Example query text (German) used to try out the retrieval helper.
query = """Die AfD plant, Menschen mit geringem Einkommen unter anderem durch **Anhebung des Sparerpauschbetrags auf 2.400 Euro** zu entlasten, was den privaten Vermögensaufbau erleichtern soll [0]. Zudem sieht die AfD eine **Absenkung der Mehrwertsteuer für Artikel des Kinderbedarfs auf 7 %** vor, was finanziell entlasten kann [2]. Auch das steuerliche **Familiensplitting** soll insbesondere Familien mit mittlerem Einkommen entlasten [2]."""

# Print metadata and text of every hit, each followed by a separator line.
separator = "----------------------------------"
for result in query_voting_behavior(query):
    print(result.metadata, result.page_content, separator, sep="\n")

