# Import packages

In [None]:
import os
from dotenv import load_dotenv
import json
import pandas as pd

import pinecone
import cohere

load_dotenv()

# Load dataset

In [None]:
# Read the four propisi_net spreadsheets: laws, articles, units and items
law_df = pd.read_excel("./dataset/propisi_net/laws.xlsx")
article_df = pd.read_excel("./dataset/propisi_net/articles.xlsx")
unit_df = pd.read_excel("./dataset/propisi_net/units.xlsx")
items_df = pd.read_excel("./dataset/propisi_net/items.xlsx")

# Export laws, articles and units to CSV without the index column.
# 'utf-8-sig' writes a BOM at the start of each file.
# Items are not exported here; items.csv is written after filtering.
for frame, csv_path in (
    (law_df, "./dataset/propisi_net/laws.csv"),
    (article_df, "./dataset/propisi_net/articles.csv"),
    (unit_df, "./dataset/propisi_net/units.csv"),
):
    frame.to_csv(csv_path, index=None, encoding='utf-8-sig')

In [None]:
# Keep only the items that have a start offset
items_wo_na_df = items_df[items_df.start.notna()].copy()
# Item length = end offset - start offset
items_wo_na_df["length"] = items_wo_na_df.end - items_wo_na_df.start

# Keep items shorter than 2000 and only the columns stored in the database
db_columns = ["id", "law_id", "article_id", "start", "end", "reference"]
is_short_enough = items_wo_na_df.length < 2000
items_for_db = items_wo_na_df.loc[is_short_enough, db_columns].copy()
# Offsets are cast to int so they can be used as slice bounds
items_for_db = items_for_db.astype({"start": int, "end": int})
# Export the filtered items to CSV without the index column
items_for_db.to_csv("./dataset/propisi_net/items.csv", index=None, encoding='utf-8-sig')

# Create logic to add references into text

In [None]:
def add_references(cohere_text, references, article, unit_df):
    """Expand the references in an item's text with the text they point to.

    After each reference position the text of the referenced units is
    inserted as `` (koji glasi: <unit text>)``. The resulting pieces are
    joined with a single space.

    Args:
        cohere_text: Item text in which the references occur.
        references: Iterable of dicts, each holding an ``"end"`` offset into
            ``cohere_text`` (where the quoted text is inserted) and a
            ``"unit_ids"`` list with the ids of the units to quote.
        article: Object exposing the article body as ``article.text``; the
            unit ``start``/``end`` offsets are applied to this text.
        unit_df: DataFrame with ``id``, ``start`` and ``end`` columns.

    Returns:
        The text with every reference expanded. When ``references`` is
        empty, ``cohere_text`` is returned unchanged.

    Raises:
        IndexError: If a referenced unit id is not present in ``unit_df``.
    """
    # The offset bookkeeping below only works when references are visited in
    # ascending "end" order; out-of-order input would repeat parts of the text
    # and attach the quotes at the wrong place. sorted() is stable and does
    # not modify the caller's list.
    ordered_references = sorted(references, key=lambda ref: ref["end"])

    item_parts = []
    curr_offset = 0
    for reference in ordered_references:
        # Every referenced unit contributes its slice of the article text,
        # each terminated by a newline.
        unit_texts = []
        for unit_id in reference["unit_ids"]:
            unit = unit_df[unit_df.id == unit_id].iloc[0]
            unit_texts.append(article.text[unit.start:unit.end] + "\n")
        unit_text = "".join(unit_texts)

        # Text since the previous reference, followed by the quoted units
        item_parts.append(
            cohere_text[curr_offset:reference["end"]] + f" (koji glasi: {unit_text})"
        )
        curr_offset = reference["end"]

    # Remainder of the text after the last reference
    item_parts.append(cohere_text[curr_offset:])

    return ' '.join(item_parts)

# Create list of items for vector database (pinecone)

In [None]:
# Only the first MAX_ITEMS items are processed (sample cap for this run)
MAX_ITEMS = 21

# Parallel lists: item id, text to embed, and id of the item's article
item_ids, vector_items, article_ids = [], [], []
for _, item_row in items_for_db.head(MAX_ITEMS).iterrows():
    # Article the item belongs to; the item text is a slice of its body
    article = article_df[article_df.id == item_row.article_id].iloc[0]
    cohere_text = article.text[item_row.start:item_row.end]
    # The reference column holds a JSON-encoded collection of references
    references = json.loads(item_row.reference)
    # TODO: Add Law and Article Title here
    if references:
        cohere_text = add_references(cohere_text, references, article, unit_df)

    item_ids.append(item_row.id)
    vector_items.append(cohere_text)
    article_ids.append(article.id)

# Establish connection with pinecone and cohere services

In [None]:
# Initialise the Pinecone client with credentials taken from the environment
pinecone_api_key = os.environ["PINECONE_KEY"]
pinecone_environment = os.environ["PINECONE_ENV"]
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
active_indexes = pinecone.list_indexes()

# Create the Cohere client
co = cohere.Client(os.environ["COHERE_KEY"])
# Transform the item texts into embeddings (one vector per text)
embed_response = co.embed(
    texts=vector_items,
    model="embed-multilingual-v3.0",
    input_type="search_document"
)
vector_item_values = embed_response.embeddings

# Send cohere vectors to database

In [None]:
# One record per item: string id, embedding values, and the article id as metadata
pinecone_vectors = [
    {
        'id': str(item_id),
        'values': vector_item_value,
        'metadata': {'article_id': int(article_id)},
    }
    for item_id, vector_item_value, article_id in zip(item_ids, vector_item_values, article_ids)
]

# Upsert all records into the 'items' index under the configured namespace
index = pinecone.Index('items')
target_namespace = os.environ["PINECONE_NAMESPACE"]
upsert_response = index.upsert(vectors=pinecone_vectors, namespace=target_namespace)