In [1]:
# !pip install --upgrade boto3==1.39.8
import io
import os
import random
import nltk
import boto3
import requests
import json
from io import BytesIO
import tempfile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# --- Storage locations and index names used throughout the notebook ---
S3_BUCKET = 'ml-legal-restricted'
EXCEL_PATH = 'full_contracts_with_files.xlsx'
VECTOR_BUCKET_NAME = 'legal-docs-vector-store'
SOURCE_INDEX = "token-chunking-poc"
TARGET_INDEX = "token-chunking-metadata-enriched"

# Fetch the NLTK tokenizer resources (a no-op when they are already present).
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# AWS clients: plain S3 plus the S3 Vectors service pinned to us-east-1.
s3 = boto3.client("s3")
s3v = boto3.client('s3vectors', region_name='us-east-1')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
INDEX_NAME = 'token-chunking-metadata-enriched'

# Count every vector stored in INDEX_NAME by paging through list_vectors.
# Only the number of entries per page is needed, so neither the embedding data
# nor the metadata is requested; asking for them would download the whole
# index just to call len() on each page.
paginator = s3v.get_paginator('list_vectors')

page_iterator = paginator.paginate(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    PaginationConfig={
        'PageSize': 1000
    }
)

total_vectors_count = sum(len(page.get('vectors', [])) for page in page_iterator)

print(f"Total chunks vector & stored: {total_vectors_count}")


Total chunks vector & stored: 17598


In [2]:
def _to_snake_case(column_name):
    """Trim a column label, lower-case it and replace spaces with underscores."""
    return column_name.strip().lower().replace(' ', '_')


# Load the gathered-files CSV and the "Active Legal Contracts" sheet.
# "Parent Contract" is read as text rather than being type-inferred.
csv_path = 'gathered_contract_files.csv'
csv_df = pd.read_csv(csv_path)

excel_df = pd.read_excel(
    EXCEL_PATH,
    sheet_name="Active Legal Contracts",
    dtype={"Parent Contract": str},
)

# Give both tables the same snake_case column naming so they can be joined.
csv_df.columns = list(map(_to_snake_case, csv_df.columns))
excel_df.columns = list(map(_to_snake_case, excel_df.columns))

# Left join: every CSV row is kept, with Excel columns attached by contract_number.
merged_df = csv_df.merge(excel_df, on='contract_number', how='left')

def get_additional_metadata(s3_full_path, merged_df):
    """Return the populated contract fields for one file as a dict of strings.

    Args:
        s3_full_path: Value matched against the ``s3_full_path`` column.
        merged_df: DataFrame holding ``s3_full_path`` plus the contract columns
            listed below.

    Returns:
        A dict mapping column name to string value, taken from the first row
        whose ``s3_full_path`` equals the argument. Missing values are omitted;
        values exposing ``strftime`` are formatted as ``YYYY-MM-DD``. An empty
        dict is returned (after printing a notice) when no row matches.
    """
    matching_rows = merged_df[merged_df['s3_full_path'] == s3_full_path]
    if matching_rows.empty:
        print('its empty')
        return {}

    wanted_columns = [
        'parent_contract',
        'status_reason',
        'solution_line',
        'contract_title',
        'contract_requester',
        'reviewing_attorney',
        'created_on',
        'document_effective_date',
    ]

    # Only the first matching row is used; null cells are dropped up front.
    populated_fields = matching_rows[wanted_columns].iloc[0].dropna().to_dict()

    def _stringify(value):
        # Date-like objects get an ISO date; everything else is str()'d.
        if hasattr(value, 'strftime'):
            return value.strftime('%Y-%m-%d')
        return str(value)

    return {
        column: _stringify(value)
        for column, value in populated_fields.items()
        if not pd.isna(value)
    }

# Look up the contract fields for every file listed in the CSV, in row order.
enriched_metadata_list = [
    get_additional_metadata(s3_path, merged_df)
    for s3_path in csv_df['s3_full_path']
]

# One column per metadata field; rows line up positionally with csv_df.
enriched_meta_df = pd.DataFrame(enriched_metadata_list)

# Reset the CSV index so the side-by-side concat aligns on position 0..n-1.
final_df = pd.concat(
    [csv_df.reset_index(drop=True), enriched_meta_df],
    axis=1,
)

output_csv = 'gathered_contract_files_enriched.csv'
final_df.to_csv(output_csv, index=False)

print(f"Enriched CSV saved to: {output_csv}")


Enriched CSV saved to: gathered_contract_files_enriched.csv


In [19]:
def get_additional_metadata(s3_full_path, merged_df):
    """Look up the contract fields for one file and return them as strings.

    Args:
        s3_full_path: Value matched against the ``s3_full_path`` column.
        merged_df: DataFrame holding ``s3_full_path`` plus the contract columns
            listed in ``metadata_columns``.

    Returns:
        A dict of column name -> string for the first matching row. Null values
        are left out and values exposing ``strftime`` are rendered as
        ``YYYY-MM-DD``. Returns ``{}`` when no row matches or when any error
        occurs during the lookup; in both cases a message is printed.
    """
    metadata_columns = [
        'parent_contract',
        'status_reason',
        'solution_line',
        'contract_title',
        'contract_requester',
        'reviewing_attorney',
        'created_on',
        'document_effective_date',
    ]

    try:
        matches = merged_df[merged_df['s3_full_path'] == s3_full_path]
        if matches.empty:
            print(f"No metadata found for s3_full_path: {s3_full_path}")
            return {}

        # Only the first matching row is consulted.
        first_match = matches[metadata_columns].iloc[0].to_dict()

        cleaned = {}
        for column, value in first_match.items():
            if pd.isna(value):
                continue  # missing values never reach the result
            if hasattr(value, 'strftime'):
                cleaned[column] = value.strftime('%Y-%m-%d')
            else:
                cleaned[column] = str(value)
        return cleaned

    except Exception as e:
        # Best-effort lookup: report the problem and fall back to no metadata.
        print(f"Error in get_additional_metadata for {s3_full_path}: {e}")
        return {}


def list_vectors_batch(index_name, bucket_name, max_results=500, next_token=None):
    """Fetch one page of vectors (with data and metadata) from an index.

    Args:
        index_name: Name of the vector index to read.
        bucket_name: Name of the vector bucket that holds the index.
        max_results: Page size passed to the service as ``maxResults``.
        next_token: Continuation token from a previous call; omitted from the
            request when falsy.

    Returns:
        ``(vectors, next_token)`` taken from the response, with ``[]`` and
        ``None`` as the respective defaults. If the call raises, the error is
        printed and ``([], None)`` is returned, so a failure is
        indistinguishable from an empty final page for the caller.
    """
    try:
        request = dict(
            indexName=index_name,
            vectorBucketName=bucket_name,
            maxResults=max_results,
            returnData=True,
            returnMetadata=True,
        )
        if next_token:
            request['nextToken'] = next_token

        page = s3v.list_vectors(**request)
        batch = page.get('vectors', [])
        continuation = page.get('nextToken')
        return batch, continuation

    except Exception as e:
        print(f"Error in list_vectors_batch: {e}")
        return [], None


def insert_vectors_batch(index_name, bucket_name, vectors):
    """Write a batch of vectors to an index with a single ``put_vectors`` call.

    Args:
        index_name: Name of the destination vector index.
        bucket_name: Name of the vector bucket that holds the index.
        vectors: Iterable of dicts, each providing ``key``, ``data`` and
            ``metadata``; any other fields are not forwarded.

    Returns:
        The service response, or ``None`` after printing the error if building
        the request or the call itself raises.
    """
    forwarded_fields = ('key', 'data', 'metadata')
    try:
        # Forward only the three fields named above for each vector.
        entries = [
            {field: vector[field] for field in forwarded_fields}
            for vector in vectors
        ]
        return s3v.put_vectors(
            indexName=index_name,
            vectorBucketName=bucket_name,
            vectors=entries,
        )

    except Exception as e:
        print(f"Error in insert_vectors_batch: {e}")
        return None


def _fold_contract_metadata(metadata, dynamic_metadata):
    """Merge contract-level fields into one vector's metadata dict, in place.

    Scalar date / attorney / account fields from ``dynamic_metadata`` are
    collapsed into the list-valued keys ``dates``, ``attorneys`` and
    ``client_account``; the scalar originals and a few legacy keys are removed.
    """
    metadata.update(dynamic_metadata)

    for legacy_key in ('file_name', 'contract_number', 'opensearch', 's3_vectors'):
        metadata.pop(legacy_key, None)

    def present(*field_names):
        # Values of the named fields that are actually set, in the given order.
        return [
            dynamic_metadata[name]
            for name in field_names
            if dynamic_metadata.get(name) is not None
        ]

    dates = present('created_on', 'document_effective_date')
    attorneys = present('contract_requester', 'reviewing_attorney')
    accounts = present('client_account', 'parent_contract')

    # Remove the scalar fields BEFORE writing the lists. Previously the cleanup
    # ran afterwards and its key list included 'client_account', so the freshly
    # built account list was deleted and never reached the target index.
    for scalar_key in ('created_on', 'document_effective_date', 'contract_requester',
                       'reviewing_attorney', 'client_account', 'parent_contract'):
        metadata.pop(scalar_key, None)

    metadata['dates'] = dates
    metadata['attorneys'] = attorneys
    # Written only when non-empty, so vectors without any account keep exactly
    # the metadata shape they were given before this fix.
    if accounts:
        metadata['client_account'] = accounts


def migrate_vectors_with_metadata_update(source_index, target_index, bucket_name, df, batch_size=500):
    """Copy every vector from one index to another, enriching its metadata.

    For each page returned by ``list_vectors_batch`` the vector's ``s3_path``
    metadata value (stripped) is looked up with ``get_additional_metadata`` and
    the result is folded into the vector's metadata; the page is then written
    to ``target_index`` with ``insert_vectors_batch``.

    Args:
        source_index: Index to read vectors from.
        target_index: Index to write the enriched vectors to.
        bucket_name: Vector bucket containing both indexes.
        df: DataFrame passed through to ``get_additional_metadata``.
        batch_size: Page size for listing (and therefore for each insert).

    Returns:
        None. Progress and problems are reported with ``print``. A vector whose
        processing raises is reported and still included in the insert with
        whatever metadata it had at that point.
    """
    next_token = None
    failed_batches = 0

    while True:
        vectors, next_token = list_vectors_batch(source_index, bucket_name, batch_size, next_token)
        if not vectors:
            # Also reached when list_vectors_batch swallowed an error.
            print("No vectors found in this batch. Ending migration.")
            break

        for vec in vectors:
            try:
                if vec.get('metadata') is None:
                    vec['metadata'] = {}

                s3_path = vec['metadata'].get('s3_path', '').strip()

                if s3_path:
                    dynamic_metadata = get_additional_metadata(s3_path, df)
                    _fold_contract_metadata(vec['metadata'], dynamic_metadata)
                else:
                    print(f"No s3_path found in vector metadata: {vec.get('metadata')}")

            except Exception as e:
                # The identifier field is 'key' (the field insert_vectors_batch
                # forwards); the old lookup of 'vectorKey' always printed None.
                print(f"Error processing vector {vec.get('key')}: {e}")

        response = insert_vectors_batch(target_index, bucket_name, vectors)

        if response is not None:
            print(f"Inserted batch of {len(vectors)} vectors to {target_index}")
        else:
            failed_batches += 1
            print("Failed to insert batch to target index.")

        if not next_token:
            if failed_batches:
                print(f"Finished paging, but {failed_batches} batch(es) failed to insert.")
            else:
                print("Completed migrating all vectors.")
            break


# Reload the enriched CSV (parent_contract read as text) and migrate every
# vector from SOURCE_INDEX into TARGET_INDEX with the extra metadata.
df = pd.read_csv(
    'gathered_contract_files_enriched.csv',
    dtype={"parent_contract": str},
)
migrate_vectors_with_metadata_update(
    source_index=SOURCE_INDEX,
    target_index=TARGET_INDEX,
    bucket_name=VECTOR_BUCKET_NAME,
    df=df,
)

Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted batch of 500 vectors to token-chunking-metadata-enriched
Inserted b

In [21]:
import numpy as np
EMBEDINNGS_URL = "https://zgggzg2iqg.execute-api.us-east-1.amazonaws.com/dev/get_embeddings"

# SECURITY: the API key must not live in the notebook. It is read from the
# EMBEDDINGS_API_KEY environment variable instead. The key that used to be
# hard-coded on this line should be treated as exposed and rotated.
API_KEY = os.environ.get("EMBEDDINGS_API_KEY", "")
if not API_KEY:
    print("[WARN] EMBEDDINGS_API_KEY is not set; requests to the embeddings endpoint will be sent without a valid key.")

def get_text_embedding(texts, model='e5_mistral_embed_384', timeout=30):
    """Request an embedding for each text from the embeddings HTTP endpoint.

    One POST is issued per text to ``EMBEDINNGS_URL`` with the ``x-api-key``
    header set to ``API_KEY``. The response JSON is expected to carry a
    ``body`` string that itself decodes to JSON with an ``embeddings`` list of
    length one.

    Args:
        texts: A single string or a non-empty list of strings.
        model: Value sent as ``model_name`` in the request payload.
        timeout: Seconds passed to ``requests.post`` so a stalled endpoint
            cannot block the notebook indefinitely.

    Returns:
        For a single text, its embedding as a list of floats (rounded through
        float32), or ``None`` if that request failed. For several texts, a list
        with one such entry per text, in input order.

    Raises:
        ValueError: If ``texts`` is not a string or a non-empty list, or if any
            item in the list is not a string.
    """
    if isinstance(texts, str):
        texts = [texts]

    if not isinstance(texts, list) or not texts:
        raise ValueError("Input 'texts' must be a non-empty list of strings.")

    # Identical for every request, so build it once.
    headers = {"x-api-key": API_KEY}
    embeddings = []

    for text in texts:
        if not isinstance(text, str):
            raise ValueError("Each item in 'texts' must be a string.")

        payload = {
            "model_name": model,
            "texts": [text]
        }

        try:
            response = requests.post(
                EMBEDINNGS_URL, json=payload, headers=headers, timeout=timeout
            )
            response.raise_for_status()

            # The useful payload is a JSON string nested under 'body'.
            raw_body = response.json().get('body')
            parsed_body = json.loads(raw_body)

            embedding = parsed_body.get('embeddings')
            if not embedding or not isinstance(embedding, list) or len(embedding) != 1:
                raise KeyError("No valid embedding found in response")

            # Round-trip through float32 so the values match what the vector
            # store query sends under its 'float32' key.
            embedding_float32 = np.array(embedding[0], dtype=np.float32).tolist()
            embeddings.append(embedding_float32)
        except Exception as e:
            # Report why the call failed, but keep the (possibly confidential)
            # input text out of the log.
            print(f"[ERROR] Failed to get embedding! ({type(e).__name__}: {e})")
            embeddings.append(None)

    return embeddings[0] if len(embeddings) == 1 else embeddings


def query_s3_vector_store(query_text, client_account_filter, INDEX_NAME, top_k = 5):
    """Embed a question and run a top-k similarity query against an index.

    Args:
        query_text: Text passed to ``get_text_embedding``.
        client_account_filter: When not ``None``, results are restricted with
            an ``$eq`` filter on the ``client_account`` metadata key.
        INDEX_NAME: Name of the index inside ``VECTOR_BUCKET_NAME`` to query.
        top_k: Number of nearest vectors to request.

    Returns:
        The ``query_vectors`` response (requested with metadata and distances),
        or ``None`` if the text could not be embedded or the query raised. Both
        failure cases print a message.
    """
    print(f"\n--- Processing Query: '{query_text}' ---")

    query_embedding = get_text_embedding(query_text)
    if query_embedding is None:
        # get_text_embedding returns None when its request failed; there is
        # nothing meaningful to send to the vector store in that case.
        print("Error querying S3 Vector Store: could not embed the query text.")
        return None

    request = {
        'vectorBucketName': VECTOR_BUCKET_NAME,
        'indexName': INDEX_NAME,
        'topK': top_k,
        'queryVector': {'float32': query_embedding},
        'returnMetadata': True,
        'returnDistance': True,
    }
    # Only send a filter when one was asked for, rather than an explicit None.
    if client_account_filter is not None:
        request['filter'] = {
            "client_account": {
                "$eq": client_account_filter
            }
        }

    try:
        return s3v.query_vectors(**request)
    except Exception as e:
        print(f"Error querying S3 Vector Store: {e}")
        return None

# Sample questions; the trailing number on each line is presumably the contract
# the question targets (TODO confirm).
user_questions = [
    "What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?", #66179
    "Under Schedule C, what services is Cotiviti required to provide?", #66179
    "In the Prepay FWAV Services section, what are Cotiviti's main deliverables?", #66179
    "What restrictions are placed on disclosing confidential information?", #67566
    "What is Amendment #4 to the Verisk Health License Agreement about?", #53985
    "What is the purpose of Amendment #4 as stated in the document?", #53985
]

INDEX_NAME = "token-chunking-metadata-enriched"
client = None #'UST Global'

# Run every question against the index and print the metadata of each hit.
for question in user_questions:
    query_results = query_s3_vector_store(question, client, INDEX_NAME, top_k=5)

    if not query_results or 'vectors' not in query_results:
        print(f"No results or an error occurred for query: '{question}'")
        continue

    matches = query_results['vectors']
    print(f"Retrieved {len(matches)}")
    for position, match in enumerate(matches, start=1):
        print(f"    Metadata {position}: {match['metadata']}")



--- Processing Query: 'What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?' ---
Retrieved 5
    Metadata 1: {'status_reason': 'Fully-Executed/Complete', 'text': '1  \n  \n   \n \n \n \nSTATEMENT OF WORK  \n \nPrepared for  \n \nCotiviti, Inc.', 's3_path': 's3://ml-legal-restricted/contract-docs/58295/Dynamics_58295__Please_review_and_sign_your_d_signed.pdf', 'contract_title': 'SOW to MSA (12/2/2019)', 'attorneys': ['Natalie Wise', 'Mahalakshmi Vyakaranam'], 'dates': ['2019-12-10', '2020-01-01'], 'document_type': 'SOW'}
    Metadata 2: {'dates': ['2019-12-10', '2020-01-01'], 'attorneys': ['Natalie Wise', 'Mahalakshmi Vyakaranam'], 'document_type': 'SOW', 'status_reason': 'Fully-Executed/Complete', 'contract_title': 'SOW to MSA (12/2/2019)', 's3_path': 's3://ml-legal-restricted/contract-docs/58295/ATKOTT Inc - Cotiviti - SOW - MV - 01102020 - Dynamics_58295.pdf', 'text': '1  \n  \n   \n \n \n \nSTATEMENT OF WORK  \n \nPrepared for  \n \nCotiviti, Inc.'}
    Me