In [1]:
# !pip install --upgrade boto3==1.39.8
import io
import os
import random
import nltk
import boto3
import requests
import json
from io import BytesIO
import tempfile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# --- Storage locations and vector index names shared by the cells below ---
S3_BUCKET = 'ml-legal-restricted'
EXCEL_PATH = 'Active Legal Contracts 8-1-2025 10-54-06 AM.xlsx'
VECTOR_BUCKET_NAME = 'legal-docs-vector-store'
SOURCE_INDEX = "token-chunking-valid"
TARGET_INDEX = "token-chunking-valid-modified"

# --- NLTK tokenizer resources ---
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# --- AWS clients reused throughout the notebook ---
s3 = boto3.client("s3")
s3v = boto3.client('s3vectors', region_name='us-east-1')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
import csv

input_file = 'merged_files_data.csv'
output_file = 'files_per_client_summary.csv'

# Maps account_name -> account_type; the first type seen for a name is kept.
unique_accounts = {}

with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
    for row in csv.DictReader(infile):
        account_name = row.get('account_name')
        account_type = row.get('account_type')

        # Rows where either field is missing or empty are ignored.
        if not (account_name and account_type):
            continue
        unique_accounts.setdefault(account_name, account_type)

with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['account_name', 'account_type'])
    # One output row per distinct account, in first-seen order.
    writer.writerows(unique_accounts.items())

print(f"Unique accounts written to '{output_file}'")


Unique accounts written to 'files_per_client_summary.csv'


In [None]:
import pandas as pd
from urllib.parse import urlparse, urlunparse, unquote
import posixpath

# Workbook that is read below, and the sheet inside it that holds the contract rows.
EXCEL_KEY = "Active Legal Contracts 8-1-2025 10-54-06 AM.xlsx"
SHEET_NAME = "Active Legal Contracts"
# CSV_FILE is the input CSV read below; CSV_VALID is where rows that pass the
# file-name check are written at the end of this cell.
CSV_FILE = "gathered_contract_files_enriched.csv"
CSV_VALID = "gathered_contract_files_valid.csv"

def normalize_url(u: str) -> str:
    """
    Canonicalize a URL so that cosmetic differences do not prevent equality matches.

    Steps applied:
    - surrounding whitespace is removed
    - scheme and host are lowercased
    - percent-escapes in the path are decoded (e.g. %20 becomes a space)
    - trailing slashes are removed from the path, except for a bare "/" path

    Non-string or blank input yields "". If parsing fails, the stripped
    input is returned unchanged.
    """
    cleaned = u.strip() if isinstance(u, str) else ""
    if cleaned == "":
        return ""

    try:
        parts = urlparse(cleaned)
        decoded_path = unquote(parts.path or "")
        # A single-character path (just "/") is left alone.
        if len(decoded_path) > 1:
            decoded_path = decoded_path.rstrip("/")
        rebuilt = (
            (parts.scheme or "").lower(),
            (parts.netloc or "").lower(),
            decoded_path,
            parts.params,
            parts.query,
            parts.fragment,
        )
        return urlunparse(rebuilt)
    except Exception:
        return cleaned

# Read the contracts sheet as text and keep only rows whose Document URL is non-blank.
df_xl = pd.read_excel(EXCEL_KEY, sheet_name=SHEET_NAME, dtype=str)
doc_url_column = df_xl['Document URL']
mask = doc_url_column.notna() & doc_url_column.str.strip().ne("")
df_xl = df_xl[mask].copy()

# Read the gathered-files CSV; it must expose a document_url column.
df_csv = pd.read_csv(CSV_FILE, dtype=str)
if 'document_url' not in df_csv.columns:
    raise KeyError("CSV is missing 'document_url' column")

# Normalize both URL columns the same way so they can be compared by equality.
df_csv['document_url'] = df_csv['document_url'].fillna('').astype(str).map(normalize_url)
df_xl['__norm_url'] = df_xl['Document URL'].map(normalize_url)

excel_urls = set(df_xl['__norm_url'])
csv_urls = set(df_csv['document_url'])
matched_urls = excel_urls.intersection(csv_urls)

print(f"Excel non-empty rows: {len(df_xl)}")
print(f"Unique Excel URLs (normalized): {len(excel_urls)}")
print(f"Unique CSV URLs (normalized): {len(csv_urls)}")
print(f"Matches (normalized URL equality): {len(matched_urls)}")

def last_segment(u: str) -> str:
    """Return the percent-decoded final path component of a URL.

    Yields "" for non-string or blank input, for paths that end in "/",
    and when the URL cannot be parsed.
    """
    if isinstance(u, str) and u.strip():
        try:
            decoded_path = unquote(urlparse(u).path or "")
            return (posixpath.basename(decoded_path) or "").strip()
        except Exception:
            pass
    return ""

def normalize_filename(s: str) -> "tuple[str, str]":
    """Clean a file name and return it in exact and case-folded form.

    The value is stripped of surrounding whitespace; a single leading comma,
    if present, is removed and the remainder is stripped again.

    Args:
        s: The raw file name. Any non-string value (e.g. NaN or None) is
            treated as missing.

    Returns:
        A pair ``(exact, folded)`` where ``exact`` is the cleaned name and
        ``folded`` is ``exact.casefold()``. Missing input gives ``("", "")``.
    """
    # The previous annotation claimed a plain str, but callers unpack two values.
    if not isinstance(s, str):
        return "", ""
    s = s.strip()
    if s.startswith(","):
        s = s[1:].strip()
    return s, s.casefold()

if 'file_name' not in df_csv.columns:
    raise KeyError("CSV is missing 'file_name' column")

# Last path component of each (already normalized) document URL.
df_csv['__url_file'] = df_csv['document_url'].apply(last_segment)

# Build the (exact, case-folded) pairs as plain lists instead of unpacking
# zip(*...): unpacking raises ValueError when the CSV has no rows.
file_pairs = [normalize_filename(value) for value in df_csv['file_name']]
df_csv['__file_exact'] = [pair[0] for pair in file_pairs]
df_csv['__file_ci'] = [pair[1] for pair in file_pairs]

url_file_pairs = [normalize_filename(value) for value in df_csv['__url_file']]
df_csv['__url_exact'] = [pair[0] for pair in url_file_pairs]
df_csv['__url_ci'] = [pair[1] for pair in url_file_pairs]

# A row matches when its non-empty file name equals the URL's last segment,
# either exactly or ignoring case.
mask_exact = (df_csv['__file_exact'] != "") & (df_csv['__file_exact'] == df_csv['__url_exact'])
mask_ci = (df_csv['__file_ci'] != "") & (df_csv['__file_ci'] == df_csv['__url_ci'])

# Same truth table as mask_exact | (~mask_exact & mask_ci).
mask_valid = mask_exact | mask_ci

print(f"File name == URL last segment (exact): {mask_exact.sum()}")
print(f"File name == URL last segment (case-insensitive): {(mask_ci & ~mask_exact).sum()}")
print(f"Total valid matches: {mask_valid.sum()}")

# Drop the helper columns before writing the valid rows out.
valid_entries = df_csv.loc[mask_valid].drop(
    columns=['__url_file','__file_exact','__file_ci','__url_exact','__url_ci'],
    errors='ignore'
)
valid_entries.to_csv(CSV_VALID, index=False)

print(f"Valid entries saved to: {CSV_VALID}")


  warn(msg)


Excel non-empty rows: 14644
Unique Excel URLs (normalized): 14638
Unique CSV URLs (normalized): 539
Matches (normalized URL equality): 538
File name == URL last segment (exact): 506
File name == URL last segment (case-insensitive): 0
Total valid matches: 506
Cleaned CSV saved to: gathered_contract_files_enriched_new.csv
Valid entries saved to: gathered_contract_files_valid.csv


In [13]:
# Index whose stored vectors are counted; earlier candidates are kept in the trailing comment.
INDEX_NAME = 'token-chunking-vectors-poc' #'token-chunking-new-files' #'token-chunking-valid'

paginator = s3v.get_paginator('list_vectors')

total_vectors_count = 0

# Only the number of entries per page is needed, so the vector data and
# metadata are not requested (previously both were downloaded for every vector).
page_iterator = paginator.paginate(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    returnData=False,
    returnMetadata=False,
    PaginationConfig={
        'PageSize': 1000
    }
)

for page in page_iterator:
    vectors = page.get('vectors', [])
    total_vectors_count += len(vectors)

print(f"Total chunks vector & stored: {total_vectors_count}")


Total chunks vector & stored: 12591


In [None]:
def get_additional_metadata(s3_full_path, merged_df):
    """Look up extra contract metadata for one file.

    Finds the first row of ``merged_df`` whose ``s3_full_path`` column equals
    ``s3_full_path`` and returns a dict of selected columns. Missing values
    (per ``pd.isna``) are left out; values that have a ``strftime`` method are
    formatted as ``YYYY-MM-DD``; everything else is converted with ``str``.

    Returns an empty dict when no row matches or when any error occurs
    (the error is printed, not raised).
    """
    wanted_columns = [
        'parent_contract', 'status_reason', 'solution_line', 'contract_title', 'contract_requester',
        'reviewing_attorney', 'created_on', 'document_effective_date', 'account_type', 'related_product'
    ]

    try:
        matches = merged_df[merged_df['s3_full_path'] == s3_full_path]
        if matches.empty:
            print(f"No metadata found for s3_full_path: {s3_full_path}")
            return {}

        first_match = matches[wanted_columns].iloc[0].to_dict()

        cleaned = {}
        for column, value in first_match.items():
            if pd.isna(value):
                # Missing values are omitted from the result entirely.
                continue
            if hasattr(value, 'strftime'):
                cleaned[column] = value.strftime('%Y-%m-%d')
            else:
                cleaned[column] = str(value)
        return cleaned

    except Exception as e:
        print(f"Error in get_additional_metadata for {s3_full_path}: {e}")
        return {}


def list_vectors_batch(index_name, bucket_name, max_results=500, next_token=None):
    """Fetch one page of vectors (with data and metadata) from an index.

    Returns a ``(vectors, next_token)`` pair taken from the ``list_vectors``
    response. On any error the exception is printed and ``([], None)`` is
    returned instead of raising.
    """
    try:
        request = dict(
            indexName=index_name,
            vectorBucketName=bucket_name,
            maxResults=max_results,
            returnData=True,
            returnMetadata=True,
        )
        # The continuation token is only sent when the caller has one.
        if next_token:
            request['nextToken'] = next_token

        page = s3v.list_vectors(**request)
        return page.get('vectors', []), page.get('nextToken')

    except Exception as e:
        print(f"Error in list_vectors_batch: {e}")
        return [], None


def insert_vectors_batch(index_name, bucket_name, vectors):
    """Write a batch of vectors to an index with ``put_vectors``.

    Each input vector must provide ``key``, ``data`` and ``metadata``; only
    those three fields are forwarded. Returns the ``put_vectors`` response,
    or ``None`` after printing the error if anything fails.
    """
    try:
        entries = []
        for vector in vectors:
            entries.append({field: vector[field] for field in ('key', 'data', 'metadata')})

        return s3v.put_vectors(
            indexName=index_name,
            vectorBucketName=bucket_name,
            vectors=entries
        )

    except Exception as e:
        print(f"Error in insert_vectors_batch: {e}")
        return None


def migrate_vectors_with_metadata_update(source_index, target_index, bucket_name, df, batch_size=500):
    """Copy every vector from one index to another, rewriting its metadata.

    Vectors are read page by page with ``list_vectors_batch`` and written with
    ``insert_vectors_batch``. For each vector whose metadata has a non-blank
    ``s3_path``:

    * extra fields are looked up with ``get_additional_metadata`` and merged in;
    * ``file_name``, ``contract_number``, ``opensearch`` and ``s3_vectors`` are removed;
    * individual fields are regrouped into three lists, ``dates``,
      ``attorneys`` and ``account_details``, using the string ``'None'`` as a
      placeholder for anything missing, and the individual fields are removed.

    Vectors without an ``s3_path`` are reported and copied as they are.

    Args:
        source_index: Name of the index to read from.
        target_index: Name of the index to write to.
        bucket_name: Vector bucket holding both indexes.
        df: DataFrame passed to ``get_additional_metadata`` for lookups.
        batch_size: Page size requested from the source index.

    Returns:
        None. Progress and errors are printed. A failure on one vector is
        reported and that vector is still included in the batch insert.
    """
    # Removed outright from every enriched vector.
    dropped_keys = ('file_name', 'contract_number', 'opensearch', 's3_vectors')
    # Folded into the grouped list fields, then removed as individual keys.
    folded_keys = (
        'created_on', 'document_effective_date', 'contract_requester', 'reviewing_attorney',
        'client_account', 'parent_contract', 'account_type', 'related_product',
    )

    next_token = None

    while True:
        vectors, next_token = list_vectors_batch(source_index, bucket_name, batch_size, next_token)
        if not vectors:
            print("No vectors found in this batch. Ending migration.")
            break

        for vec in vectors:
            try:
                if vec.get('metadata') is None:
                    vec['metadata'] = {}
                metadata = vec['metadata']

                s3_path = metadata.get('s3_path', '').strip()
                if not s3_path:
                    print(f"No s3_path found in vector metadata: {vec.get('metadata')}")
                    continue

                dynamic_metadata = get_additional_metadata(s3_path, df)
                metadata.update(dynamic_metadata)

                for key in dropped_keys:
                    metadata.pop(key, None)

                # TODO: add the document title and decide how missing values should be handled.
                metadata['dates'] = [
                    dynamic_metadata.get('created_on', 'None'),
                    dynamic_metadata.get('document_effective_date', 'None'),
                ]
                metadata['attorneys'] = [
                    dynamic_metadata.get('contract_requester', 'None'),
                    dynamic_metadata.get('reviewing_attorney', 'None'),
                ]
                # client_account, account_type and related_product are read from the
                # merged metadata, so a value already stored on the vector is used
                # when the lookup did not supply one.
                metadata['account_details'] = [
                    metadata.get('client_account', 'None'),
                    dynamic_metadata.get('parent_contract', 'None'),
                    metadata.get('account_type', 'None'),
                    metadata.get('related_product', 'None'),
                ]

                for old_key in folded_keys:
                    metadata.pop(old_key, None)

            except Exception as e:
                # Vectors carry their identifier under 'key' (see insert_vectors_batch);
                # the previous lookup of 'vectorKey' always printed None.
                print(f"Error processing vector {vec.get('key')}: {e}")

        response = insert_vectors_batch(target_index, bucket_name, vectors)

        if response is not None:
            print(f"Inserted batch of {len(vectors)} vectors to {target_index}")
        else:
            print("Failed to insert batch to target index.")

        if not next_token:
            print("Completed migrating all vectors.")
            break


# Load the enriched contract-file table used for metadata lookups.
# parent_contract is read as text — presumably so its values are not parsed as numbers; TODO confirm.
df = pd.read_csv('gathered_contract_files_enriched.csv', dtype={"parent_contract": str})
# Copy SOURCE_INDEX into TARGET_INDEX within VECTOR_BUCKET_NAME, rewriting each vector's metadata on the way.
migrate_vectors_with_metadata_update(SOURCE_INDEX, TARGET_INDEX, VECTOR_BUCKET_NAME, df)

In [12]:
#copy entire chunk (vectors + metadata) to another vector index

def copy_s3_vectors_in_batches(
    source_index_name,
    source_bucket_name,
    dest_index_name,
    dest_bucket_name,
    batch_size=500,
    dest_index_arn=None,
):
    """Copy all vectors (key, data and metadata) from one index to another.

    The source index is read page by page with ``list_vectors`` and each page
    is written to the destination with ``put_vectors``. Copying stops at the
    first listing or insert error, which is printed rather than raised.

    Args:
        source_index_name: Index to read from.
        source_bucket_name: Vector bucket that holds the source index.
        dest_index_name: Index to write to (used when ``dest_index_arn`` is not given).
        dest_bucket_name: Vector bucket of the destination index (used when
            ``dest_index_arn`` is not given).
        batch_size: Number of vectors requested per page.
        dest_index_arn: Optional ARN of the destination index. When provided it
            identifies the destination instead of the name/bucket pair.

    Returns:
        None. Progress is printed after every batch.
    """
    # The destination used to be a hard-coded ARN, so dest_index_name and
    # dest_bucket_name were silently ignored. It now comes from the arguments.
    if dest_index_arn:
        dest_params = {'indexArn': dest_index_arn}
    else:
        dest_params = {
            'indexName': dest_index_name,
            'vectorBucketName': dest_bucket_name,
        }

    next_token = None
    total_copied = 0

    while True:
        list_params = {
            'indexName': source_index_name,
            'vectorBucketName': source_bucket_name,
            'maxResults': batch_size,
            'returnData': True,
            'returnMetadata': True,
        }
        if next_token:
            list_params['nextToken'] = next_token

        try:
            response = s3v.list_vectors(**list_params)
        except Exception as e:
            print(f"Error listing vectors: {e}")
            break

        vectors = response.get('vectors', [])
        if not vectors:
            print("No more vectors to copy.")
            break

        batch = []
        for v in vectors:
            entry = {'key': v['key'], 'data': v['data']}
            # Forward metadata only when the source vector has some, instead of
            # raising KeyError on a vector without metadata.
            if v.get('metadata') is not None:
                entry['metadata'] = v['metadata']
            batch.append(entry)

        try:
            s3v.put_vectors(vectors=batch, **dest_params)
            total_copied += len(batch)
            print(f"Copied batch of {len(batch)} vectors, total copied: {total_copied}")
        except Exception as e:
            print(f"Error inserting vectors: {e}")
            break

        next_token = response.get('nextToken')
        if not next_token:
            print(f"Completed copying all vectors, total count: {total_copied}")
            break


# The destination is addressed by ARN, exactly as the function previously did
# internally; the name/bucket arguments are given for readability only.
copy_s3_vectors_in_batches(
    source_index_name='token-chunking-vectors-poc',
    source_bucket_name=VECTOR_BUCKET_NAME,
    dest_index_name="token-chunking-vectors-poc",
    dest_bucket_name="legal-docs-vectors",
    dest_index_arn='arn:aws:s3vectors:us-east-1:254281203237:bucket/legal-docs-vectors/index/token-chunking-vectors-poc',
)


Copied batch of 500 vectors, total copied: 500
Copied batch of 500 vectors, total copied: 1000
Copied batch of 500 vectors, total copied: 1500
Copied batch of 500 vectors, total copied: 2000
Copied batch of 500 vectors, total copied: 2500
Copied batch of 500 vectors, total copied: 3000
Copied batch of 500 vectors, total copied: 3500
Copied batch of 500 vectors, total copied: 4000
Copied batch of 500 vectors, total copied: 4500
Copied batch of 500 vectors, total copied: 5000
Copied batch of 500 vectors, total copied: 5500
Copied batch of 500 vectors, total copied: 6000
Copied batch of 500 vectors, total copied: 6500
Copied batch of 500 vectors, total copied: 7000
Copied batch of 500 vectors, total copied: 7500
Copied batch of 500 vectors, total copied: 8000
Copied batch of 500 vectors, total copied: 8500
Copied batch of 500 vectors, total copied: 9000
Copied batch of 500 vectors, total copied: 9500
Copied batch of 500 vectors, total copied: 10000
Copied batch of 500 vectors, total copie

In [None]:
#merge two csv's
import pandas as pd

# Load the two validated file lists that are to be combined.
df1 = pd.read_csv('gathered_contract_files_valid_modified.csv')
df2 = pd.read_csv('gathered_contract_files_valid_new.csv')

# Restrict and reorder the second frame to the first frame's columns;
# this raises KeyError if the second file lacks any of them.
df2 = df2.loc[:, df1.columns]

# Stack the rows with a fresh 0..n-1 index and persist the result.
merged_df = pd.concat((df1, df2), ignore_index=True)
merged_df.to_csv('merged_files_data.csv', index=False)


In [None]:
#verify metadata by running the flow
 
import numpy as np
import os

# Endpoint that get_text_embedding posts to.
EMBEDINNGS_URL = "https://zgggzg2iqg.execute-api.us-east-1.amazonaws.com/dev/get_embeddings"

# SECURITY: the API key used to be hard-coded here. It is now read from the
# EMBEDDINGS_API_KEY environment variable so the secret is not stored in the
# notebook. The key that was previously committed should be rotated.
API_KEY = os.environ.get("EMBEDDINGS_API_KEY", "")
if not API_KEY:
    print("[WARN] EMBEDDINGS_API_KEY is not set; embedding requests will be sent without a valid API key.")

def _extract_embeddings_obj(obj):

    if isinstance(obj, dict) and "embeddings" in obj:
        return obj["embeddings"]

    # Case B: wrapper with body string
    if isinstance(obj, dict) and "body" in obj:
        try:
            body = obj["body"]
            if isinstance(body, str):
                inner = json.loads(body)
            else:
                inner = body
            if isinstance(inner, dict) and "embeddings" in inner:
                return inner["embeddings"]
        except Exception:
            pass

    raise KeyError("No 'embeddings' found in response object")

def get_text_embedding(texts, model='e5_mistral_embed_384', timeout=8):
    """Fetch an embedding for each input text from the embeddings HTTP endpoint.

    Every text is sent in its own POST request to ``EMBEDINNGS_URL`` with the
    ``x-api-key`` header set from ``API_KEY``.

    Args:
        texts: A single string or a non-empty list of strings.
        model: Value sent as ``model_name`` in the request payload.
        timeout: Timeout passed to ``requests.post``.

    Returns:
        When exactly one text was processed, that text's embedding (a list of
        floats) or ``None`` if it failed. Otherwise a list with one entry per
        text, where a failed text contributes ``None``.

    Raises:
        ValueError: If ``texts`` is not a string or non-empty list, or if any
            item in the list is not a string.
    """
    if isinstance(texts, str):
        texts = [texts]
    if not isinstance(texts, list) or not texts:
        raise ValueError("Input 'texts' must be a non-empty list of strings.")

    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json"
    }

    out = []
    for text in texts:
        if not isinstance(text, str):
            raise ValueError("Each item in 'texts' must be a string.")

        payload = {"model_name": model, "texts": [text]}

        # Reset per iteration so the error path never reports the response of
        # an earlier request (or touches an unbound name on the first one).
        resp = None
        try:
            resp = requests.post(EMBEDINNGS_URL, json=payload, headers=headers, timeout=timeout)
            resp.raise_for_status()

            # Accept both response shapes handled by _extract_embeddings_obj.
            obj = resp.json()
            embeddings = _extract_embeddings_obj(obj)

            if (not isinstance(embeddings, list)) or len(embeddings) != 1 or (not isinstance(embeddings[0], list)):
                raise KeyError("Response did not contain a single embedding vector")

            # Round-trip through float32 for consistency / memory.
            vec = np.array(embeddings[0], dtype=np.float32).tolist()
            out.append(vec)

        except Exception as e:
            print(f"[ERROR] Failed to get embedding for '{text}': {e}")
            # Only show HTTP diagnostics when this request actually got a response.
            if resp is not None:
                print(f"[DEBUG] HTTP {resp.status_code} body: {resp.text[:500]}")
            out.append(None)

    return out[0] if len(out) == 1 else out

# Smoke test: request an embedding for one sample string and print whatever comes back.
print(get_text_embedding(['Taher Hellot']))



def query_s3_vector_store(query_text, client_account_filter, INDEX_NAME, top_k = 5):
    """Embed a query and run a nearest-neighbour search against a vector index.

    Args:
        query_text: The question to embed with ``get_text_embedding``.
        client_account_filter: When not None, results are restricted to vectors
            whose ``client_account`` metadata equals this value.
        INDEX_NAME: Name of the index inside ``VECTOR_BUCKET_NAME`` to query.
        top_k: Number of results requested.

    Returns:
        The ``query_vectors`` response, or ``None`` if the embedding could not
        be computed or the query failed (the reason is printed).
    """
    print(f"\n--- Processing Query: '{query_text}' ---")

    query_embedding = get_text_embedding(query_text)
    if query_embedding is None:
        # get_text_embedding reports failures as None; don't forward that to the service.
        print("Error querying S3 Vector Store: no embedding could be computed for the query text")
        return None

    query_kwargs = {
        'vectorBucketName': VECTOR_BUCKET_NAME,
        'indexName': INDEX_NAME,
        'topK': top_k,
        'queryVector': {'float32': query_embedding},
        'returnMetadata': True,
        'returnDistance': True,
    }
    # The filter is optional, so it is only sent when one was requested.
    if client_account_filter is not None:
        query_kwargs['filter'] = {
            "client_account": {
                "$eq": client_account_filter
            }
        }

    try:
        return s3v.query_vectors(**query_kwargs)
    except Exception as e:
        print(f"Error querying S3 Vector Store: {e}")
        return None

# Sample questions; the trailing number on each line is kept from the original notes.
user_questions = [
    "What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?", #66179
    "Under Schedule C, what services is Cotiviti required to provide?", #66179
    "In the Prepay FWAV Services section, what are Cotiviti's main deliverables?", #66179
    "What restrictions are placed on disclosing confidential information?", #67566
    "What is Amendment #4 to the Verisk Health License Agreement about?", #53985
    "What is the purpose of Amendment #4 as stated in the document?", #53985
]

INDEX_NAME = "token-chunking-vectors-poc"
client = None #'UST Global'

for question in user_questions:
    query_results = query_s3_vector_store(question, client, INDEX_NAME, top_k=5)

    # Nothing usable came back: report it and move on to the next question.
    if not query_results or 'vectors' not in query_results:
        print(f"No results or an error occurred for query: '{question}'")
        continue

    retrieved = query_results['vectors']
    print(f"Retrieved {len(retrieved)}")
    for position, chunk_data in enumerate(retrieved, start=1):
        print(f"    Metadata {position}: {chunk_data['metadata']}")


In [None]:
# Side task: total the EBS volume size of SageMaker spaces per domain (for a billing check)

import boto3

REGION = "us-east-1"  # Change if needed
sm = boto3.client("sagemaker", region_name=REGION)

grand_total_gb = 0
domain_totals = []

# 1. List all domains. list_domains is paginated like list_spaces; a single
#    un-paginated call would silently miss domains beyond the first page.
domains = []
for domain_page in sm.get_paginator("list_domains").paginate():
    domains.extend(domain_page.get("Domains", []))

# The same paginator object is reused for every domain.
paginator = sm.get_paginator("list_spaces")

for d in domains:
    domain_id = d["DomainId"]
    domain_name = d["DomainName"]

    domain_total_gb = 0
    spaces = []

    # 2. Paginate over all spaces in the domain
    for page in paginator.paginate(DomainIdEquals=domain_id):
        for s in page.get("Spaces", []):
            # Spaces without EBS storage settings count as 0 GB.
            size_gb = (
                s.get("SpaceSettingsSummary", {})
                 .get("SpaceStorageSettings", {})
                 .get("EbsStorageSettings", {})
                 .get("EbsVolumeSizeInGb", 0)
            )
            domain_total_gb += size_gb
            spaces.append({
                "SpaceName": s.get("SpaceName"),
                "Status": s.get("Status"),
                "AppType": s.get("SpaceSettingsSummary", {}).get("AppType"),
                "EbsVolumeSizeInGb": size_gb
            })

    grand_total_gb += domain_total_gb
    domain_totals.append({
        "DomainName": domain_name,
        "DomainId": domain_id,
        "TotalGB": domain_total_gb,
        "Spaces": spaces
    })

# 3. Print results, largest spaces first within each domain
for d in domain_totals:
    print(f"\nDomain: {d['DomainName']} ({d['DomainId']})")
    print(f"  Total gp3 volume: {d['TotalGB']} GB")
    for s in sorted(d["Spaces"], key=lambda x: x["EbsVolumeSizeInGb"], reverse=True):
        print(f"    - {s['SpaceName']}: {s['EbsVolumeSizeInGb']} GB (status={s['Status']}, app={s['AppType']})")

print(f"\n==== GRAND TOTAL gp3 volume across all domains: {grand_total_gb} GB ====")


In [None]:
# List, create, or delete vector indexes

s3v = boto3.client("s3vectors", region_name="us-east-1")

# The vector bucket is addressed by ARN for both the listing and the creation call.
_VECTOR_BUCKET_ARN = 'arn:aws:s3vectors:us-east-1:254281203237:bucket/legal-docs-vectors'

# Show the indexes that currently exist in the bucket.
response = s3v.list_indexes(vectorBucketArn=_VECTOR_BUCKET_ARN)
print(response)

# Settings for the index to create.
INDEX_NAME = 'token-chunking-vectors-poc'
VECTOR_DIM = 384
DISTANCE_METRIC = "cosine"
NON_FILTERABLE_KEYS = ['text']

# To delete an index instead, uncomment:
# response = s3v.delete_index(
#     vectorBucketName=VECTOR_BUCKET_NAME,
#     indexName=INDEX_NAME,
# )

_index_settings = dict(
    vectorBucketArn=_VECTOR_BUCKET_ARN,
    indexName=INDEX_NAME,
    dataType="float32",
    dimension=VECTOR_DIM,
    distanceMetric=DISTANCE_METRIC,
    metadataConfiguration={"nonFilterableMetadataKeys": NON_FILTERABLE_KEYS},
)
response = s3v.create_index(**_index_settings)

print(f"Created index: {response}")

{'ResponseMetadata': {'RequestId': '0891ea5e-35bc-4b90-ad7d-191136b79571', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Tue, 12 Aug 2025 16:48:16 GMT', 'content-type': 'application/json', 'content-length': '14', 'connection': 'keep-alive', 'x-amz-request-id': '0891ea5e-35bc-4b90-ad7d-191136b79571', 'access-control-allow-origin': '*', 'vary': 'origin, access-control-request-method, access-control-request-headers', 'access-control-expose-headers': '*'}, 'RetryAttempts': 0}, 'indexes': []}
Created index: {'ResponseMetadata': {'RequestId': '6c887001-74b8-4ed2-bd46-f04560647c0a', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Tue, 12 Aug 2025 16:48:16 GMT', 'content-type': 'application/json', 'content-length': '2', 'connection': 'keep-alive', 'x-amz-request-id': '6c887001-74b8-4ed2-bd46-f04560647c0a', 'access-control-allow-origin': '*', 'vary': 'origin, access-control-request-method, access-control-request-headers', 'access-control-expose-headers': '*'}, 'Ret