In [None]:
# !pip install --upgrade boto3==1.39.8
# !sudo apt install antiword 
import io
import uuid
import os
import random
import nltk
import boto3
import requests
import json
import pytesseract
import textract
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel
import extract_msg
import tempfile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from sentence_transformers import SentenceTransformer

# embed_model = AutoModel.from_pretrained("intfloat/e5-small-v2")

# Bucket the contract documents are downloaded from (passed to process_documents).
S3_BUCKET = "ml-legal-restricted"
# Local workbook read by read_s3_keys_from_excel; matches the default output
# file name of process_entire_bucket.
EXCEL_PATH = "full_contracts_with_files.xlsx"
# S3 Vectors bucket that holds the vector indexes created/queried below.
VECTOR_BUCKET_NAME = "legal-docs-vector-store"
# Endpoint called by get_text_embedding.
# NOTE(review): the constant name is misspelled ("EMBEDINNGS"); renaming it
# requires updating get_text_embedding in the same change.
EMBEDINNGS_URL = "https://zgggzg2iqg.execute-api.us-east-1.amazonaws.com/dev/get_embeddings"
# NOTE(review): credential is hard-coded (redacted here); consider loading it
# from the environment or a secrets manager instead of the notebook source.
API_KEY = "*******"

# Tokenizer data used by nltk.tokenize.sent_tokenize in the chunkers.
nltk.download('punkt')
nltk.download('punkt_tab')
# Shared AWS clients used by every cell below.
s3 = boto3.client('s3')
s3v = boto3.client("s3vectors", region_name="us-east-1")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def download_excel(bucket, key, sheet_name="Active Legal Contracts", column="Contract Number"):
    """Fetch a workbook from S3 and pull the contract numbers out of one sheet.

    Args:
        bucket: S3 bucket that holds the workbook.
        key: Object key of the workbook.
        sheet_name: Sheet to parse.
        column: Column that contains the contract numbers.

    Returns:
        A ``(contract_numbers, dataframe, raw_bytes)`` tuple where
        ``contract_numbers`` is the non-null column content as stripped strings.

    Raises:
        ValueError: If ``column`` is not present in the sheet.
    """
    print(f"📥 Downloading Excel: s3://{bucket}/{key}")
    excel_data = s3.get_object(Bucket=bucket, Key=key)['Body'].read()

    df = pd.read_excel(io.BytesIO(excel_data), sheet_name=sheet_name, engine='openpyxl')
    if column in df.columns:
        cleaned = df[column].dropna().astype(str).str.strip()
        return cleaned.tolist(), df, excel_data

    raise ValueError(f"Column '{column}' not found in sheet '{sheet_name}'")


def list_s3_files_for_contract(bucket, contract_number, prefix_base="contract-docs/"):
    """Return every object key stored under ``<prefix_base><contract_number>/``.

    All result pages of ``list_objects_v2`` are walked, so the list is complete
    regardless of how many objects the prefix contains.
    """
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket,
        Prefix=f"{prefix_base}{contract_number}/",
    )
    return [entry["Key"] for page in pages for entry in page.get("Contents", [])]


def add_modified_sheet_with_files(wb, original_df, file_map):
    """Append a sheet that repeats the source rows plus their file names.

    Args:
        wb: Workbook to extend (must offer ``create_sheet``).
        original_df: DataFrame of the original sheet; needs a
            ``"Contract Number"`` column.
        file_map: Mapping of contract number (string) to a list of S3 keys.

    Each data row is the original row followed by the base names of the files
    found for that contract; the header gets one ``File N`` column per slot of
    the longest file list.
    """
    sheet = wb.create_sheet("Active Legal Contracts + Files")

    widest = 0
    for keys in file_map.values():
        widest = max(widest, len(keys))

    file_headers = [f"File {n}" for n in range(1, widest + 1)]
    sheet.append(original_df.columns.tolist() + file_headers)

    for _, record in original_df.iterrows():
        lookup = str(record["Contract Number"]).strip()
        names = [os.path.basename(key) for key in file_map.get(lookup, [])]
        sheet.append(record.tolist() + names)


def add_s3_paths_sheet(wb, file_map, bucket):
    """Append an ``S3 File Paths`` sheet listing full ``s3://`` URIs per contract.

    Args:
        wb: Workbook to extend (must offer ``create_sheet``).
        file_map: Mapping of contract number to a list of S3 keys.
        bucket: Bucket name used to build each ``s3://bucket/key`` URI.
    """
    sheet = wb.create_sheet("S3 File Paths")
    column_count = max(map(len, file_map.values()), default=0)

    header_row = ["Contract Number"]
    header_row.extend(f"S3 File {n + 1}" for n in range(column_count))
    sheet.append(header_row)

    for contract, keys in file_map.items():
        line = [contract]
        for key in keys:
            line.append(f"s3://{bucket}/{key}")
        sheet.append(line)

def save_missing_contracts_to_csv(missing_contracts, output_path="missing_contracts.csv"):
    """Write the contract numbers without files to a one-column CSV.

    Nothing is written when ``missing_contracts`` is empty; a status line is
    printed in both cases.
    """
    if missing_contracts:
        frame = pd.DataFrame(missing_contracts, columns=["Contract Number"])
        frame.to_csv(output_path, index=False)
        print(f"📄 Missing contracts CSV saved to: {output_path}")
    else:
        print("✅ No missing contracts to save.")
    
def process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx"):
    """Match every contract number in the workbook against files in S3.

    The source workbook is downloaded, each contract number is looked up under
    its S3 prefix (retrying with an 8-digit zero-padded number for short numeric
    values), and the workbook is saved locally with two extra sheets: the
    original rows plus file names, and the full S3 paths. Contracts without any
    file are reported and written to a CSV.

    Args:
        bucket: S3 bucket holding both the workbook and the contract files.
        excel_key: Object key of the source workbook.
        output_path: Local path of the augmented workbook.
    """
    contract_numbers, original_df, excel_bytes = download_excel(bucket, excel_key)
    total = len(contract_numbers)

    file_map, missing_contracts = {}, []

    print(f"\n🔍 Scanning {total} contract numbers across S3...")
    for position, number in enumerate(contract_numbers, start=1):
        # Progress line for the first contract and then every 1000th.
        if position == 1 or position % 1000 == 0:
            print(f"🔢 [{position}/{total}] Scanning: {number}")
        found = list_s3_files_for_contract(bucket, number)

        needs_padding = not found and number.isdigit() and len(number) < 8
        if needs_padding:
            padded = number.zfill(8)
            print(f"   ➕ Retrying with padded contract number: {padded}")
            found = list_s3_files_for_contract(bucket, padded)

        if not found:
            missing_contracts.append(number)
            continue
        # Keyed by the number as it appears in the workbook, even when the
        # files were located under the padded prefix.
        file_map[number] = found

    print("\n🧾 Preparing final Excel workbook...")
    wb = load_workbook(io.BytesIO(excel_bytes))
    add_modified_sheet_with_files(wb, original_df, file_map)
    add_s3_paths_sheet(wb, file_map, bucket)
    wb.save(output_path)
    print(f"✅ Final Excel saved: {output_path}")

    print("\n=== Final Summary ===")
    print(f"📄 Total contracts processed: {total}")
    print(f"✅ Contracts with files: {len(file_map)}")
    print(f"❌ Contracts with NO files found: {len(missing_contracts)}")
    if not missing_contracts:
        return

    suffix = '...' if len(missing_contracts) > 5 else ''
    print(f"🔍 Sample missing contract numbers: {missing_contracts[:5]}{suffix}")
    save_missing_contracts_to_csv(missing_contracts)

# Cell entry point: scan the bucket for every contract listed in the source
# workbook and write the augmented workbook to the working directory.
if __name__ == "__main__":
    bucket = "ml-legal-restricted"
    excel_key = "tabularData/Active Legal Contracts 7-10-2025 1-17-09 PM.xlsx"
    process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx")


In [4]:
def get_text_embedding(texts, model='e5_mistral_embed_384', timeout=60):
    """Embed one or more texts through the embeddings HTTP endpoint.

    Each text is sent in its own request. A text whose request fails is
    reported on stdout and represented by ``None`` in the result, so one bad
    text does not abort the batch.

    Args:
        texts: A single string or a non-empty list of strings.
        model: Model name forwarded to the endpoint as ``model_name``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        When exactly one text was supplied (as a string or a one-item list):
        that text's embedding, or ``None`` on failure. Otherwise a list with
        one embedding-or-``None`` entry per input text, in input order.

    Raises:
        ValueError: If ``texts`` is empty, not a string/list, or contains a
            non-string item. Validation happens before any request is sent.
    """
    if isinstance(texts, str):
        texts = [texts]

    if not isinstance(texts, list) or not texts:
        raise ValueError("Input 'texts' must be a non-empty list of strings.")
    if not all(isinstance(text, str) for text in texts):
        raise ValueError("Each item in 'texts' must be a string.")

    headers = {"x-api-key": API_KEY}
    embeddings = []

    for text in texts:
        payload = {"model_name": model, "texts": [text]}
        try:
            # Without a timeout a stalled connection would hang the whole run.
            response = requests.post(
                EMBEDINNGS_URL, json=payload, headers=headers, timeout=timeout
            )
            response.raise_for_status()

            raw_body = response.json().get('body')
            if raw_body is None:
                raise KeyError("response has no 'body' field")
            # The endpoint returns the body as a JSON-encoded string.
            parsed_body = json.loads(raw_body) if isinstance(raw_body, str) else raw_body

            embedding = parsed_body.get('embeddings')
            if not isinstance(embedding, list) or len(embedding) != 1:
                raise KeyError("response does not contain exactly one embedding")

            embeddings.append(embedding[0])
        except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError) as e:
            # The document text is deliberately kept out of the log line;
            # only the failure cause is reported.
            print(f"[ERROR] Failed to get embedding! ({type(e).__name__}: {e})")
            embeddings.append(None)

    return embeddings[0] if len(embeddings) == 1 else embeddings


In [5]:
# Re-create the vector index from scratch: the existing index named INDEX_NAME
# is deleted and an empty one with the same name is created.
# NOTE(review): this discards every vector already stored in the index, and
# delete_index presumably fails when the index does not exist yet - confirm
# before running on a fresh bucket.

# Index names double as chunking-strategy names (see process_documents).
INDEX_NAME = "token-chunking"
# INDEX_NAME = "overlap-chunking"
# INDEX_NAME = "semantic-split-chunking"
# Must equal the length of the vectors returned by get_text_embedding.
VECTOR_DIM = 384
DISTANCE_METRIC = "cosine"
# The chunk text is stored with each vector but excluded from filtering.
NON_FILTERABLE_KEYS = ['text']

response = s3v.delete_index(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
)

response = s3v.create_index(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    dataType="float32",
    dimension=VECTOR_DIM,
    distanceMetric=DISTANCE_METRIC,
    metadataConfiguration={
        "nonFilterableMetadataKeys": NON_FILTERABLE_KEYS
    }
)

print(f"Created index: {response}")


Created index: {'ResponseMetadata': {'RequestId': '75e91389-acf4-4486-8cd8-834aa50348ce', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Wed, 23 Jul 2025 19:37:19 GMT', 'content-type': 'application/json', 'content-length': '2', 'connection': 'keep-alive', 'x-amz-request-id': '75e91389-acf4-4486-8cd8-834aa50348ce', 'access-control-allow-origin': '*', 'vary': 'origin, access-control-request-method, access-control-request-headers', 'access-control-expose-headers': '*'}, 'RetryAttempts': 0}}


In [10]:
def chunk_text(text, metadata, token_limit=400, tokenizer_name="intfloat/e5-small-v2"):
    """Split ``text`` into sentence-aligned chunks of at most ``token_limit`` tokens.

    Sentences are packed greedily into a chunk until the next one would exceed
    the limit. A single sentence that is itself longer than the limit (common
    in OCR output and contract boilerplate without sentence punctuation) is
    broken into word groups first, so it can no longer produce an oversized
    chunk.

    Args:
        text: Document text to split.
        metadata: Dict providing ``s3_path`` and optionally ``client_account``
            and ``document_type``; copied into every chunk.
        token_limit: Maximum tokens per chunk, measured with the tokenizer.
        tokenizer_name: Hugging Face tokenizer used for counting tokens.

    Returns:
        A list of ``{"key": <uuid4 str>, "metadata": {...}}`` dicts, one per
        chunk, with the chunk text under ``metadata["text"]``.
    """
    s3_path = metadata.get("s3_path", "")
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def count_tokens(piece):
        return len(tokenizer.tokenize(piece))

    def split_oversized(sentence):
        # Yield (piece, token_count) word groups that each fit the limit.
        # Counts are summed per word, which may differ marginally from
        # tokenizing the joined piece; a single word larger than the limit is
        # still emitted on its own.
        words, size = [], 0
        for word in sentence.split():
            word_tokens = count_tokens(word)
            if words and size + word_tokens > token_limit:
                yield " ".join(words), size
                words, size = [], 0
            words.append(word)
            size += word_tokens
        if words:
            yield " ".join(words), size

    chunks, current, count = [], [], 0

    for sent in sent_tokenize(text):
        sent_tokens = count_tokens(sent)
        if sent_tokens <= token_limit:
            pieces = [(sent, sent_tokens)]
        else:
            pieces = split_oversized(sent)

        for piece, size in pieces:
            # Close the running chunk before it would overflow.
            if current and count + size > token_limit:
                chunks.append(" ".join(current))
                current, count = [], 0
            current.append(piece)
            count += size

    if current:
        chunks.append(" ".join(current))

    return [
        {
            "key": str(uuid.uuid4()),
            "metadata": {
                "text": chunk,
                "file_name": file_name,
                "s3_path": s3_path,
                "client_account": metadata.get("client_account"),
                "document_type": metadata.get("document_type")
            }
        }
        for chunk in chunks
    ]
    

def chunk_text_with_overlap(text, metadata, token_limit=400, chunk_overlap=50, tokenizer_name="intfloat/e5-small-v2"):
    """Split ``text`` into sentence-aligned chunks that share trailing context.

    Sentences are packed until the next one would exceed ``token_limit``. The
    following chunk then starts with as many trailing sentences of the
    previous chunk as fit into ``chunk_overlap`` tokens. A sentence longer
    than ``token_limit`` is cut into overlapping word windows.

    Args:
        text: Document text to split.
        metadata: Dict providing ``s3_path`` and optionally ``client_account``
            and ``document_type``; copied into every chunk.
        token_limit: Maximum tokens per regular chunk.
        chunk_overlap: Token budget for the sentences repeated at the start of
            the next chunk; must be smaller than ``token_limit``.
        tokenizer_name: Hugging Face tokenizer used for counting tokens.

    Returns:
        A list of ``{"key": <uuid4 str>, "metadata": {...}}`` dicts, one per
        chunk, in document order.

    Raises:
        ValueError: If ``chunk_overlap`` is negative or not smaller than
            ``token_limit`` (the window loop could not advance otherwise).
    """
    if not 0 <= chunk_overlap < token_limit:
        raise ValueError("chunk_overlap must be >= 0 and smaller than token_limit")

    s3_path = metadata.get("s3_path", "")
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    chunks = []
    curr = []       # (sentence, token_count) pairs of the chunk being built
    curr_toks = 0
    step = token_limit - chunk_overlap

    for sent in sent_tokenize(text):
        stoks = len(tokenizer.tokenize(sent))

        if stoks > token_limit:
            # Emit the pending chunk first so chunks stay in document order.
            if curr:
                chunks.append(" ".join(s for s, _ in curr))
                curr, curr_toks = [], 0

            # NOTE(review): windows are measured in words, not tokens, so a
            # window can still tokenize to more than token_limit tokens.
            words = sent.split()
            start = 0
            while start < len(words):
                chunks.append(" ".join(words[start:start + token_limit]))
                if start + token_limit >= len(words):
                    break  # this window reached the end; no redundant tail
                start += step
            continue

        if curr_toks + stoks <= token_limit:
            curr.append((sent, stoks))
            curr_toks += stoks
            continue

        chunks.append(" ".join(s for s, _ in curr))

        # Carry over trailing sentences that fit the overlap budget.
        overlap, tot = [], 0
        for s, n in reversed(curr):
            if tot + n > chunk_overlap:
                break
            overlap.insert(0, (s, n))
            tot += n
        curr = overlap + [(sent, stoks)]
        curr_toks = tot + stoks

    if curr:
        chunks.append(" ".join(s for s, _ in curr))

    return [
        {
            "key": str(uuid.uuid4()),
            "metadata": {
                "text": chunk,
                "file_name": file_name,
                "s3_path": s3_path,
                "client_account": metadata.get("client_account"),
                "document_type": metadata.get("document_type")
            }
        }
        for chunk in chunks
    ]

# TODO: embedding model integration - supply a llama_index embedding model
# through the `embed_model` argument (or a module-level `embed_model`).
def chunk_with_semantic_split(text, data, buffer_size=1, breakpoint_percentile_threshold=95, embed_model=None):
    """Split ``text`` at semantic breakpoints using llama_index.

    Args:
        text: Document text to split.
        data: Dict providing ``s3_path`` and optionally ``client_account`` and
            ``document_type``; copied into every chunk.
        buffer_size: Forwarded to ``SemanticSplitterNodeParser``.
        breakpoint_percentile_threshold: Forwarded to
            ``SemanticSplitterNodeParser``.
        embed_model: Embedding model handed to the parser. When omitted, a
            module-level ``embed_model`` is used if one is defined.

    Returns:
        A list of ``{"key": <uuid4 str>, "metadata": {...}}`` dicts with the
        same metadata fields as the other chunkers, so metadata filters (e.g.
        on ``client_account``) work for every index.

    Raises:
        RuntimeError: If no embedding model is available.
    """
    # Local import: the name clashes with docx.Document imported at file level.
    from llama_index.core import Document

    if embed_model is None:
        embed_model = globals().get("embed_model")
    if embed_model is None:
        raise RuntimeError(
            "chunk_with_semantic_split needs an embedding model: pass "
            "embed_model=... or define a module-level 'embed_model'."
        )

    s3_path = data.get("s3_path", "")
    metadata = {
        "s3_path": s3_path,
        "file_name": os.path.basename(s3_path),
        "client_account": data.get("client_account"),
        "document_type": data.get("document_type"),
    }
    doc = Document(text=text, metadata=metadata)

    parser = SemanticSplitterNodeParser.from_defaults(
        embed_model=embed_model,
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        include_metadata=False,
        include_prev_next_rel=False
    )

    nodes = parser.get_nodes_from_documents([doc])

    return [
        {
            "key": str(uuid.uuid4()),
            "metadata": {"text": node.text, **metadata},
        }
        for node in nodes
    ]

In [11]:
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt", ".msg", ".doc"}

def list_supported_files(bucket, prefix=""):
    """List the keys under ``prefix`` whose extension is in SUPPORTED_EXTENSIONS.

    A short summary (totals plus up to five unsupported keys) is printed.

    Returns:
        The supported keys, in listing order.
    """
    all_files = []
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        all_files.extend(entry["Key"] for entry in page.get("Contents", []))

    supported, unsupported = [], []
    for key in all_files:
        _, suffix = os.path.splitext(key)
        target = supported if suffix.lower() in SUPPORTED_EXTENSIONS else unsupported
        target.append(key)

    print("\n=== File Summary ===")
    print(f"📁 Total files: {len(all_files)}")
    print(f"✅ Supported files: {len(supported)}")
    print(f"❌ Unsupported: {len(unsupported)} (Sample: {unsupported[:5]})\n")
    return supported

def download_s3_file(bucket, key):
    """Download an S3 object fully into memory and return it as a BytesIO."""
    payload = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    return io.BytesIO(payload)

def extract_from_doc(file_io):
    """Extract text from a legacy ``.doc`` file with textract.

    textract works on file paths, so the stream is spooled to a temporary
    ``.doc`` file. The file is removed in all cases, including when extraction
    fails.

    Args:
        file_io: Seekable binary stream with the document content.

    Returns:
        The extracted text, stripped; ``""`` when extraction fails.
    """
    temp_path = None
    try:
        file_io.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as temp_file:
            temp_path = temp_file.name
            temp_file.write(file_io.read())

        return textract.process(temp_path).decode("utf-8").strip()
    except Exception as e:
        print(f"❗ DOC (textract) error: {e}")
        return ""
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                # Best effort: a leftover temp file must not mask the result.
                print(f"❗ DOC temp file cleanup failed: {cleanup_error}")


def extract_with_ocr(file_io):
    """Rasterize the stream with pdf2image and OCR every page with Tesseract.

    Returns:
        The page texts joined by newlines and stripped; ``""`` on any failure.
    """
    try:
        file_io.seek(0)
        raw = file_io.read()
        pages = convert_from_bytes(raw)
        page_texts = [pytesseract.image_to_string(page) for page in pages]
        return "\n".join(page_texts).strip()
    except Exception as err:
        print(f"❗ OCR failed: {err}")
        return ""

def extract_from_pdf(file_io):
    """Extract the embedded text layer of a PDF with PyPDF2.

    Pages without text are skipped. Each page is parsed once (the extraction
    call is comparatively expensive).

    Returns:
        The page texts joined by newlines and stripped; ``""`` when the PDF
        cannot be read or has no text layer.
    """
    try:
        reader = PdfReader(file_io)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text).strip()
    except Exception as e:
        print(f"❗ PDF read error: {e}")
        return ""


def extract_from_docx(file_io):
    """Return the paragraph text of a ``.docx`` document.

    Args:
        file_io: Either a readable, seekable binary stream or the raw bytes of
            the document.

    Returns:
        The paragraphs joined by newlines and stripped; ``""`` on failure.
    """
    from docx import Document
    from io import BytesIO

    try:
        if hasattr(file_io, "read"):
            file_io.seek(0)
            raw = file_io.read()
        else:
            raw = file_io

        document = Document(BytesIO(raw))
        lines = [paragraph.text for paragraph in document.paragraphs]
        return "\n".join(lines).strip()
    except Exception as err:
        print(f"❗ DOCX error: {err}")
        return ""


def extract_from_txt(file_io):
    """Decode a plain-text file, tolerating non-UTF-8 encodings.

    UTF-8 is tried first (a leading byte-order mark is dropped), then
    Windows-1252, and finally Latin-1, which accepts any byte sequence. This
    keeps text files exported from Windows tools from being discarded.

    Args:
        file_io: Seekable binary stream with the file content.

    Returns:
        The decoded text, stripped; ``""`` when the stream cannot be read.
    """
    try:
        file_io.seek(0)
        raw = file_io.read()
        for encoding in ("utf-8-sig", "cp1252"):
            try:
                return raw.decode(encoding).strip()
            except UnicodeDecodeError:
                continue
        return raw.decode("latin-1").strip()
    except Exception as e:
        print(f"❗ TXT read error: {e}")
        return ""

def extract_from_msg(file_io):
    """Return the body text of an Outlook ``.msg`` file.

    The stream is spooled to a uniquely named temporary file (instead of a
    fixed ``temp.msg`` in the working directory, which collides between runs),
    and both the message handle and the file are released even when parsing
    fails.

    Args:
        file_io: Seekable binary stream with the message content.

    Returns:
        The message body, stripped; ``""`` when the message has no body or
        cannot be read.
    """
    temp_path = None
    msg = None
    try:
        file_io.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".msg") as temp_file:
            temp_path = temp_file.name
            temp_file.write(file_io.read())

        msg = extract_msg.Message(temp_path)
        return (msg.body or "").strip()
    except Exception as e:
        print(f"❗ MSG read error: {e}")
        return ""
    finally:
        try:
            # Close before deleting: the parser keeps the file open.
            if msg is not None:
                msg.close()
            if temp_path is not None:
                os.remove(temp_path)
        except Exception as cleanup_error:
            # Best effort: cleanup problems must not mask the result.
            print(f"❗ MSG temp file cleanup failed: {cleanup_error}")

def extract_text(file_io, ext):
    """Extract text from a document by trying the extractors for its extension.

    Extractors are tried in order and the first one that yields non-empty
    text wins; the stream is rewound before each attempt.

    Args:
        file_io: Seekable binary stream with the document content.
        ext: File extension including the dot (case-insensitive), e.g. ``".pdf"``.

    Returns:
        ``(text, extractor_name)`` for the first successful extractor, or
        ``("", "none")`` when the extension is unsupported or nothing could be
        extracted. Callers detect OCR usage via ``"ocr"`` in the name.
    """
    extractors = {
        # Text layer first; OCR only for scanned PDFs.
        ".pdf": [extract_from_pdf, extract_with_ocr],
        # NOTE(review): the OCR fallback rasterizes via pdf2image, which
        # presumably only accepts PDF input - confirm it can ever succeed here.
        ".docx": [extract_from_docx, extract_with_ocr],
        ".doc": [extract_from_doc],
        ".txt": [extract_from_txt],
        ".msg": [extract_from_msg],
    }

    for extractor in extractors.get(ext.lower(), []):
        file_io.seek(0)
        text = extractor(file_io)
        if text:
            return text, extractor.__name__

    return "", "none"


def upload_chunks_to_s3_vector_index(chunks, vector_bucket_name, chunking_strategy):
    """Embed the chunks and store them in an S3 Vectors index.

    Args:
        chunks: Chunk dicts as produced by the chunkers
            (``{"key": ..., "metadata": {"text": ..., ...}}``).
        vector_bucket_name: Name of the S3 Vectors bucket.
        chunking_strategy: Name of the target index (indexes are named after
            the chunking strategy).

    Returns:
        The ``put_vectors`` responses, one per uploaded batch; ``[]`` when
        there is nothing to upload. Chunks whose embedding failed are skipped.
    """
    MAX_BATCH_SIZE = 500

    if not chunks:
        return []

    texts = [chunk["metadata"]["text"] for chunk in chunks]
    embeddings = get_text_embedding(texts)

    if len(texts) == 1:
        # For a single text get_text_embedding returns the bare vector (or
        # None) rather than a one-item list; re-wrap it so the pairing with
        # `chunks` below stays one-to-one.
        already_wrapped = (
            isinstance(embeddings, list)
            and len(embeddings) == 1
            and (embeddings[0] is None or isinstance(embeddings[0], (list, tuple)))
        )
        if not already_wrapped:
            embeddings = [embeddings]

    vectors = [
        {
            "key": chunk["key"],
            "data": {"float32": embedding},
            "metadata": chunk["metadata"].copy(),
        }
        for chunk, embedding in zip(chunks, embeddings)
        if embedding is not None
    ]

    skipped = len(chunks) - len(vectors)
    if skipped:
        print(f"⚠️ Skipped {skipped} chunk(s) without an embedding.")

    total_batches = (len(vectors) - 1) // MAX_BATCH_SIZE + 1
    responses = []
    for batch_number, start in enumerate(range(0, len(vectors), MAX_BATCH_SIZE), 1):
        print(f"Uploading batch {batch_number} of {total_batches}")
        response = s3v.put_vectors(
            vectorBucketName=vector_bucket_name,
            indexName=chunking_strategy,
            vectors=vectors[start:start + MAX_BATCH_SIZE]
        )
        responses.append(response)

    return responses


def process_documents(bucket, items, chunking_strategy):
    """Download, extract and chunk a list of documents.

    Args:
        bucket: S3 bucket the documents are stored in.
        items: Dicts with ``file_key`` (S3 key) and optional ``metadata``.
        chunking_strategy: One of ``"token-chunking"``, ``"overlap-chunking"``
            or ``"semantic-split-chunking"``.

    Returns:
        ``(stats, all_chunks)``: per-outcome/per-type counters and the chunks
        of every successfully processed document. A document that fails is
        counted under ``failed`` and ``none`` and does not stop the run.

    Raises:
        ValueError: If ``chunking_strategy`` is not a known strategy. Checked
            up front; previously every document failed with an unrelated
            unbound-variable error instead.
    """
    valid_strategies = ("token-chunking", "overlap-chunking", "semantic-split-chunking")
    if chunking_strategy not in valid_strategies:
        raise ValueError(
            f"Unknown chunking strategy '{chunking_strategy}'; "
            f"expected one of {list(valid_strategies)}"
        )

    stats = {
        "processed": 0, "failed": 0,
        "pdf": 0, "docx": 0, "txt": 0, "msg": 0, "doc": 0,
        "ocr": 0, "none": 0
    }

    all_chunks = []
    total = len(items)

    for idx, item in enumerate(items, 1):
        file_key = item["file_key"]
        metadata = item.get("metadata", {}).copy()

        print(f"\n📄 Processing {idx}/{total}: {file_key}")
        ext = os.path.splitext(file_key)[1].lower()
        metadata["s3_path"] = f"s3://{bucket}/{file_key}"

        try:
            file_io = download_s3_file(bucket, file_key)
            text, method = extract_text(file_io, ext)

            if not text:
                print("⚠️ No text extracted.")
                stats["failed"] += 1
                stats["none"] += 1
                continue

            if chunking_strategy == 'token-chunking':
                chunks = chunk_text(text, metadata)
            elif chunking_strategy == 'overlap-chunking':
                chunks = chunk_text_with_overlap(text, metadata)
            else:  # 'semantic-split-chunking' (validated above)
                chunks = chunk_with_semantic_split(text, metadata)

            print(f"✅ Extracted {len(chunks)} chunks.")

            all_chunks.extend(chunks)
            stats["processed"] += 1
            ext_key = ext.replace(".", "")
            stats[ext_key] = stats.get(ext_key, 0) + 1
            # Extractor names containing "ocr" mark documents that needed OCR.
            if "ocr" in method:
                stats["ocr"] += 1

        except Exception as e:
            # Per-document boundary: report and continue with the next file.
            print(f"❗ Error: {e}")
            stats["failed"] += 1
            stats["none"] += 1

    print("\n=== Summary ===")
    headline_keys = ("processed", "failed")

    def format_stat(name):
        return f"{name.capitalize().replace('_', ' ')}: {stats[name]}"

    print("  ".join(format_stat(k) for k in stats if k in headline_keys))
    print("  ".join(format_stat(k) for k in stats if k not in headline_keys))

    return stats, all_chunks

def _normalize_contract_number(value):
    """Return a canonical string for a contract number (int, float or str).

    Surrounding whitespace, a trailing ``.0`` from float-typed columns and
    leading zeros of purely numeric values are removed, so ``10217``,
    ``10217.0`` and ``"00010217"`` all map to ``"10217"``.
    """
    text = str(value).strip()
    if text.endswith(".0") and text[:-2].isdigit():
        text = text[:-2]
    if text.isdigit():
        text = text.lstrip("0") or "0"
    return text


def read_s3_keys_from_excel(excel_path, sample_size, sheet_name="S3 File Paths"):
    """Pick a random sample of S3 keys from the workbook and attach metadata.

    Every ``S3 File*`` column of ``sheet_name`` is scanned for ``s3://`` URIs.
    A random sample of the resulting keys is then joined with the
    ``Account`` / ``Document Type`` columns of the ``Active Legal Contracts``
    sheet, using the contract folder (second path segment of the key) as the
    contract number.

    Args:
        excel_path: Path of the workbook written by ``process_entire_bucket``.
        sample_size: Maximum number of keys to return.
        sheet_name: Sheet holding the ``S3 File N`` columns.

    Returns:
        A list of ``{"file_key": ..., "metadata": {"client_account": ...,
        "document_type": ...}}`` dicts; metadata values default to ``"NA"``
        when the contract is not found.

    Raises:
        ValueError: If the sheet has no column starting with ``"S3 File"``.
    """
    print(f"📥 Reading S3 paths from Excel: {excel_path}, sheet: {sheet_name}")
    df = pd.read_excel(excel_path, sheet_name=sheet_name, engine="openpyxl")

    # str(): header cells are not guaranteed to be strings.
    file_columns = [col for col in df.columns if str(col).startswith("S3 File")]

    if not file_columns:
        raise ValueError(f"No columns starting with 'S3 File' found in sheet '{sheet_name}'")

    all_paths = []
    for col in file_columns:
        for cell in df[col].dropna():
            path = str(cell).strip()
            if path:
                all_paths.append(path)

    print(f"🔍 Total paths found: {len(all_paths)}")

    scheme = "s3://"
    valid_keys = []
    invalid_paths = []

    for path in all_paths:
        # Only the leading scheme is removed; the bucket part is discarded.
        parts = path[len(scheme):].split("/", 1) if path.startswith(scheme) else []
        if len(parts) == 2 and parts[1].strip():
            valid_keys.append(parts[1])
        else:
            invalid_paths.append(path)

    print(f"📁 Parsed S3 keys: {len(valid_keys)}")

    if invalid_paths:
        print(f"⚠️ Skipped invalid paths: {len(invalid_paths)}")
        print("\n⚠️ Skipped Paths (sample):")
        for bad in invalid_paths[:10]:
            print(f" - {bad}")
        if len(invalid_paths) > 10:
            print(f" ...and {len(invalid_paths) - 10} more.")

    selected_keys = random.sample(valid_keys, min(sample_size, len(valid_keys)))
    print(f"🎯 Randomly selected {len(selected_keys)} files.")

    print(f"📄 Reading metadata from sheet: 'Active Legal Contracts'")
    df_meta = pd.read_excel(excel_path, sheet_name="Active Legal Contracts", engine="openpyxl")
    meta_rows = (
        df_meta[['Contract Number', 'Account', 'Document Type']]
        .dropna(subset=['Contract Number'])
        .to_dict('records')
    )
    # Keyed by normalized number so the lookup works whatever dtype pandas
    # inferred for the column; the first row wins for duplicated numbers.
    meta_map = {}
    for row in meta_rows:
        meta_map.setdefault(_normalize_contract_number(row['Contract Number']), row)

    result = []
    for key in selected_keys:
        parts = key.split("/")
        contract_number = parts[1] if len(parts) > 1 else None
        metadata = {}
        if contract_number:
            metadata = meta_map.get(_normalize_contract_number(contract_number), {})

        result.append({
            'file_key': key,
            'metadata': {
                'client_account': metadata.get('Account', 'NA'),
                'document_type': metadata.get('Document Type', 'NA')
            }
        })

    return result

if __name__ == "__main__":

    # Index a small random sample of the files listed in the local workbook.
    file_keys_with_meta = read_s3_keys_from_excel(EXCEL_PATH, sample_size=12)

    # Only token chunking is enabled. Add "overlap-chunking" and/or
    # "semantic-split-chunking" to fill the other indexes; each strategy is
    # uploaded to the index of the same name.
    for chunking_strategy in ["token-chunking"]:
        final_stats, chunks = process_documents(S3_BUCKET, file_keys_with_meta, chunking_strategy)
        if chunks:
            response = upload_chunks_to_s3_vector_index(chunks, VECTOR_BUCKET_NAME, chunking_strategy)
            if response:
                print(f"\n ✅ Successfully uploaded to {chunking_strategy} S3 vector Index")



📥 Reading S3 paths from Excel: full_contracts_with_files.xlsx, sheet: S3 File Paths
🔍 Total paths found: 47933
📁 Parsed S3 keys: 47933
🎯 Randomly selected 12 files.
📄 Reading metadata from sheet: 'Active Legal Contracts'
{'file_key': 'contract-docs/80694/Black Duck Software_NDA_12232014_fully executed.pdf', 'metadata': {'client_account': 'Black Duck Software', 'document_type': 'NDA'}}
{'file_key': 'contract-docs/78587/Cotiviti USA LLC GMI BDC FSWS Amended and Restated Schedule A May 2018.pdf', 'metadata': {'client_account': 'S&P Global Ratings', 'document_type': 'Amendment'}}
{'file_key': 'contract-docs/62422/Re Randstad SOW NO. 5 - RMI - ELT Approval.msg', 'metadata': {'client_account': 'Randstad Technologies, LLC', 'document_type': 'SOW'}}
{'file_key': 'contract-docs/55821/SOW_Cotiviti_DM_Sep19 (002).pdf', 'metadata': {'client_account': 'Anthem, Inc (WellPoint, Inc.)', 'document_type': 'SOW'}}
{'file_key': 'contract-docs/68386/Dynamics_68386_signed.pdf', 'metadata': {'client_account'

Token indices sequence length is longer than the specified maximum sequence length for this model (680 > 512). Running this sequence through the model will result in indexing errors


✅ Extracted 18 chunks.

📄 Processing 3/12: contract-docs/62422/Re Randstad SOW NO. 5 - RMI - ELT Approval.msg
✅ Extracted 1 chunks.

📄 Processing 4/12: contract-docs/55821/SOW_Cotiviti_DM_Sep19 (002).pdf
✅ Extracted 18 chunks.

📄 Processing 5/12: contract-docs/68386/Dynamics_68386_signed.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors


✅ Extracted 11 chunks.

📄 Processing 6/12: contract-docs/70008/Humana Second Amended and Restated Schedule A redline 5-8-23.docx
✅ Extracted 31 chunks.

📄 Processing 7/12: contract-docs/66321/VERS-0167-01-00 SOW 87 Star Intelligence Development - Staff Aug FULLY EXECUTED.pdf
✅ Extracted 10 chunks.

📄 Processing 8/12: contract-docs/69010/GRI - Cotiviti - DUA - AS - 07202022 - Dynamics_69010.pdf
✅ Extracted 9 chunks.

📄 Processing 9/12: contract-docs/60499/Dynamics_-_60499_signed.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (972 > 512). Running this sequence through the model will result in indexing errors


✅ Extracted 8 chunks.

📄 Processing 10/12: contract-docs/80528/WHC_Pre Show List_02132020_Edifecs signed.pdf
✅ Extracted 2 chunks.

📄 Processing 11/12: contract-docs/79737/3M_VAR Agmt and BAA_08232013_fully executed.pdf
✅ Extracted 59 chunks.

📄 Processing 12/12: contract-docs/66531/Dynamics_-_66531_signed.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (868 > 512). Running this sequence through the model will result in indexing errors


✅ Extracted 7 chunks.

=== Summary ===
Processed: 12  Failed: 0
Pdf: 10  Docx: 1  Txt: 0  Msg: 1  Doc: 0  Ocr: 2  None: 0
[ERROR] Failed to get embedding!
[ERROR] Failed to get embedding!
[ERROR] Failed to get embedding!
[ERROR] Failed to get embedding!
Uploading batch 1 of 1

 ✅ Successfully uploaded to token-chunking S3 vector Index


In [12]:
# Count the vectors stored in the index.

INDEX_NAME = "token-chunking"
# INDEX_NAME = "overlap-chunking"
# INDEX_NAME = "semantic-split-chunking"

# list_vectors returns one page per call; follow nextToken so the count covers
# the whole index rather than only the first page.
vectors = []
next_token = None
while True:
    request = {
        "vectorBucketName": VECTOR_BUCKET_NAME,
        "indexName": INDEX_NAME,
        "returnData": True,
        "returnMetadata": True,
    }
    if next_token:
        request["nextToken"] = next_token

    response = s3v.list_vectors(**request)
    vectors.extend(response.get("vectors", []))

    next_token = response.get("nextToken")
    if not next_token:
        break

print(len(vectors))

# Uncomment to inspect the stored vectors:
# for vector in vectors:
#     print(f"Key: {vector['key']}")
#     print(f"Metadata: {vector.get('metadata',{})}")
#     print(f"Embedding (first 5 dims): {vector['data']['float32'][:5]}")
#     print("--------------")


177


In [None]:
def query_s3_vector_store(query_text, client_account_filter, INDEX_NAME, top_k = 5):
    """Run a similarity query against an S3 Vectors index.

    Args:
        query_text: Natural-language query; embedded with get_text_embedding.
        client_account_filter: When not ``None``, only vectors whose
            ``client_account`` metadata equals this value are considered.
        INDEX_NAME: Name of the index to query.
        top_k: Number of nearest vectors to return.

    Returns:
        The ``query_vectors`` response, or ``None`` when the query could not
        be embedded or the request failed (the cause is printed).
    """
    print(f"\n--- Processing Query: '{query_text}' ---")

    query_embedding = get_text_embedding(query_text)
    if query_embedding is None:
        print("Error querying S3 Vector Store: could not embed the query text")
        return None

    request = {
        "vectorBucketName": VECTOR_BUCKET_NAME,
        "indexName": INDEX_NAME,
        "topK": top_k,
        "queryVector": {'float32': query_embedding},
        "returnMetadata": True,
        "returnDistance": True,
    }
    # The filter argument is only sent when a filter is requested, instead of
    # passing an explicit None to the API.
    if client_account_filter is not None:
        request["filter"] = {
            "client_account": {
                "$eq": client_account_filter
            }
        }

    try:
        return s3v.query_vectors(**request)
    except Exception as e:
        print(f"Error querying S3 Vector Store: {e}")
        return None

# Sample questions for a manual retrieval check; the trailing number on each
# line is the contract folder the question was written against.
user_questions = [
    "What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?", # contract 66179
    "Under Schedule C, what services is Cotiviti required to provide?", # contract 66179
    "In the Prepay FWAV Services section, what are Cotiviti's main deliverables?", # contract 66179
    "What restrictions are placed on disclosing confidential information?", # contract 67566
    "What is Amendment #4 to the Verisk Health License Agreement about?", # contract 53985
    "What is the purpose of Amendment #4 as stated in the document?", # contract 53985

]


INDEX_NAME = "token-chunking"
# INDEX_NAME = "overlap-chunking"
# INDEX_NAME = "semantic-split-chunking"

# Every question is run with the client_account filter fixed to 'UST Global',
# so only chunks of that account can be returned regardless of the question.
for question in user_questions:
    query_results = query_s3_vector_store(question, 'UST Global', INDEX_NAME, top_k=5)

    if query_results and 'vectors' in query_results:
        print(f"Retrieved {len(query_results['vectors'])}")
        for i, chunk_data in enumerate(query_results['vectors']):
            print(f"    Metadata {i+1}: {chunk_data['metadata']}")
            # print(f"    Distance: {chunk_data.get('distance', 'N/A')}")
    else:
        print(f"No results or an error occurred for query: '{question}'")



--- Processing Query: 'What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?' ---
Retrieved 5
    S3_Path 1: {'client_account': 'UST Global', 'document_type': 'SOW', 'file_name': 'VERS-0167-01-00 SOW 87 Star Intelligence Development - Staff Aug FULLY EXECUTED.pdf', 's3_path': 's3://ml-legal-restricted/contract-docs/66321/VERS-0167-01-00 SOW 87 Star Intelligence Development - Staff Aug FULLY EXECUTED.pdf', 'text': "Upon commencement of the Services and subject to the terms and conditions of this \nSOW and in accordance with the terms of the Agreement, Vendor shall invoice Cotiviti on \nor before the 5th day of each month in arrears for Services performed in the previous \nmonth, based on the fees described in the fee table above and the hours during which \nServices are performed by each resource type. ii. The monthly bill rate equals the number of Vendor Resources in a role, multiplied by the \napplicable bill rate (from Fee T able in Section 7.a), multiplied 

LLM Integration

test samples from doc

OpenSearch mirroring and querying

In [None]:
# Connect to the OpenSearch Serverless collection and check that the index
# exists.
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

region = 'us-east-1'
host = 'g7a6yvmq4wc43rvrzp89.us-east-1.aoss.amazonaws.com'

# Credentials come from the default boto3 session and are frozen once here.
# NOTE(review): presumably temporary session credentials will expire in a
# long-running kernel - re-run this cell if requests start failing.
session = boto3.Session()
credentials = session.get_credentials().get_frozen_credentials()

# SigV4 signing for the 'aoss' (OpenSearch Serverless) service.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'aoss',
    session_token=credentials.token
)

opensearch_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

index_name = 'token-chunking'  # Use hyphen or underscore, avoid spaces

# Check if index exists, returns True or False
index_exists = opensearch_client.indices.exists(index=index_name)

if index_exists:
    print(f"Index '{index_name}' exists.")
else:
    print(f"Index '{index_name}' does NOT exist.")

def query_opensearch_vector_store(query_text, index_name, embed_model, vector_field='embedding', top_k=5):
    """Run a k-NN query against an OpenSearch vector index.

    Args:
        query_text: Natural-language query.
        index_name: OpenSearch index to search.
        embed_model: Object exposing ``get_text_embedding(text)``; used to
            embed the query.
        vector_field: Name of the k-NN vector field in the index. This value
            is now actually used; the field name was hard-coded before.
        top_k: Number of hits to return.

    Returns:
        The raw search response, or ``None`` when the request fails (the cause
        is printed).
    """
    print(f"\n--- Querying OpenSearch: '{query_text}' ---")

    query_vector = embed_model.get_text_embedding(query_text)

    query_body = {
        "size": top_k,
        "query": {
            "knn": {
                vector_field: {
                    "vector": query_vector,
                    "k": top_k
                }
            }
        }
    }

    try:
        return opensearch_client.search(index=index_name, body=query_body)
    except Exception as e:
        print(f"Error querying OpenSearch: {e}")
        return None

# Smoke test of the OpenSearch query path.
# NOTE(review): `embed_model` is not defined anywhere in the visible notebook
# (its assignment near the imports is commented out), so this cell presumably
# raises NameError until a model exposing get_text_embedding() is created.
response = query_opensearch_vector_store("climate change effects", "token-chunking", embed_model)
if response:
    for hit in response["hits"]["hits"]:
        print(f"Score: {hit['_score']}, Metadata: {hit['_source'].get('metadata')}")