In [1]:
# !pip install --upgrade boto3==1.39.8
# !sudo apt install antiword 
import io
import uuid
import os
import random
import nltk
import boto3
import requests
import json
import pytesseract
import textract
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModel
import extract_msg
import tempfile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from sentence_transformers import SentenceTransformer

# embed_model = AutoModel.from_pretrained("intfloat/e5-small-v2")

S3_BUCKET = "ml-legal-restricted"
EXCEL_PATH = "full_contracts_with_files.xlsx"
VECTOR_BUCKET_NAME = "legal-docs-vector-store"
EMBEDINNGS_URL = "https://zgggzg2iqg.execute-api.us-east-1.amazonaws.com/dev/get_embeddings"
API_KEY = "2jIpWCyNRg3Y8lkbmWG0tkyXwYlJn5QaZ1F3yKf7"

nltk.download('punkt')
nltk.download('punkt_tab')
s3 = boto3.client('s3')
s3v = boto3.client("s3vectors", region_name="us-east-1")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def download_excel(bucket, key, sheet_name="Active Legal Contracts", column="Contract Number"):
    print(f"📥 Downloading Excel: s3://{bucket}/{key}")
    obj = s3.get_object(Bucket=bucket, Key=key)
    excel_data = obj['Body'].read()

    df = pd.read_excel(io.BytesIO(excel_data), sheet_name=sheet_name, engine='openpyxl')
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in sheet '{sheet_name}'")

    contract_numbers = df[column].dropna().astype(str).str.strip().tolist()
    return contract_numbers, df, excel_data


def list_s3_files_for_contract(bucket, contract_number, prefix_base="contract-docs/"):
    prefix = f"{prefix_base}{contract_number}/"
    files = []
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            files.append(obj["Key"])
    return files


def add_modified_sheet_with_files(wb, original_df, file_map):
    ws = wb.create_sheet("Active Legal Contracts + Files")
    headers = original_df.columns.tolist()
    max_files = max((len(files) for files in file_map.values()), default=0)
    headers += [f"File {i+1}" for i in range(max_files)]
    ws.append(headers)

    for idx, row in original_df.iterrows():
        contract_number = str(row["Contract Number"]).strip()
        files = [os.path.basename(f) for f in file_map.get(contract_number, [])]
        base_row = row.tolist()
        ws.append(base_row + files)


def add_s3_paths_sheet(wb, file_map, bucket):
    ws = wb.create_sheet("S3 File Paths")
    max_files = max((len(files) for files in file_map.values()), default=0)

    headers = ["Contract Number"] + [f"S3 File {i+1}" for i in range(max_files)]
    ws.append(headers)

    for contract, keys in file_map.items():
        s3_paths = [f"s3://{bucket}/{key}" for key in keys]
        ws.append([contract] + s3_paths)

def save_missing_contracts_to_csv(missing_contracts, output_path="missing_contracts.csv"):
    if not missing_contracts:
        print("✅ No missing contracts to save.")
        return

    df = pd.DataFrame(missing_contracts, columns=["Contract Number"])
    df.to_csv(output_path, index=False)
    print(f"📄 Missing contracts CSV saved to: {output_path}")
    
def process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx"):
    contract_numbers, original_df, excel_bytes = download_excel(bucket, excel_key)

    file_map = {}
    missing_contracts = []

    print(f"\n🔍 Scanning {len(contract_numbers)} contract numbers across S3...")
    for idx, number in enumerate(contract_numbers, 1):
        if idx % 1000 == 0 or idx == 1:
            print(f"🔢 [{idx}/{len(contract_numbers)}] Scanning: {number}")
        files = list_s3_files_for_contract(bucket, number)

        if not files and len(number) < 8 and number.isdigit():
            padded = number.zfill(8)
            print(f"   ➕ Retrying with padded contract number: {padded}")
            files = list_s3_files_for_contract(bucket, padded)
            
        if files:
            file_map[number] = files
        else:
            missing_contracts.append(number)

    print("\n🧾 Preparing final Excel workbook...")
    wb = load_workbook(io.BytesIO(excel_bytes))
    add_modified_sheet_with_files(wb, original_df, file_map)
    add_s3_paths_sheet(wb, file_map, bucket)
    wb.save(output_path)
    print(f"✅ Final Excel saved: {output_path}")

    print("\n=== Final Summary ===")
    print(f"📄 Total contracts processed: {len(contract_numbers)}")
    print(f"✅ Contracts with files: {len(file_map)}")
    print(f"❌ Contracts with NO files found: {len(missing_contracts)}")
    if missing_contracts:
        print(f"🔍 Sample missing contract numbers: {missing_contracts[:5]}{'...' if len(missing_contracts) > 5 else ''}")
        save_missing_contracts_to_csv(missing_contracts)

if __name__ == "__main__":
    bucket = "ml-legal-restricted"
    excel_key = "tabularData/Active Legal Contracts 7-10-2025 1-17-09 PM.xlsx"
    process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx")


In [17]:
import numpy as np
def get_text_embedding(texts, model='e5_mistral_embed_384'):
    if isinstance(texts, str):
        texts = [texts]
        
    if not isinstance(texts, list) or not texts:
        raise ValueError("Input 'texts' must be a non-empty list of strings.")

    embeddings = []

    for text in texts:
        if not isinstance(text, str):
            raise ValueError("Each item in 'texts' must be a string.")

        payload = {
            "model_name": model,
            "texts": [text]
        }

        headers = {
        "x-api-key": API_KEY
        }

        try:
            response = requests.post(EMBEDINNGS_URL, json=payload, headers=headers)
            response.raise_for_status()

            raw_body = response.json().get('body')

            parsed_body = json.loads(raw_body)

            embedding = parsed_body.get('embeddings')
            if not embedding or not isinstance(embedding, list) or len(embedding) != 1:
                raise KeyError(f"No valid embedding found in response for text: '{text}'")

            # embeddings.append(embedding[0])
            embedding_float32 = np.array(embedding[0], dtype=np.float32).tolist()
            embeddings.append(embedding_float32)
        except Exception as e:
            print(f"[ERROR] Failed to get embedding!") #for '{text}': {e}")
            embeddings.append(None)

    return embeddings[0] if len(embeddings) == 1 else embeddings


In [13]:
# to recreate vector index store

# INDEX_NAME = "token-chunking"
# INDEX_NAME = "overlap-chunking"
# INDEX_NAME = "semantic-split-chunking"
INDEX_NAME = 'token-chunking-poc'
VECTOR_DIM = 384
DISTANCE_METRIC = "cosine"
NON_FILTERABLE_KEYS = ['text']

response = s3v.delete_index(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
)

response = s3v.create_index(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    dataType="float32",
    dimension=VECTOR_DIM,
    distanceMetric=DISTANCE_METRIC,
    metadataConfiguration={
        "nonFilterableMetadataKeys": NON_FILTERABLE_KEYS
    }
)

print(f"Created index: {response}")


Created index: {'ResponseMetadata': {'RequestId': '07311431-3181-46c0-b4e3-d53fcbe4c85f', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 25 Jul 2025 01:33:49 GMT', 'content-type': 'application/json', 'content-length': '2', 'connection': 'keep-alive', 'x-amz-request-id': '07311431-3181-46c0-b4e3-d53fcbe4c85f', 'access-control-allow-origin': '*', 'vary': 'origin, access-control-request-method, access-control-request-headers', 'access-control-expose-headers': '*'}, 'RetryAttempts': 0}}


In [5]:
def chunk_text(text, metadata, token_limit=400, tokenizer_name="intfloat/e5-small-v2"):
    
    s3_path = metadata.get("s3_path", "")
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    sentences = sent_tokenize(text)
    chunks, current, count = [], [], 0

    for sent in sentences:
        toks = tokenizer.tokenize(sent)
        if count + len(toks) > token_limit and current:
            chunks.append(" ".join(current))
            current, count = [], 0
        current.append(sent)
        count += len(toks)

    if current:
        chunks.append(" ".join(current))

    return [
        {
            "key": str(uuid.uuid4()),
            "metadata": {
                "text": chunk,
                "file_name": file_name,
                "s3_path": s3_path,
                "client_account": metadata.get("client_account"),
                "document_type": metadata.get("document_type")
            }
        }
        for chunk in chunks
    ]
    

def chunk_text_with_overlap(text, metadata, token_limit=400, chunk_overlap=80, tokenizer_name="intfloat/e5-small-v2"):
    
    s3_path = metadata.get("s3_path", "")
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    sentences, chunks, curr, curr_toks = sent_tokenize(text), [], [], 0

    for sent in sentences:
        stoks = len(tokenizer.tokenize(sent))
        if stoks > token_limit:
            words = sent.split()
            i = 0
            while i < len(words):
                segment = words[i:i+token_limit]
                chunks.append(" ".join(segment))
                i += token_limit - chunk_overlap
            continue

        if curr_toks + stoks <= token_limit:
            curr.append(sent); curr_toks += stoks
        else:
            chunks.append(" ".join(curr))
            # build overlap
            overlap, tot = [], 0
            for s in reversed(curr):
                l = len(tokenizer.tokenize(s))
                if tot + l > chunk_overlap:
                    break
                overlap.insert(0, s); tot += l
            curr = overlap + [sent]
            curr_toks = sum(len(tokenizer.tokenize(s)) for s in curr)

    if curr:
        chunks.append(" ".join(curr))
    
    return [
        {
            "key": str(uuid.uuid4()),
            "metadata": {
                "text": chunk,
                "file_name": file_name,
                "s3_path": s3_path,
                "client_account": metadata.get("client_account"),
                "document_type": metadata.get("document_type")
            }
        }
        for chunk in chunks
    ]

#TODO ebedding model integgratiom
def chunk_with_semantic_split(text, data, buffer_size=1, breakpoint_percentile_threshold=95):

    from llama_index.core import Document

    s3_path = data.get("s3_path", "")
    metadata = {
        "s3_path": s3_path,
        "file_name": os.path.basename(s3_path),
        "client_account": data.get("client_account"),
        "document_type": data.get("document_type")
                }
    doc = Document(text=text, metadata=metadata)

    parser = SemanticSplitterNodeParser.from_defaults(
        embed_model=embed_model, #need to fix this
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        include_metadata=False,
        include_prev_next_rel=False
    )

    nodes = parser.get_nodes_from_documents([doc])
    texts = [n.text for n in nodes]

    result = []
    for n in nodes:
        unique_id = str(uuid.uuid4())
        result.append({
            "key": unique_id,
            "metadata": {
                "text": n.text,
                "s3_path": s3_path,
                "file_name": os.path.basename(s3_path)
            },
        })

    return result

In [25]:
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt", ".msg", ".doc"}

def list_supported_files(bucket, prefix=""):
    paginator = s3.get_paginator("list_objects_v2")
    all_files, supported, unsupported = [], [], []

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            all_files.append(key)
            ext = os.path.splitext(key)[1].lower()
            if ext in SUPPORTED_EXTENSIONS:
                supported.append(key)
            else:
                unsupported.append(key)

    print("\n=== File Summary ===")
    print(f"📁 Total files: {len(all_files)}")
    print(f"✅ Supported files: {len(supported)}")
    print(f"❌ Unsupported: {len(unsupported)} (Sample: {unsupported[:5]})\n")
    return supported

def download_s3_file(bucket, key):
    response = s3.get_object(Bucket=bucket, Key=key)
    return io.BytesIO(response["Body"].read())

def extract_from_doc(file_io):
    try:
        file_io.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as temp_file:
            temp_file.write(file_io.read())
            temp_path = temp_file.name

        text = textract.process(temp_path).decode("utf-8").strip()
        os.remove(temp_path)
        return text
    except Exception as e:
        print(f"❗ DOC (textract) error: {e}")
        return ""


def extract_with_ocr(file_io):
    try:
        file_io.seek(0)
        images = convert_from_bytes(file_io.read())
        return "\n".join(pytesseract.image_to_string(img) for img in images).strip()
    except Exception as e:
        print(f"❗ OCR failed: {e}")
        return ""

def extract_from_pdf(file_io):
    try:
        reader = PdfReader(file_io)
        return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()]).strip()
    except Exception as e:
        print(f"❗ PDF read error: {e}")
        return ""


def extract_from_docx(file_io):
    from docx import Document
    from io import BytesIO

    try:
        if hasattr(file_io, "read"):
            file_io.seek(0)
            content = file_io.read()
            bio = BytesIO(content)
            
            doc = Document(bio)
        else:

            bio = BytesIO(file_io)
            doc = Document(bio)
            
        text = "\n".join(p.text for p in doc.paragraphs).strip()
        return text
    except Exception as e:
        print(f"❗ DOCX error: {e}")
        return ""


def extract_from_txt(file_io):
    try:
        return file_io.read().decode("utf-8").strip()
    except Exception as e:
        print(f"❗ TXT read error: {e}")
        return ""

def extract_from_msg(file_io):
    try:
        with open("temp.msg", "wb") as f:
            f.write(file_io.read())
        msg = extract_msg.Message("temp.msg")
        text = msg.body or ""
        os.remove("temp.msg")
        return text.strip()
    except Exception as e:
        print(f"❗ MSG read error: {e}")
        return ""

def extract_text(file_io, ext):
    ext = ext.lower()
    
    file_io.seek(0)
    sig = file_io.read(4)
    file_io.seek(0)
    
    extractors = {
        ".pdf": [
            extract_from_pdf,
            extract_with_ocr
        ],
        ".docx": [
            extract_from_docx,
            extract_with_ocr
        ],
        ".doc": [extract_from_doc],
        ".txt": [extract_from_txt,],
        ".msg": [extract_from_msg],
    }

    for extractor in extractors.get(ext, []):
        file_io.seek(0)
        text = extractor(file_io)
        if text:
            return text, extractor.__name__

    return "", "none"


def upload_chunks_to_s3_vector_index(chunks, vector_bucket_name, INDEX_NAME):

    MAX_BATCH_SIZE = 500
    texts = [chunk["metadata"]["text"] for chunk in chunks]
    keys = [chunk["key"] for chunk in chunks]

    embeddings = get_text_embedding(texts)
    if embeddings and isinstance(embeddings[0], float):
        embeddings = [embeddings]

    vectors = []
    for i in range(len(chunks)):
        if embeddings[i] == None:
            continue
        vector_metadata = chunks[i]["metadata"].copy()
        vectors.append({
            "key": keys[i],
            "data": {"float32": embeddings[i]},
            "metadata": vector_metadata
        })

    responses = []
    for i in range(0, len(vectors), MAX_BATCH_SIZE):
        # print(f"Uploading batch {i // MAX_BATCH_SIZE + 1} of {((len(vectors) - 1) // MAX_BATCH_SIZE + 1)}")
        batch = vectors[i:i + MAX_BATCH_SIZE]
        response = s3v.put_vectors(
            vectorBucketName=vector_bucket_name,
            indexName=INDEX_NAME,
            vectors=batch
        )
        responses.append(response)

    return responses


def mark_csv_file_processed(file_key, bucket="ml-legal-restricted", csv_path="gathered_contract_files.csv"):
    full_s3_path = f"s3://{bucket}/{file_key}"

    df = pd.read_csv(csv_path)

    match_idx = df[df['S3_full_path'] == full_s3_path].index

    if not match_idx.empty:
        df.loc[match_idx, 'S3_vectors'] = True
        df.to_csv(csv_path, index=False)
        return True
    return False


def process_documents(bucket, items, INDEX_NAME):
    stats = {
        "processed": 0, "failed": 0,
        "pdf": 0, "docx": 0, "txt": 0, "msg": 0, "doc": 0,
        "ocr": 0, "none": 0
    }

    all_chunks = []

    for idx, item  in enumerate(items, 1):
        file_key = item["file_key"]
        metadata = item.get("metadata", {}).copy()

        print(f"\n📄 Processing {idx}/{len(items)}: {file_key}")
        ext = os.path.splitext(file_key)[1].lower()
        metadata["s3_path"] = f"s3://{bucket}/{file_key}"

        try:
            file_io = download_s3_file(bucket, file_key)
            text, method = extract_text(file_io, ext)

            if not text:
                print("⚠️ No text extracted.")
                stats["failed"] += 1
                stats["none"] += 1
                continue
            
            if INDEX_NAME == 'token-chunking':
                chunks = chunk_text(text, metadata)
            elif INDEX_NAME == 'overlap-chunking':
                chunks = chunk_text_with_overlap(text, metadata)
            elif INDEX_NAME == 'semantic-split-chunking':
                chunks = chunk_with_semantic_split(text, metadata)
            else:
                chunks = chunk_text(text, metadata)

            # sample = chunks[0]['text'][:80].replace('\n', ' ')
            # sample = chunks
            # print(f"📝 Sample: {sample}...")

            # all_chunks.extend(chunks)
            response = upload_chunks_to_s3_vector_index(chunks, VECTOR_BUCKET_NAME, INDEX_NAME)
            if response:
                marked = mark_csv_file_processed(file_key)
                if marked:
                    print(f"✅ Extracted {len(chunks)} and Successfully uploaded to S3 vector Index and Marked in CSV")
            else:
                print(f"❗Error: Extracted {len(chunks)} and unable to upload to S3 vector Index")
            
            stats["processed"] += 1
            stats[ext.replace(".", "")] += 1
            if "ocr" in method: stats["ocr"] += 1

        except Exception as e:
            print(f"❗ Error: {e}")
            stats["failed"] += 1
            stats["none"] += 1

    print("\n=== Summary ===")
    first_keys = ['processed', 'failed']
    first_line = []
    second_line = []

    for k, v in stats.items():
        key_formatted = k.capitalize().replace('_', ' ')
        pair = f"{key_formatted}: {v}"
        if k.lower() in first_keys:
            first_line.append(pair)
        else:
            second_line.append(pair)

    print("  ".join(first_line))
    print("  ".join(second_line))

    return stats

def read_s3_keys_from_excel(excel_path, sample_size, sheet_name="S3 File Paths"):
    print(f"📥 Reading S3 paths from Excel: {excel_path}, sheet: {sheet_name}")
    df = pd.read_excel(excel_path, sheet_name=sheet_name, engine="openpyxl")

    file_columns = [col for col in df.columns if col.startswith("S3 File")]

    if not file_columns:
        raise ValueError(f"No columns starting with 'S3 File' found in sheet '{sheet_name}'")

    all_paths = []
    for col in file_columns:
        for cell in df[col].dropna():
            path = str(cell).strip()
            if path:
                all_paths.append(path)

    print(f"🔍 Total paths found: {len(all_paths)}")

    valid_keys = []
    invalid_paths = []

    for path in all_paths:
        if path.startswith("s3://"):
            parts = path.replace("s3://", "").split("/", 1)
            if len(parts) == 2 and parts[1].strip():
                valid_keys.append(parts[1])
            else:
                invalid_paths.append(path)
        else:
            invalid_paths.append(path)

    print(f"📁 Parsed S3 keys: {len(valid_keys)}")

    if invalid_paths:
        print(f"⚠️ Skipped invalid paths: {len(invalid_paths)}")
        print("\n⚠️ Skipped Paths (sample):")
        for bad in invalid_paths[:10]:
            print(f" - {bad}")
        if len(invalid_paths) > 10:
            print(f" ...and {len(invalid_paths) - 10} more.")

    # selected_keys = random.sample(valid_keys, min(sample_size, len(valid_keys)))
    # print(f"🎯 Randomly selected {len(selected_keys)} files.")
    # return selected_keys

    print(f"📄 Reading metadata from sheet: 'Active Legal Contracts'")
    df_meta = pd.read_excel(excel_path, sheet_name="Active Legal Contracts", engine="openpyxl")
    meta_map = df_meta.set_index('Contract Number')[['Account', 'Document Type']].to_dict('index')

    result = []
    for key in valid_keys:
        parts = key.split("/")
        contract_number = parts[1] if len(parts) > 1 else None
        metadata = meta_map.get(int(contract_number), {}) if contract_number and contract_number.isdigit() else {}

        result.append({
            'file_key': key,
            'metadata': {
                'client_account': metadata.get('Account', 'NA'),
                'document_type': metadata.get('Document Type', 'NA')
            }
        })

    return result

def extract_csv_file_metadata(csv_path):
    df = pd.read_csv(csv_path)

    #incase we want to resume where we left off
    df['S3_vectors'] = df['S3_vectors'].astype(str).str.lower()
    df_filtered = df[df['S3_vectors'] != 'true']
    print(f"✅ Rows included after filtering (S3_vectors != True): {len(df_filtered)}")

    results = []

    for _, row in df_filtered.iterrows():
        file_key = row['S3_full_path'].replace(f"s3://{row['S3_full_path'].split('/')[2]}/", "")
        metadata = {
            'client_account': row.get('client_account', 'NA'),
            'document_type': row.get('doc_type', 'NA')
        }

        results.append({
            'file_key': file_key,
            'metadata': metadata
        })

    return results

if __name__ == "__main__":

    # file_keys_with_meta = read_s3_keys_from_excel(EXCEL_PATH, sample_size=20)
    INDEX_NAME = "token-chunking-poc"

    file_keys_with_meta = extract_csv_file_metadata("gathered_contract_files.csv")
    
    final_stats = process_documents(S3_BUCKET, file_keys_with_meta, INDEX_NAME)


✅ Rows included after filtering (S3_vectors != True): 48

📄 Processing 1/48: contract-docs/52910/1199 ASP Upsell Financial Summary.xlsx
⚠️ No text extracted.

📄 Processing 2/48: contract-docs/76438/3M License Agreement_Lorica Health_Codefinder_GPCS_ 29th March 2023 to 28th March 2026_Final.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (836 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 3/48: contract-docs/61807/BAA to Master Software and Service Agreement (Exhibit A).pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 4/48: contract-docs/60590/Coviti_CTR55948-20_NEW_3_Year_v6_MJG_9.8.2020_EK.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 5/48: contract-docs/58822/AAH.05 Professional Medical Coding Instructor License Agreement between Cotiviti and AAPC(712376.3).docx
❗ DOCX error: "There is no item named 'customXML/item3.xml' in the archive"
❗ OCR failed: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

⚠️ No text extracted.

📄 Processing 6/48: contract-docs/59807/EXTERNAL EMAIL - USE CAUTION FW Cotiviti - AArete FFP SOW Termination Notice.htm
⚠️ No text extracted.

📄 Processing 7/48: contract-docs/51772/Verscend PAF 6-05-18 (AArete).xlsx
⚠️ No text extracted.

📄 Processing 8/48: contract-docs/53624/AK Steel - Verscend - NDA - JSW - 082018 - Executed DX 53624.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 9/48: contract-docs/53624/AK Steel Verscend NDA jsw 8.16.18 FINAL_signed.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 10/48: contract-docs/58512/ASA_2021 Crosswalk eFile_Quote_.docx
❗ OCR failed: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

⚠️ No text extracted.

📄 Processing 11/48: contract-docs/58512/ASA_3PCompliance.docx
❗ OCR failed: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

⚠️ No text extracted.

📄 Processing 12/48: contract-docs/58512/ASA_3PSecurity.docx
❗ OCR failed: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

⚠️ No text extracted.

📄 Processing 13/48: contract-docs/80476/ATT_BA Terms_Final_07122012_Executed.tif
⚠️ No text extracted.

📄 Processing 14/48: cont

Token indices sequence length is longer than the specified maximum sequence length for this model (5094 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 18/48: contract-docs/69061/Accurate PUR Review 2022.07.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (5094 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 19/48: contract-docs/72101/Cotiviti_Proposal_2023-0509_Cotiviti Edits_v2.pptx
⚠️ No text extracted.

📄 Processing 20/48: contract-docs/56974/Actian - Cotiviti - Q - Actian Corporation (subscription renewal for two years) 6.30.19 - SC - 06272019 - 56974.pdf'
⚠️ No text extracted.

📄 Processing 21/48: contract-docs/56911/Actian - Cotiviti - Name Change Amendment - SC - 06272019 - 56911.pdf'
⚠️ No text extracted.

📄 Processing 22/48: contract-docs/78744/_CPIG_FINAL_Short_Form_Findings for Birch Grove (R-03129) - Feb 2025.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (30982 > 512). Running this sequence through the model will result in indexing errors


[ERROR] Failed to get embedding!
❗ Error: 'NoneType' object is not subscriptable

📄 Processing 23/48: contract-docs/53757/ACG 3rd Fl data-2  PAF 090518.xlsx
⚠️ No text extracted.

📄 Processing 24/48: contract-docs/52628/Draper Parking Lights PAF 062218.xlsx
⚠️ No text extracted.

📄 Processing 25/48: contract-docs/52627/SoJo 3rd Fl data runs PAF 062218.xlsx
⚠️ No text extracted.

📄 Processing 26/48: contract-docs/51879/Verscend PAF_ACG_Fiber_Draper_3.22.18.xlsx
⚠️ No text extracted.

📄 Processing 27/48: contract-docs/51901/ACG Audio Visual Relocation PAF040418.xlsx
⚠️ No text extracted.

📄 Processing 28/48: contract-docs/51883/Verscend PAF_ACG_CableManagement_3.28.18.xlsx
⚠️ No text extracted.

📄 Processing 29/48: contract-docs/51761/Verscend PAF_ACG_Fiber_3.14.18.xlsx
⚠️ No text extracted.

📄 Processing 30/48: contract-docs/74838/DM-BR Teams Exchange 8.22.24.png
⚠️ No text extracted.

📄 Processing 31/48: contract-docs/53934/Verscend PAF Check paper User Form XL v18 9-16-18.xlsx
⚠️ No t

In [None]:
INDEX_NAME = 'token-chunking-poc'

paginator = s3v.get_paginator('list_vectors')

total_vectors_count = 0

page_iterator = paginator.paginate(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    returnData=True,
    returnMetadata=True,
    PaginationConfig={
        'PageSize': 1000  
    }
)

for page in page_iterator:
    vectors = page.get('vectors', [])
    total_vectors_count += len(vectors)

print(f"Total chunks vector & stored: {total_vectors_count}")

#17374

Total chunks vector & stored: 17598


In [14]:
def query_s3_vector_store(query_text, client_account_filter, INDEX_NAME, top_k = 5):
    print(f"\n--- Processing Query: '{query_text}' ---")

    query_embedding = get_text_embedding(query_text)
    filter_expression = None
    if client_account_filter is not None:
        filter_expression = {
            "client_account": {
                "$eq": client_account_filter
            }
        }

    try:
        response = s3v.query_vectors(
            vectorBucketName=VECTOR_BUCKET_NAME,
            indexName=INDEX_NAME,
            topK=top_k,
            queryVector={
                'float32': query_embedding
            },
            returnMetadata=True,
            returnDistance=True,
            filter=filter_expression
        )
        return response
    except Exception as e:
        print(f"Error querying S3 Vector Store: {e}")
        return None

user_questions = [
    "What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?", #66179
    "Under Schedule C, what services is Cotiviti required to provide?", #66179
    "In the Prepay FWAV Services section, what are Cotiviti's main deliverables?", #66179
    "What restrictions are placed on disclosing confidential information?", #67566
    "What is Amendment #4 to the Verisk Health License Agreement about?", #53985
    "What is the purpose of Amendment #4 as stated in the document?", #53985
]

INDEX_NAME = "token-chunking-poc"
client = None #'UST Global'

for question in user_questions:
    query_results = query_s3_vector_store(question, client, INDEX_NAME, top_k=5)

    if query_results and 'vectors' in query_results:
        print(f"Retrieved {len(query_results['vectors'])}")
        for i, chunk_data in enumerate(query_results['vectors']):
            print(f"    Metadata {i+1}: {chunk_data['metadata']}")
            # print(f"    Distance: {chunk_data.get('distance', 'N/A')}")
    else:
        print(f"No results or an error occurred for query: '{question}'")



--- Processing Query: 'What obligations does Cotiviti have under Schedule C for Prepay FWAV Services?' ---
Retrieved 0

--- Processing Query: 'Under Schedule C, what services is Cotiviti required to provide?' ---
Retrieved 0

--- Processing Query: 'In the Prepay FWAV Services section, what are Cotiviti's main deliverables?' ---
Retrieved 0

--- Processing Query: 'What restrictions are placed on disclosing confidential information?' ---
Retrieved 0

--- Processing Query: 'What is Amendment #4 to the Verisk Health License Agreement about?' ---
Retrieved 0

--- Processing Query: 'What is the purpose of Amendment #4 as stated in the document?' ---
Retrieved 0


LLM Integration

In [None]:
from openai import AzureOpenAI
import time

AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
AZURE_OPENAI_API_KEY = "936856630b764210913d9a8fd6c8212b"
AZURE_DEPLOYMENT_NAME = "gpt-4o"

INDEX_NAME = "token-chunking-poc"
client_name = None

def load_azure_client():
    return AzureOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        api_version="2023-05-15",
        azure_endpoint=AZURE_OPENAI_ENDPOINT
    )

azure_client = load_azure_client()

def build_prompt(query, top_chunks):
    context = ""
    source_refs = {}
    chunks = top_chunks['vectors']
    for i, chunk in enumerate(chunks):
        ref = f"[{i+1}]"
        metadata = chunk.get("metadata", {})
        source = metadata.get("s3_path", "unknown")
        chunk_text = metadata.get("text", "")
        
        context += f"{ref} ({source}):\n{chunk_text}\n\n"
        source_refs[ref] = metadata

    prompt = f"""You are a helpful assistant. Use only the following context to answer the question.
Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned.

Context:
{context}

Question: {query}

Answer:"""
    return prompt, source_refs

def run_query_pipeline(query, top_k=5):
    print(f"\n🔍 Running RAG query for: {query}\n")
    start = time.time()

    chunks = query_s3_vector_store(query, client_name, INDEX_NAME, top_k=top_k)

    if not chunks:
        print("❗ No chunks returned from vector store.")
        return None, {}, 0, []

    prompt, refs = build_prompt(query, chunks)

    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant. Use only the following context to answer the question. "
                           "Cite sources using [1], [2], etc., based only on the exact chunks below. "
                           "Do not make up citations. Do not include sources not explicitly mentioned."
            },
            {"role": "user", "content": prompt}
        ]
    )

    answer = response.choices[0].message.content.strip()
    latency = round((time.time() - start) * 1000, 2)

    print(f"\nAnswer:\n{answer}\n")
    print(f"Latency: {latency} ms")
    print(f"\nTop {top_k} Chunks Returned:")
    chunks = chunks['vectors']
    for i, chunk in enumerate(chunks, 1):
        meta = chunk.get("metadata", {})
        print(f"\n--- Chunk [{i}] ---")
        print(f"Text: {meta.get('text', '')[:300]}{'...' if len(meta.get('text', '')) > 300 else ''}")
        print("Metadata:")
        for k, v in meta.items():
            if k != "text":
                print(f"  {k}: {v}")

    return answer, refs, latency, chunks


def pretty_print_rag_output(answer, refs, latency, chunks):
    print("\n=== Final RAG Output ===")
    
    print("\nAnswer:\n" + "-"*60)
    print(answer)

    print("\nSource References:\n" + "-"*60)
    for ref, meta in refs.items():
        print(f"{ref}:")
        for key, value in meta.items():
            if key == "text":
                print(f"  text: {value[:200].strip()}..." if len(value) > 200 else f"  text: {value.strip()}")
            else:
                print(f"  {key}: {value}")

    print("\nLatency:\n" + "-"*60)
    print(f"{latency} ms")

    print("\nChunks:\n" + "-"*60)
    for i, chunk in enumerate(chunks, 1):
        meta = chunk.get("metadata", {})
        print(f"Chunk [{i}]:")
        print(f"  file_name: {meta.get('file_name')}")
        print(f"  client_account: {meta.get('client_account')}")
        print(f"  document_type: {meta.get('document_type')}")
        print(f"  s3_path: {meta.get('s3_path')}")
        print("  text preview:")
        text_preview = meta.get("text", "")
        print("    " + text_preview[:300].replace("\n", " ") + ("..." if len(text_preview) > 300 else ""))
        print("-"*60)

query = "Can we use client data to develop or test new services, and enhance, improve, or modify our existing services?"

answer, refs, latency, chunks = run_query_pipeline(query, top_k=5)
pretty_print_rag_output(answer, refs, latency, chunks)

#Batch RUN

In [8]:
def run_batch_rag_queries(queries, top_k=5):
    all_results = []

    for i, query in enumerate(queries, 1):
        print(f"\nRunning query {i}/{len(queries)}: \"{query}\"")
        answer, refs, latency, chunks = run_query_pipeline(query, top_k=top_k)

        result_entry = {
            "query": query,
            "answer": answer,
            "latency_ms": latency,
            "refs": refs,
            "chunks": chunks
        }

        all_results.append(result_entry)
    
    return all_results

def write_rag_results_to_file(results, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for i, result in enumerate(results, 1):
            f.write(f"\n=== Query {i} ===\n")
            f.write(f"Query: {result['query']}\n")
            f.write(f"Latency: {result['latency_ms']} ms\n")

            f.write("\nAnswer:\n")
            f.write(result["answer"] + "\n")

            f.write("\nSource References:\n")
            for ref, meta in result["refs"].items():
                f.write(f"{ref}:\n")
                for k, v in meta.items():
                    if k == "text":
                        preview = v[:200].strip().replace("\n", " ")
                        f.write(f"  text: {preview}...\n" if len(v) > 200 else f"  text: {preview}\n")
                    else:
                        f.write(f"  {k}: {v}\n")

            f.write("\nChunks:\n")
            for j, chunk in enumerate(result["chunks"], 1):
                meta = chunk.get("metadata", {})
                f.write(f"Chunk [{j}]:\n")
                f.write(f"  file_name: {meta.get('file_name')}\n")
                f.write(f"  client_account: {meta.get('client_account')}\n")
                f.write(f"  document_type: {meta.get('document_type')}\n")
                f.write(f"  s3_path: {meta.get('s3_path')}\n")
                preview = meta.get("text", "").replace("\n", " ")[:300]
                f.write(f"  text preview: {preview}...\n" if len(preview) == 300 else f"  text preview: {preview}\n")
                f.write("-" * 60 + "\n")
            f.write("\n" + "=" * 80 + "\n")

doc_questions = [
    "Can we use client data (including PHI) to develop or test new services, and enhance, improve, or modify our existing services?",
    "Can we use aggregated client data or de-identified client data for the purposes of developing, testing, enhancing, improving, or modifying services?",
    "Are there any restrictions on using artificial intelligence or machine learning in delivering the services?",
    "Are there any specific terms regarding the use of client data for training and fine-tuning AI models?",
    "Are there any limitations on storing client data (including PHI) in the cloud?",
    "Are there any restrictions on using cloud services to assist in the processing of client data (including PHI)?",
    "Is there any requirement for human oversight or decision-making in the use of AI for delivering the services?",
    "Does the client have any ownership rights in developed IP or developed materials generated from the use of their data?",
    "Are there any specific terms regarding the ownership and usage rights of IP developed through the use of client data?",
    "Are there any restrictions, prohibitions, or notice requirements related to the incorporation of open-source code into any solution?",
    "Are there any obligations to notify the client of material improvements or enhancements to a solution?",
    "Is client approval required for implementing significant changes or upgrades to the services?",
    "Are there any restrictions or requirements related to the use of third-party vendors or subcontractors in the processing of client data?",
    "Are there any clauses requiring explicit client consent for the use of their data in specific ways, such as for R&D purposes or AI training?",
    "Does the client have any rights to audit our data processing practices or AI model training processes?"
]

all_results = run_batch_rag_queries(doc_questions, top_k=5)
output_file="rag_batch_output.txt"
write_rag_results_to_file(all_results, output_file)
print(f"\nResults saved to '{output_file}'")




Running query 1/15: "Can we use client data (including PHI) to develop or test new services, and enhance, improve, or modify our existing services?"

🔍 Running RAG query for: Can we use client data (including PHI) to develop or test new services, and enhance, improve, or modify our existing services?


--- Processing Query: 'Can we use client data (including PHI) to develop or test new services, and enhance, improve, or modify our existing services?' ---

Answer:
Based on the available context, using client data, including PHI (Protected Health Information), to develop or test new services, or enhance, improve, or modify existing services, may be permissible under certain conditions. Specifically, 3M can use PHI for the proper management and administration of its operations or to carry out its legal responsibilities, provided it obtains reasonable assurances from third parties to whom the PHI is disclosed that it will remain confidential, be used or further disclosed only as required 

AWS open-search-S3 vector mirroring and querying

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

region = 'us-east-1'
host = 'g7a6yvmq4wc43rvrzp89.us-east-1.aoss.amazonaws.com'

session = boto3.Session()
credentials = session.get_credentials().get_frozen_credentials()

awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'aoss',
    session_token=credentials.token
)

opensearch_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

index_name = 'token-chunking'  # Use hyphen or underscore, avoid spaces

# Check if index exists, returns True or False
index_exists = opensearch_client.indices.exists(index=index_name)

if index_exists:
    print(f"Index '{index_name}' exists.")
else:
    print(f"Index '{index_name}' does NOT exist.")

def query_opensearch_vector_store(query_text, index_name, embed_model, vector_field='embedding', top_k=5):
    print(f"\n--- Querying OpenSearch: '{query_text}' ---")

    query_vector = embed_model.get_text_embedding(query_text)

    query_body = {
        "size": top_k,
        "query": {
            "knn": {
                "embedding": {
                    "vector": query_vector,
                    "k": top_k
                }
            }
        }
    }

    try:
        response = opensearch_client.search(index=index_name, body=query_body)
        return response
    except Exception as e:
        print(f"Error querying OpenSearch: {e}")
        return None

response = query_opensearch_vector_store("climate change effects", "token-chunking", embed_model)
if response:
    for hit in response["hits"]["hits"]:
        print(f"Score: {hit['_score']}, Metadata: {hit['_source'].get('metadata')}")