In [None]:
!pip install --upgrade boto3==1.39.8
# !sudo apt install antiword 
import io
import uuid
import os
import random
import nltk
import boto3
import pytesseract
import textract
from pdf2image import convert_from_bytes
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer
import extract_msg
import tempfile
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from sentence_transformers import SentenceTransformer

# embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")

S3_BUCKET = "ml-legal-restricted"

nltk.download('punkt')
nltk.download('punkt_tab')
s3 = boto3.client('s3')
s3v = boto3.client("s3vectors", region_name="us-east-1")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def download_excel(bucket, key, sheet_name="Active Legal Contracts", column="Contract Number"):
    print(f"📥 Downloading Excel: s3://{bucket}/{key}")
    obj = s3.get_object(Bucket=bucket, Key=key)
    excel_data = obj['Body'].read()

    df = pd.read_excel(io.BytesIO(excel_data), sheet_name=sheet_name, engine='openpyxl')
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in sheet '{sheet_name}'")

    contract_numbers = df[column].dropna().astype(str).str.strip().tolist()
    return contract_numbers, df, excel_data


def list_s3_files_for_contract(bucket, contract_number, prefix_base="contract-docs/"):
    prefix = f"{prefix_base}{contract_number}/"
    files = []
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            files.append(obj["Key"])
    return files


def add_modified_sheet_with_files(wb, original_df, file_map):
    ws = wb.create_sheet("Active Legal Contracts + Files")
    headers = original_df.columns.tolist()
    max_files = max((len(files) for files in file_map.values()), default=0)
    headers += [f"File {i+1}" for i in range(max_files)]
    ws.append(headers)

    for idx, row in original_df.iterrows():
        contract_number = str(row["Contract Number"]).strip()
        files = [os.path.basename(f) for f in file_map.get(contract_number, [])]
        base_row = row.tolist()
        ws.append(base_row + files)


def add_s3_paths_sheet(wb, file_map, bucket):
    ws = wb.create_sheet("S3 File Paths")
    max_files = max((len(files) for files in file_map.values()), default=0)

    headers = ["Contract Number"] + [f"S3 File {i+1}" for i in range(max_files)]
    ws.append(headers)

    for contract, keys in file_map.items():
        s3_paths = [f"s3://{bucket}/{key}" for key in keys]
        ws.append([contract] + s3_paths)

def save_missing_contracts_to_csv(missing_contracts, output_path="missing_contracts.csv"):
    if not missing_contracts:
        print("✅ No missing contracts to save.")
        return

    df = pd.DataFrame(missing_contracts, columns=["Contract Number"])
    df.to_csv(output_path, index=False)
    print(f"📄 Missing contracts CSV saved to: {output_path}")
    
def process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx"):
    contract_numbers, original_df, excel_bytes = download_excel(bucket, excel_key)

    file_map = {}
    missing_contracts = []

    print(f"\n🔍 Scanning {len(contract_numbers)} contract numbers across S3...")
    for idx, number in enumerate(contract_numbers, 1):
        if idx % 1000 == 0 or idx == 1:
            print(f"🔢 [{idx}/{len(contract_numbers)}] Scanning: {number}")
        files = list_s3_files_for_contract(bucket, number)

        if not files and len(number) < 8 and number.isdigit():
            padded = number.zfill(8)
            print(f"   ➕ Retrying with padded contract number: {padded}")
            files = list_s3_files_for_contract(bucket, padded)
            
        if files:
            file_map[number] = files
        else:
            missing_contracts.append(number)

    print("\n🧾 Preparing final Excel workbook...")
    wb = load_workbook(io.BytesIO(excel_bytes))
    add_modified_sheet_with_files(wb, original_df, file_map)
    add_s3_paths_sheet(wb, file_map, bucket)
    wb.save(output_path)
    print(f"✅ Final Excel saved: {output_path}")

    print("\n=== Final Summary ===")
    print(f"📄 Total contracts processed: {len(contract_numbers)}")
    print(f"✅ Contracts with files: {len(file_map)}")
    print(f"❌ Contracts with NO files found: {len(missing_contracts)}")
    if missing_contracts:
        print(f"🔍 Sample missing contract numbers: {missing_contracts[:5]}{'...' if len(missing_contracts) > 5 else ''}")
        save_missing_contracts_to_csv(missing_contracts)

if __name__ == "__main__":
    bucket = "ml-legal-restricted"
    excel_key = "tabularData/Active Legal Contracts 7-10-2025 1-17-09 PM.xlsx"
    process_entire_bucket(bucket, excel_key, output_path="full_contracts_with_files.xlsx")


In [None]:
BUCKET_NAME = "legal-docs-vector-store"
# INDEX_NAME = "token-chunking"
INDEX_NAME = "overlap-chunking"
# INDEX_NAME = "semantic-split-chunking"
VECTOR_DIM = 768
DISTANCE_METRIC = "cosine"
NON_FILTERABLE_KEYS = ['text']

response = s3v.create_index(
    vectorBucketName=BUCKET_NAME,
    indexName=INDEX_NAME,
    dataType="float32",
    dimension=VECTOR_DIM,
    distanceMetric=DISTANCE_METRIC,
    metadataConfiguration={
        "nonFilterableMetadataKeys": NON_FILTERABLE_KEYS
    }
)

print(f"Created index: {response}")


Created index: {'ResponseMetadata': {'RequestId': '3b8fa45d-ab11-4fd6-9013-d9c7911c0d8a', 'HostId': '', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 18 Jul 2025 20:42:35 GMT', 'content-type': 'application/json', 'content-length': '2', 'connection': 'keep-alive', 'x-amz-request-id': '3b8fa45d-ab11-4fd6-9013-d9c7911c0d8a', 'access-control-allow-origin': '*', 'vary': 'origin, access-control-request-method, access-control-request-headers', 'access-control-expose-headers': '*'}, 'RetryAttempts': 0}}


In [21]:
def _build_nodes(chunks, s3_path, file_name):
    nodes = []
    for chunk in chunks:
        node = TextNode(text=chunk, metadata={"s3_path": s3_path, "file_name": file_name})
        nodes.append(node)

    # link nodes in sequence
    for i, n in enumerate(nodes):
        if i > 0:
            prev = nodes[i-1]
            n.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(node_id=prev.node_id)
        if i < len(nodes)-1:
            nxt = nodes[i+1]
            n.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(node_id=nxt.node_id)
    return nodes

def chunk_text(text, s3_path, token_limit=500, tokenizer_name="bert-base-uncased"):
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    sentences = sent_tokenize(text)
    chunks, current, count = [], [], 0

    for sent in sentences:
        toks = tokenizer.tokenize(sent)
        if count + len(toks) > token_limit and current:
            chunks.append(" ".join(current))
            current, count = [], 0
        current.append(sent)
        count += len(toks)

    if current:
        chunks.append(" ".join(current))

    return [{"key": str(uuid.uuid4()), "metadata": {"text": chunk, "s3_path": s3_path, "file_name": file_name}} for chunk in chunks]

    nodes = _build_nodes(chunks, s3_path, file_name)
    texts = [n.text for n in nodes]
    embeds = embed_model.get_text_embedding_batch(texts)

    for n, v in zip(nodes, embeds):
        n.embedding = v
    
    return nodes


def chunk_text_with_overlap(text, s3_path, token_limit=500, chunk_overlap=50, tokenizer_name="bert-base-uncased"):
    file_name = os.path.basename(s3_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    sentences, chunks, curr, curr_toks = sent_tokenize(text), [], [], 0

    for sent in sentences:
        stoks = len(tokenizer.tokenize(sent))
        if stoks > token_limit:
            words = sent.split()
            i = 0
            while i < len(words):
                segment = words[i:i+token_limit]
                chunks.append(" ".join(segment))
                i += token_limit - chunk_overlap
            continue

        if curr_toks + stoks <= token_limit:
            curr.append(sent); curr_toks += stoks
        else:
            chunks.append(" ".join(curr))
            # build overlap
            overlap, tot = [], 0
            for s in reversed(curr):
                l = len(tokenizer.tokenize(s))
                if tot + l > chunk_overlap:
                    break
                overlap.insert(0, s); tot += l
            curr = overlap + [sent]
            curr_toks = sum(len(tokenizer.tokenize(s)) for s in curr)

    if curr:
        chunks.append(" ".join(curr))
    
    return [{"key": str(uuid.uuid4()), "metadata": {"text": chunk, "s3_path": s3_path, "file_name": file_name}} for chunk in chunks]

    nodes = _build_nodes(chunks, s3_path, file_name)
    texts = [n.text for n in nodes]
    embeds = embed_model.get_text_embedding_batch(texts)

    for n, v in zip(nodes, embeds):
        n.embedding = v
    
    return nodes

def chunk_with_semantic_split(text, s3_path, 
                              buffer_size=1,
                              breakpoint_percentile_threshold=95):

    from llama_index.core import Document

    metadata = {"s3_path": s3_path, "file_name": os.path.basename(s3_path)}
    doc = Document(text=text, metadata=metadata)

    parser = SemanticSplitterNodeParser.from_defaults(
        embed_model=embed_model,
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        include_metadata=False,
        include_prev_next_rel=False
    )

    nodes = parser.get_nodes_from_documents([doc])
    texts = [n.text for n in nodes]
    # embeds = embed_model.get_text_embedding_batch(texts)

    result = []
    for n in nodes:
        unique_id = str(uuid.uuid4())
        result.append({
            "key": unique_id,
            "metadata": {
                "text": n.text,
                "s3_path": s3_path,
                "file_name": os.path.basename(s3_path)
            },
        })

    return result

    for n, v in zip(nodes, embeds):
        n.embedding = v
    
    return nodes

In [None]:
SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".txt", ".msg", ".doc"}

def list_supported_files(bucket, prefix=""):
    paginator = s3.get_paginator("list_objects_v2")
    all_files, supported, unsupported = [], [], []

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            all_files.append(key)
            ext = os.path.splitext(key)[1].lower()
            if ext in SUPPORTED_EXTENSIONS:
                supported.append(key)
            else:
                unsupported.append(key)

    print("\n=== File Summary ===")
    print(f"📁 Total files: {len(all_files)}")
    print(f"✅ Supported files: {len(supported)}")
    print(f"❌ Unsupported: {len(unsupported)} (Sample: {unsupported[:5]})\n")
    return supported

def download_s3_file(bucket, key):
    response = s3.get_object(Bucket=bucket, Key=key)
    return io.BytesIO(response["Body"].read())

def extract_from_doc(file_io):
    try:
        file_io.seek(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".doc") as temp_file:
            temp_file.write(file_io.read())
            temp_path = temp_file.name

        text = textract.process(temp_path).decode("utf-8").strip()
        os.remove(temp_path)
        return text
    except Exception as e:
        print(f"❗ DOC (textract) error: {e}")
        return ""


def extract_with_ocr(file_io):
    try:
        file_io.seek(0)
        images = convert_from_bytes(file_io.read())
        return "\n".join(pytesseract.image_to_string(img) for img in images).strip()
    except Exception as e:
        print(f"❗ OCR failed: {e}")
        return ""

def extract_from_pdf(file_io):
    try:
        reader = PdfReader(file_io)
        return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()]).strip()
    except Exception as e:
        print(f"❗ PDF read error: {e}")
        return ""

# from docx import Document
# from io import BytesIO
# def extract_from_docx(file_io):
#     try:
#         file_io.seek(0)
#         doc = Document(file_io.read())
#         return "\n".join(p.text for p in doc.paragraphs).strip()
#     except Exception as e:
#         print(f"❗ DOCX error: {e}")
#         return ""

from docx import Document
from io import BytesIO

def extract_from_docx(file_io):
    try:
        if hasattr(file_io, "read"):
            file_io.seek(0)
            content = file_io.read()
            bio = BytesIO(content)
            
            doc = Document(bio)
        else:

            bio = BytesIO(file_io)
            doc = Document(bio)
            
        text = "\n".join(p.text for p in doc.paragraphs).strip()
        return text
    except Exception as e:
        print(f"❗ DOCX error: {e}")
        return ""


def extract_from_txt(file_io):
    try:
        return file_io.read().decode("utf-8").strip()
    except Exception as e:
        print(f"❗ TXT read error: {e}")
        return ""

def extract_from_msg(file_io):
    try:
        with open("temp.msg", "wb") as f:
            f.write(file_io.read())
        msg = extract_msg.Message("temp.msg")
        text = msg.body or ""
        os.remove("temp.msg")
        return text.strip()
    except Exception as e:
        print(f"❗ MSG read error: {e}")
        return ""

def extract_text(file_io, ext):
    ext = ext.lower()

    extractors = {
        ".pdf": [
            extract_from_pdf,
            extract_with_ocr
        ],
        ".docx": [
            extract_from_docx,
            extract_with_ocr
        ],
        ".doc": [extract_from_doc],
        ".txt": [extract_from_txt,],
        ".msg": [extract_from_msg],
    }

    for extractor in extractors.get(ext, []):
        file_io.seek(0)
        text = extractor(file_io)
        if text:
            return text, extractor.__name__

    return "", "none"


def upload_chunks_to_s3_vector_index(chunks, embed_model, vector_bucket_name, index_name):

    texts = [chunk["metadata"]["text"] for chunk in chunks]
    keys = [chunk["key"] for chunk in chunks]

    embeddings = embed_model.get_text_embedding_batch(texts)

    vectors = []
    for i in range(len(chunks)):
        vector_metadata = chunks[i]["metadata"].copy()
        vectors.append({
            "key": keys[i],
            "data": {"float32": embeddings[i]},
            "metadata": vector_metadata
        })

    response = s3v.put_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name,
        vectors=vectors
    )

    return response


def process_documents(bucket, keys):
    stats = {
        "processed": 0, "failed": 0,
        "pdf": 0, "docx": 0, "txt": 0, "msg": 0, "doc": 0,
        "ocr": 0, "none": 0
    }

    all_chunks = []

    for idx, key in enumerate(keys, 1):
        print(f"\n📄 Processing {idx}/{len(keys)}: {key}")
        ext = os.path.splitext(key)[1].lower()
        s3_path = f"s3://{bucket}/{key}"

        try:
            file_io = download_s3_file(bucket, key)
            text, method = extract_text(file_io, ext)

            if not text:
                print("⚠️ No text extracted.")
                stats["failed"] += 1
                stats["none"] += 1
                continue

            chunks = chunk_text_with_overlap(text, s3_path)
            print(f"✅ Extracted {len(chunks)} chunks.")
            # sample = chunks[0]['text'][:80].replace('\n', ' ')
            sample = chunks
            print(f"📝 Sample: {sample}...")

            all_chunks.extend(chunks)
            stats["processed"] += 1
            stats[ext.replace(".", "")] += 1
            if "ocr" in method: stats["ocr"] += 1

        except Exception as e:
            print(f"❗ Error: {e}")
            stats["failed"] += 1
            stats["none"] += 1

    print("\n=== Summary ===")
    for k, v in stats.items():
        print(f"{k.capitalize().replace('_', ' ')}: {v}")

    return stats, all_chunks

def read_s3_keys_from_excel(excel_path, sheet_name="S3 File Paths", sample_size=1000):
    print(f"📥 Reading S3 paths from Excel: {excel_path}, sheet: {sheet_name}")
    df = pd.read_excel(excel_path, sheet_name=sheet_name, engine="openpyxl")

    file_columns = [col for col in df.columns if col.startswith("S3 File")]

    if not file_columns:
        raise ValueError(f"No columns starting with 'S3 File' found in sheet '{sheet_name}'")

    all_paths = []
    for col in file_columns:
        for cell in df[col].dropna():
            path = str(cell).strip()
            if path:
                all_paths.append(path)

    print(f"🔍 Total paths found: {len(all_paths)}")

    valid_keys = []
    invalid_paths = []

    for path in all_paths:
        if path.startswith("s3://"):
            parts = path.replace("s3://", "").split("/", 1)
            if len(parts) == 2 and parts[1].strip():
                valid_keys.append(parts[1])
            else:
                invalid_paths.append(path)
        else:
            invalid_paths.append(path)

    print(f"📁 Parsed S3 keys: {len(valid_keys)}")
    print(f"⚠️ Skipped invalid paths: {len(invalid_paths)}")

    if invalid_paths:
        print("\n⚠️ Skipped Paths (sample):")
        for bad in invalid_paths[:10]:
            print(f" - {bad}")
        if len(invalid_paths) > 10:
            print(f" ...and {len(invalid_paths) - 10} more.")

    selected_keys = random.sample(valid_keys, min(sample_size, len(valid_keys)))
    print(f"🎯 Randomly selected {len(selected_keys)} files.")
    return selected_keys

if __name__ == "__main__":
    S3_BUCKET = "ml-legal-restricted"
    EXCEL_PATH = "full_contracts_with_files.xlsx"
    SAMPLE_SIZE = 2
    VECTOR_BUCKET_NAME = "legal-docs-vector-store"
    # INDEX_NAME = "token-chunking"
    INDEX_NAME = "overlap-chunking"
    # INDEX_NAME = "semantic-split-chunking"

    file_keys = read_s3_keys_from_excel(EXCEL_PATH, sample_size=SAMPLE_SIZE)
    file_keys = ['contract-docs/68710/Approval Global Intellectuals SOW Sai Kiran Mandhala.docx']
    final_stats, chunks = process_documents(S3_BUCKET, file_keys)
    response = upload_chunks_to_s3_vector_index(chunks, embed_model, VECTOR_BUCKET_NAME, INDEX_NAME)
    if response:
        print(f"✅ Successfully uploaded to {INDEX_NAME}")



📥 Reading S3 paths from Excel: full_contracts_with_files.xlsx, sheet: S3 File Paths
🔍 Total paths found: 47933
📁 Parsed S3 keys: 47933
⚠️ Skipped invalid paths: 0
🎯 Randomly selected 2 files.

📄 Processing 1/1: contract-docs/68710/Approval Global Intellectuals SOW Sai Kiran Mandhala.docx
No Text
❗ OCR failed: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

⚠️ No text extracted.

=== Summary ===
Processed: 0
Failed: 1
Pdf: 0
Docx: 0
Txt: 0
Msg: 0
Doc: 0
Ocr: 0
None: 1


ParamValidationError: Parameter validation failed:
Invalid length for parameter vectors, value: 0, valid min length: 1

In [53]:
response = s3v.list_vectors(
    vectorBucketName=VECTOR_BUCKET_NAME,
    indexName=INDEX_NAME,
    returnData=True,
    returnMetadata=True
)

vectors = response.get("vectors", [])
print(len(vectors))

for vector in vectors:
    print(f"Key: {vector['key']}")
    print(f"Metadata: {vector.get('metadata',{})}")
    print(f"Embedding (first 5 dims): {vector['data']['float32'][:5]}")
    print("--------------")

next_token = response.get("nextToken")
if next_token:
    pass


60
Key: 3bccdeab-fe5b-4e78-9775-be14e78dcdaa
Metadata: {'s3_path': 's3://ml-legal-restricted/contract-docs/60581/Horan - Cotiviti - DUA - AS - 11102020 - Dynamics_60581.pdf', 'text': '1 \n   Non-Disclosure Agreement \nThis NON-DISCLOSURE AGREEMENT  (“Agreement”) is entered into by and between Cotiviti, Inc. \nfor itself and its wholly owned affiliates and subsidiaries (“Cotiviti”) and UMR, Inc. ( UMR entity) for \nitself and its affiliated companies (“ UMR ”) and will be effective on 10/13/2020. Cotiviti and UMR may \nindividually be referred to herein as a “Party” and may jointly be referred to together as the “Parties.” The \nParties acknowledge and agree as follows: \n A. Purpose:  Acme Truck Line, Inc. (“Employer”) and UMR entered into administrative services \nagreement under which UMR provides claims administration and other services for Employer’s employee \nwelfare benefit plan ( “Plan”). Horan Associates Inc (“Consultant”), has subcontracted with Cotiviti, Inc. \n(“Cotiviti”) 