In [1]:
from pymilvus import MilvusClient
import numpy as np
import os
import openai
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm
import numpy as np
import hashlib
import time
import fitz

In [2]:
import re

def clean_pdf_text(text):
    # Replace non-breaking spaces with regular space
    text = text.replace('\xa0', ' ')
    
    # Remove line breaks that aren't followed by a new paragraph or sentence
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    
    # Remove extra newlines
    text = re.sub(r'\n+', '\n', text)

    # Collapse multiple spaces into one
    text = re.sub(r' +', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

In [3]:
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
folder_path = "Supply_Chain_TBs"  # Make sure this folder exists

for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):  # Only process PDF files
        print(f"Processing: {filename}")
        
        # ✅ Fix: Join folder path with filename
        file_path = os.path.join(folder_path, filename)
        
        try:
            doc = fitz.open(file_path)
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            text = clean_pdf_text(text)
            doc_chunks = splitter.split_text(text)
            print("First 10 chunks:")
            print(doc_chunks[:10])
            break  # Remove this if you want to process all files
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

Processing: 978-3-031-85508-5.pdf
First 10 chunks:
['International Series in Operations Research & Management Science Dmitry Ivanov Alexandre Dolgui Boris Sokolov Editors Handbook of Ripple Effects in the Supply Chain Second Edition International Series in Operations Research & Management Science Founding Editor Frederick S. Hillier Volume 276 Series Editors Camille C. Price, Department of Computer Science, Stephen F. Austin State University, Nacogdoches, USA Michel Gendreau, Dept. of Mathematical & Industrial Eng., Polytechnique de Montreal, Montréal, QC, Canada Editorial Board Members Emanuele Borgonovo, Department of Decision Sciences, Bocconi University, Milan, Italy Barry L. Nelson, Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, USA Bruce W. Patty, Veritec Solutions, Mill Valley, USA', 'Northwestern University, Evanston, USA Bruce W. Patty, Veritec Solutions, Mill Valley, USA Michael Pinedo, Stern School of Business, New York Unive

In [2]:
client = MilvusClient("./milvus_large3.db")

In [8]:
results = client.describe_collection(
    collection_name="all_supply_chain_books"
)

print(f"✅ Total vectors in collection: {results}")

✅ Total vectors in collection: {'collection_name': 'all_supply_chain_books', 'auto_id': False, 'num_shards': 0, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 3072}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': True}


In [4]:
from dotenv import load_dotenv
import os
from openai import AzureOpenAI

load_dotenv()

client_embed = AzureOpenAI(
  api_key = os.getenv("AZURE_embed3lrg_API_KEY"),  
  api_version = "2024-10-21",
  azure_endpoint = os.getenv("AZURE_embed3lrg_API_BASE") 
)

In [6]:
client.create_collection(collection_name = "all_supply_chain_books", dimension = 3072)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=3072),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=4096),
]
schema = CollectionSchema(fields, description="Book Chunks for RAG")

In [7]:
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
folder_path = "Supply_Chain_TBs"  # Make sure this folder exists

counter = 0

for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):  # Only process PDF files
        print(f"Processing: {filename}")
        
        # ✅ Fix: Join folder path with filename
        file_path = os.path.join(folder_path, filename)
        
        try:
            doc = fitz.open(file_path)
            text = ""
            for page in doc:
                text += page.get_text()
            doc.close()
            text = clean_pdf_text(text)
            doc_chunks = splitter.split_text(text)
            for chunk in tqdm(doc_chunks):
                response = client_embed.embeddings.create(
                    input = chunk,
                    model= "text-embedding-3-large"
                )
                data = [{
                    "id": counter,
                    "vector": response.data[0].embedding,
                    "text": chunk,
                    "metadata": {"source": filename}
                }]
                client.insert(collection_name="all_supply_chain_books", data = data)
                counter += 1
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

Processing: 978-3-031-85508-5.pdf


100%|███████████████████████████████████████| 1701/1701 [03:25<00:00,  8.28it/s]


Processing: 978-3-658-37103-6.pdf


100%|█████████████████████████████████████████| 465/465 [00:56<00:00,  8.23it/s]


Processing: 978-3-030-88662-2.pdf


100%|███████████████████████████████████████| 1206/1206 [02:27<00:00,  8.19it/s]


Processing: 9789286158070_pdf.pdf


100%|█████████████████████████████████████████| 177/177 [00:22<00:00,  7.89it/s]


Processing: 2021_Book_NextGenerationSupplyChains.pdf


100%|███████████████████████████████████████| 1120/1120 [02:18<00:00,  8.08it/s]


Processing: 978-3-030-98640-7.pdf


100%|█████████████████████████████████████████| 743/743 [01:29<00:00,  8.28it/s]


Processing: 978-3-031-39675-5.pdf


100%|███████████████████████████████████████| 1231/1231 [02:29<00:00,  8.24it/s]


Processing: 978-3-319-94358-9.pdf


100%|███████████████████████████████████████| 1727/1727 [03:32<00:00,  8.13it/s]


Processing: 978-981-97-1647-0.pdf


100%|███████████████████████████████████████| 1538/1538 [03:07<00:00,  8.19it/s]


Processing: Lean Warehousing.pdf


100%|█████████████████████████████████████████| 264/264 [00:31<00:00,  8.50it/s]


Processing: 9781003809401.pdf


100%|███████████████████████████████████████| 1041/1041 [02:03<00:00,  8.43it/s]


Processing: 1002050.pdf


100%|█████████████████████████████████████████| 583/583 [01:12<00:00,  8.05it/s]


Processing: 408877.pdf


100%|█████████████████████████████████████████| 896/896 [01:47<00:00,  8.32it/s]


Processing: 9781040119556.pdf


100%|███████████████████████████████████████| 1111/1111 [02:17<00:00,  8.09it/s]


Processing: LSC_Analytics_Course_Notes_Spring_2025_Edit 20250128.pdf


100%|█████████████████████████████████████████| 458/458 [00:55<00:00,  8.29it/s]


In [12]:
query_results = client.query(
    collection_name="all_supply_chain_books",
    filter="",  # No filter
    output_fields=["id", "vector", "text"],  # Fields to return
    limit=3  # Number of records to show
)

print("\nFirst 3 Inserted Records:")
for i, record in enumerate(query_results):
    print(f"\nRecord {i+1}:")
    print(f"ID: {record['id']}")
    print(f"Text: {record['text']}...")


First 3 Inserted Records:

Record 1:
ID: 0
Text: International Series in Operations Research & Management Science Dmitry Ivanov Alexandre Dolgui Boris Sokolov Editors Handbook of Ripple Effects in the Supply Chain Second Edition International Series in Operations Research & Management Science Founding Editor Frederick S. Hillier Volume 276 Series Editors Camille C. Price, Department of Computer Science, Stephen F. Austin State University, Nacogdoches, USA Michel Gendreau, Dept. of Mathematical & Industrial Eng., Polytechnique de Montreal, Montréal, QC, Canada Editorial Board Members Emanuele Borgonovo, Department of Decision Sciences, Bocconi University, Milan, Italy Barry L. Nelson, Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, USA Bruce W. Patty, Veritec Solutions, Mill Valley, USA...

Record 2:
ID: 1
Text: Northwestern University, Evanston, USA Bruce W. Patty, Veritec Solutions, Mill Valley, USA Michael Pinedo, Stern School of Busi

In [24]:
question = "What is ripple effect in supply chain?"

response = client_embed.embeddings.create(
                    input = question,
                    model= "text-embedding-3-large"
                )
search_res = client.search(
    collection_name="all_supply_chain_books",
    data=[
        response.data[0].embedding
    ],  
    limit=5,  
    search_params={"metric_type": "COSINE", "params": {}},  
    output_fields=["text"],  
)

In [25]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "as an analogy to computer science, where the ripple effect determines the disruption-based scope of changes in the system (Hosseini et al., 2020; Proselkov et al., 2024). The ripple effect is a phenomenon of disruption propagations in the supply chain and their impact on output supply chain performance (e.g. sales, on-time delivery and total pro\ufb01t). It may have more serious consequences than just short- term performance decrease. It can result in market share losses (e.g. Toyota lost its market leader position after the tsunami in 2011 and needed to redesign its supply chain coordination mechanism). The ripple effect is also known as \u201cdomino effect\u201d or \u201csnowball effect.\u201d The reasons for ripple effect are not dif\ufb01cult to \ufb01nd. With increasing supply chain complexity and consequent pressure",
        0.8194840550422668
    ],
    [
        "effect are not dif\ufb01cult to \ufb01nd. With increasing supply chain complexity and consequent p

In [14]:
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)

In [15]:
endpoint = os.getenv("AZURE_AI_gpt41_API_BASE")
api_version = "2025-01-01-preview"
api_key = os.getenv("AZURE_AI_gpt41_API_KEY")

client_gpt = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    api_key=api_key,
)

In [16]:
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [None]:
SYSTEM_PROMPT = """

"""

In [21]:
response = client_gpt.chat.completions.create(
    model=api_version,
    messages=[
        {"role": "system", "content": "You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided"},
        {"role": "user", "content": USER_PROMPT},
    ],
    temperature = 0.4
)
print(response.choices[0].message.content)

The ripple effect in the supply chain refers to the phenomenon where a disruption at one point in the supply chain propagates throughout the entire network, impacting overall supply chain performance. This effect can lead to issues such as decreased sales, missed on-time deliveries, reduced total profit, and even long-term consequences like market share losses. The ripple effect is triggered by high-impact, low-frequency events (e.g., natural disasters, plant explosions) that cannot be localized and thus cascade downstream, causing missing materials, production decreases, and further disruptions at subsequent stages. It is also known as the “domino effect” or “snowball effect.” The ripple effect is distinct from the bullwhip effect, as it involves structural disruptions and exceptional risks rather than operational fluctuations. Preventing or mitigating the ripple effect requires proactive redundancy, flexibility, and significant coordination efforts for both short-term stabilization a