# Fully local webscraper for manufacturing

### ENPM692 Final Project
Vinay Lanka | Apoorv Thapliyal | Harsh Senjaliya

## Imports

In [1]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Langchain setup
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_ollama import OllamaLLM
import json, re, textwrap, time
from typing import Dict, List, Tuple

#DDG
from duckduckgo_search import DDGS

# Fetching HTML
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from typing import List
from langchain.schema import BaseRetriever, Document

# Web scraping
import asyncio

# Database
from bs4 import BeautifulSoup
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms.base import BaseLLM
from langchain.prompts import PromptTemplate

import chromadb
from chromadb.config import Settings
from langchain_community.embeddings import HuggingFaceEmbeddings
from chromadb.config import Settings, DEFAULT_TENANT, DEFAULT_DATABASE
from langchain_community.vectorstores import Chroma

import os, glob
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.vectorstores import Chroma, FAISS
import shutil
from chromadb.config import Settings

from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage
import base64, pathlib

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


## Captioning


In [2]:
# Hugging Face Image Captioning Model Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Print device information
print(f"Using device: {device}")

Using device: cuda


### Using Gemma to generate captions

In [None]:
img_b64 = base64.b64encode(pathlib.Path("imgs/pencil.jpeg").read_bytes()).decode()

content = [
    {  # image part
        "type": "image_url",
        "image_url": f"data:image/jpeg;base64,{img_b64}",
    },
    {  # text prompt part
        "type": "text",
        "text": "Identify the primary object in this image and enumerate all observable material characteristics—such as base material, surface finish, color, texture, gloss level, or coating.  Ignore background elements and give the answer in one clear English sentence.",
    },
]

llm = ChatOllama(model="gemma3:4b", temperature=0.3)
object_desc = llm.invoke([HumanMessage(content=content)]).content

print(object_desc)

The primary object in the image is a red plastic water bottle with a gray lid and handle, exhibiting a glossy, smooth surface finish with a white graphic design printed on its exterior.


## Static DB

### Setup static db 

Chroma DB with Persistence
Embedding LLM - all-MiniLM-L12-v2

Converts documents under `/data` to vector embeddings under `/db`.
Place any documents to be stored under `/data`.
To be run once.

In [4]:
static_dir = "db/sdb"
embedder   = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")

client = chromadb.PersistentClient(
    path=static_dir,
    settings=Settings(),          # you can pass custom Settings here if needed
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

if os.path.exists(static_dir) and os.listdir(static_dir):
    print(f"Loading existing vector store from {static_dir}")
    vectordb_static = Chroma(
        client=client,
        embedding_function=embedder,
        persist_directory=static_dir
    )

else:
    print(f"Building new vector store in {static_dir}")
    if os.path.exists(static_dir):
        shutil.rmtree(static_dir)
    os.makedirs(static_dir, exist_ok=True)

    # load PDFs
    pdf_docs = []
    for path in glob.glob("data/*.pdf"):
        loader = PyPDFLoader(path)
        pdf_docs.extend(loader.load())

    # split into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
        length_function=len
    )
    static_chunks = splitter.split_documents(pdf_docs)

    # build & persist
    vectordb_static = Chroma.from_documents(
        documents=static_chunks,
        embedding=embedder,
        client=client,
        persist_directory=static_dir
    )
    client.persist() 

  embedder   = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")


Loading existing vector store from db/sdb


  vectordb_static = Chroma(


### Create a retreiver

Pulling k = 6 relevant chunks

In [5]:
static_ret  = vectordb_static.as_retriever(
    search_kwargs={"k": 6}
)

## Generate queries

The LLM Model: Ollama gemma3:4b model will be used for query generation as well as final manufacturing document generation

In [6]:
# Initialize your LLM
llm = OllamaLLM(model="gemma3:4b", temperature=0.3)

query_prompt = PromptTemplate.from_template(textwrap.dedent("""
You are a manufacturing‐research assistant.

Goal ▸ Draft **{k}** DuckDuckGo search queries that, taken together,
will surface authoritative information for *all* of the report areas
listed under **SECTIONS** below.

Guidelines
• Draw wording **only** from the context slide excerpts, the object
  caption, and the section descriptions.  
• Mix high-level and specific terms so the result set spans every
  section (sustainability, process flow, tooling, simulation, etc.).  
• Avoid near-duplicate phrasing; each query should probe a different
  angle (process, material, KPI, cost, digital thread, …).  
• Do **not** introduce themes absent from the context.  
• Return **only** a valid JSON array of strings—no commentary.

---
CONTEXT
{context}
---

OBJECT DESCRIPTION
\"\"\"{caption}\"\"\"

SECTIONS (for your internal guidance; do NOT echo them)
1  Executive summary / Object overview
2  Sustainability & life-cycle considerations
3  Material selection & eco-alternatives
4  Manufacturing-process flow
5  Tooling, automation & industrial robotics
6  Digital & smart-manufacturing enablers
7  Simulation & virtual validation
8  Quality & performance metrics
9  Environmental & cost impact
10 Implementation roadmap / Smart-mfg priorities
---
"""))


def generate_queries(caption: str,
                     static_ret,
                     k_queries: int = 6,
                     k_ctx: int = 4) -> list[str]:


    docs = static_ret.get_relevant_documents(caption)[:k_ctx]
    ctx  = "\n\n".join(d.page_content[:800] for d in docs)


    chain  = LLMChain(llm=llm,
                      prompt=query_prompt.partial(k=k_queries))

    result_dict = chain.invoke({"caption": caption, "context": ctx})
    raw = result_dict["text"] if isinstance(result_dict, dict) else result_dict

    match = re.search(r"\[.*\]", raw, re.DOTALL)
    if not match:
        raise ValueError(f"JSON not found:\n{raw}")

    return [q.strip() for q in json.loads(match.group(0))]


In [7]:
search_queries = generate_queries(object_desc, static_ret, k_queries=10, k_ctx=6)
print("Generated queries:")
for q in search_queries:
    print(f"  • {q}")

  docs = static_ret.get_relevant_documents(caption)[:k_ctx]
  chain  = LLMChain(llm=llm,


Generated queries:
  • red plastic water bottle material composition
  • water bottle manufacturing process flow diagram
  • glossy plastic surface finish properties
  • water bottle tooling automation cost analysis
  • digital thread integration water bottle production
  • simulation virtual validation water bottle design
  • water bottle life cycle assessment sustainability
  • KPIs for plastic water bottle manufacturing quality
  • water bottle environmental impact carbon footprint
  • smart manufacturing priorities plastic bottle production


## Dynamic DB

### Duck-Duck-Go scraping for top N URLs

In [8]:
def ddg_search(query: str, max_results: int = 5) -> list[str]:
    """
    Returns the top‑N DuckDuckGo result URLs for a query.
    """
    with DDGS() as ddgs:
        return [hit["href"] for hit in ddgs.text(query, max_results=max_results)]

# Example usage:
# urls = [u for q in queries for u in ddg_search(q, max_results=5)]
# urls = list(dict.fromkeys(urls))  # dedupe while preserving order

In [9]:
async def fetch_documents(urls: list[str]):
    """
    Asynchronously downloads each page and converts HTML→plain text.
    Returns a list of LangChain Document objects.
    """
    loader    = AsyncHtmlLoader(urls)
    html_docs = await loader.load()                       # list[Document] with HTML in .page_content
    transformer = Html2TextTransformer()                  # strips tags, yields markdown‑style text :contentReference[oaicite:0]{index=0}
    text_docs = transformer.transform_documents(html_docs)
    return text_docs


In [10]:
all_urls = []
for q in search_queries:
    urls = ddg_search(q, max_results=5)
    print(f"\nQuery: {q}\n  URLs:")
    for u in urls:
        print("    •", u)
    all_urls.extend(urls)
    time.sleep(1)  # be nice to DDG

unique_urls = list(dict.fromkeys(all_urls))
print(f"\nTotal unique URLs: {len(unique_urls)}")



Query: red plastic water bottle material composition
  URLs:
    • https://www.sciencing.com/raw-materials-plastic-bottles-5747796/
    • https://www.chemistryislife.com/the-chemistry-of-plastic-bottles-pet-hdpe-ldpe
    • https://www.epa.gov/facts-and-figures-about-materials-waste-and-recycling/plastics-material-specific-data
    • https://healthywaterbottle.com/maintenance-health/what-are-plastic-water-bottles-made-of/
    • https://www.myownwater.com/blog/what-is-a-water-bottle-made-of

Query: water bottle manufacturing process flow diagram
  URLs:
    • https://ibottling.com/what-is-bottled-water-production-process-flow-chart/
    • https://www.hzmmachine.com/blog/bottled-water-production-process-pdf.html
    • https://dtppl.com/how-does-packed-drinking-water-manufacturing-facility-work/
    • https://blog.fhyzics.net/sop/sop-manual-for-bottled-water-manufacturing
    • https://www.sartorius.com/download/1085710/beverage-qc-water-workflow-poster-en-print-l-sartorius-pdf-data.pdf



### Fetch HTML Docs as plain text

In [11]:
N = len(unique_urls)
print(f"\nFetching and parsing first {N} available pages:")

html_docs = []
success_count = 0
failure_count = 0

for idx, url in enumerate(unique_urls[:N], start=1):
    single_loader = AsyncHtmlLoader([url])
    try:
        docs_for_url = await asyncio.wait_for(
            asyncio.to_thread(single_loader.load),
            timeout=10
        )
        html_docs.extend(docs_for_url)
        print(f"[{idx}/{N}] ✓ Fetched {url}")
        success_count += 1

    except asyncio.TimeoutError:
        failure_count += 1
        print(f"[{idx}/{N}] ✗ Timeout {url}")
    except Exception as e:
        failure_count += 1
        print(f"[{idx}/{N}] ✗ Error {url}: {e}")

print(f"\nCompleted: {success_count} succeeded, {failure_count} failed out of {N} URLs.\n")

# Transform and inspect as before
transformer = Html2TextTransformer()
docs = transformer.transform_documents(html_docs)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len
)
ddb = splitter.split_documents(docs)


Fetching and parsing first 49 available pages:


Fetching pages: 100%|##########| 1/1 [00:00<00:00, 16.34it/s]


[1/49] ✓ Fetched https://www.sciencing.com/raw-materials-plastic-bottles-5747796/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.07it/s]


[2/49] ✓ Fetched https://www.chemistryislife.com/the-chemistry-of-plastic-bottles-pet-hdpe-ldpe


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.44it/s]


[3/49] ✓ Fetched https://www.epa.gov/facts-and-figures-about-materials-waste-and-recycling/plastics-material-specific-data


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  6.87it/s]


[4/49] ✓ Fetched https://healthywaterbottle.com/maintenance-health/what-are-plastic-water-bottles-made-of/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  6.34it/s]


[5/49] ✓ Fetched https://www.myownwater.com/blog/what-is-a-water-bottle-made-of


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.52it/s]


[6/49] ✓ Fetched https://ibottling.com/what-is-bottled-water-production-process-flow-chart/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.16it/s]


[7/49] ✓ Fetched https://www.hzmmachine.com/blog/bottled-water-production-process-pdf.html


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.61s/it]


[8/49] ✓ Fetched https://dtppl.com/how-does-packed-drinking-water-manufacturing-facility-work/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  4.64it/s]


[9/49] ✓ Fetched https://blog.fhyzics.net/sop/sop-manual-for-bottled-water-manufacturing


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://www.sartorius.com/download/1085710/beverage-qc-water-workflow-poster-en-print-l-sartorius-pdf-data.pdf
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.00s/it]


[10/49] ✓ Fetched https://www.sartorius.com/download/1085710/beverage-qc-water-workflow-poster-en-print-l-sartorius-pdf-data.pdf


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.49it/s]


[11/49] ✓ Fetched https://omnexus.specialchem.com/polymer-property/gloss


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  9.42it/s]


[12/49] ✓ Fetched https://etcnmachining.com/blog/plastic-finishes/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.75it/s]


[13/49] ✓ Fetched https://www.richconn-cnc.com/surface-finish-101-an-overview-of-plastic-surface-finish-chart.html


Fetching pages: 100%|##########| 1/1 [00:02<00:00,  2.31s/it]


[14/49] ✓ Fetched https://mgsplastics.co.uk/surface-finishes-what-are-the-different-standards/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.99it/s]


[15/49] ✓ Fetched https://blog.mppcorp.net/the-types-of-surface-finishes-for-plastic-parts


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.57it/s]


[16/49] ✓ Fetched https://fillers-packer.com/blogs/cost-of-a-bottling-machine/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.23it/s]


[17/49] ✓ Fetched https://www.hzmmachine.com/blog/providing-exact-cost-estimates.html


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.58it/s]


[18/49] ✓ Fetched https://ibottling.com/cost-and-profit-analysis-of-a-bottled-water-facility-in-depth/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.56it/s]


[19/49] ✓ Fetched https://ibottling.com/industry-4-0-modern-water-bottle-blowing-machine/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.00it/s]


[20/49] ✓ Fetched https://ibottling.com/water-bottling-plant-costs-explained/


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://productiondigitalthread.com/wp-content/uploads/2024/04/Digital-Thread-finalwhitepapers_jan2024.pdf
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.59it/s]


[21/49] ✓ Fetched https://productiondigitalthread.com/wp-content/uploads/2024/04/Digital-Thread-finalwhitepapers_jan2024.pdf


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Error fetching https://www.encirc360.com/2021/12/22/smart-bottling-connecting-the-data-with-a-digital-thread/ with attempt 1/3: Cannot connect to host www.encirc360.com:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')]. Retrying...
Error fetching https://www.encirc360.com/2021/12/22/smart-bottling-connecting-the-data-with-a-digital-thread/ with attempt 2/3: Cannot connect to host www.encirc360.com:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')]. Retrying...
Fetching pages:   0%|          | 0/1 [00:05<?, ?it/s]


[22/49] ✗ Error https://www.encirc360.com/2021/12/22/smart-bottling-connecting-the-data-with-a-digital-thread/: Cannot connect to host www.encirc360.com:443 ssl:True [SSLCertVerificationError: (1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')]


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.14s/it]


[23/49] ✓ Fetched https://link.springer.com/chapter/10.1007/978-981-19-5221-0_58


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=924828
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.11s/it]


[24/49] ✓ Fetched https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=924828


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  6.40it/s]


[25/49] ✓ Fetched https://hbr.org/sponsored/2022/02/four-ways-to-connect-digital-threads-with-simulation-and-realize-the-promise-of-industry-4-0


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://coed.asee.org/wp-content/uploads/2020/08/10-Plastic-Bottle-Structure-Design-Technique-Using-Virtual-Reality-Linked-with-Finite-Element-Analysis.pdf
Fetching pages: 100%|##########| 1/1 [00:04<00:00,  4.26s/it]


[26/49] ✓ Fetched https://coed.asee.org/wp-content/uploads/2020/08/10-Plastic-Bottle-Structure-Design-Technique-Using-Virtual-Reality-Linked-with-Finite-Element-Analysis.pdf


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.76s/it]


[27/49] ✓ Fetched https://www.mesj.ukim.edu.mk/journals/article/view/71


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://www.mesj.ukim.edu.mk/journals/article/download/71/72/113
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.39s/it]


[28/49] ✓ Fetched https://www.mesj.ukim.edu.mk/journals/article/download/71/72/113


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.50s/it]


[29/49] ✓ Fetched https://kinetic-vision.com/capabilities/simulation-validation/


Fetching pages: 100%|##########| 1/1 [00:00<00:00, 12.20it/s]


[30/49] ✓ Fetched https://www.researchgate.net/publication/337149329_Approach_methodology_for_the_sustainable_design_of_packaging_through_computational_tools_Case_study_Water_bottles


Fetching pages: 100%|##########| 1/1 [00:05<00:00,  5.91s/it]


[31/49] ✓ Fetched https://www.sciencedirect.com/science/article/pii/S0956053X18301090


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.71it/s]


[32/49] ✓ Fetched https://bottledwater.org/environmental-footprint/


Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]Failed to decode content from https://bottledwater.org/wp-content/uploads/2024/04/IBWA-Trayak-Full-Report-32321.pdf
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.42it/s]


[33/49] ✓ Fetched https://bottledwater.org/wp-content/uploads/2024/04/IBWA-Trayak-Full-Report-32321.pdf


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.02s/it]


[34/49] ✓ Fetched https://bottledwater.org/wp-content/uploads/attachments/IBWA_BottledWaterLCI_ExecutiveSummary_2017-10-24_Quantis+(003)Final.pdf


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.73s/it]


[35/49] ✓ Fetched https://www.sciencedirect.com/science/article/pii/S0013935121002681


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.10it/s]


[36/49] ✓ Fetched https://bplan.ai/blogs/kpi-metrics/plastic-bottle-manufacturing-kpi-metrics


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.83s/it]


[37/49] ✓ Fetched https://www.deskera.com/blog/plastic-manufacturing-kpis-measurement-effectiveness-case-studies/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  7.47it/s]


[38/49] ✓ Fetched https://www.someka.net/blog/quality-kpis/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.30it/s]


[39/49] ✓ Fetched https://finmodelslab.com/blogs/kpi-metrics/plastic-bottle-manufacturing


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.10it/s]


[40/49] ✓ Fetched https://businessplan-templates.com/blogs/metrics/plastic-bottle-manufacturing


Fetching pages: 100%|##########| 1/1 [00:00<00:00, 15.63it/s]


[41/49] ✓ Fetched https://www.sciencing.com/carbon-footprint-plastic-bottle-12307187/


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.25s/it]


[42/49] ✓ Fetched https://vegeco.org/decoding-water-bottles-5-carbon-footprint-insights/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.15it/s]


[43/49] ✓ Fetched https://shapiroe.com/blog/bottled-water-environmental-impact/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.30it/s]


[44/49] ✓ Fetched https://tappwater.co/en-us/blogs/blog/carbon-footprint-bottled-water


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.81it/s]


[45/49] ✓ Fetched https://www.containerandpackaging.com/resources/plastic-bottle-manufacturing-trends-2025


Fetching pages: 100%|##########| 1/1 [00:00<00:00, 13.59it/s]


[46/49] ✓ Fetched https://www.packagingdigest.com/sustainability/smart-manufacturing-s-role-in-greater-packaging-sustainability


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.27s/it]


[47/49] ✓ Fetched https://happyeconews.com/the-future-of-plastic-manufacturing-sustainability-and-innovation/


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.70s/it]


[48/49] ✓ Fetched https://hannahgthompson.com/innovations-in-plastic-bottle-manufacture-advancements-driving-sustainability-and-efficiency/


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.37it/s]


[49/49] ✓ Fetched https://rcbottles.com/innovative-manufacturing-processes-for-plastic-bottles-balancing-efficiency-and-sustainability/

Completed: 48 succeeded, 1 failed out of 49 URLs.



### Convert fetched HTML into LangChain Documents

In [12]:
embedder = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"}
)

vectordb_dynamic = FAISS.from_documents(
    documents=ddb,    # your list of Document chunks
    embedding=embedder
)

## Create Retrievers

In [13]:
class CombinedRetriever(BaseRetriever):
    retrievers: List[BaseRetriever]
    k: int = 5

    def get_relevant_documents(self, query: str) -> List[Document]:
        docs: List[Document] = []
        for r in self.retrievers:
            docs.extend(r.get_relevant_documents(query))
        return docs[: self.k]


dynamic_ret = vectordb_dynamic.as_retriever(
    search_kwargs={"k": 6}
)

combined_retriever = CombinedRetriever(
    retrievers=[static_ret, dynamic_ret],
    k=12
)


  class CombinedRetriever(BaseRetriever):


## Ask the LLM

Finally, prompt the LLM with the final query

In [None]:
first_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Use the **context** block (course‐slide excerpts + web snippets/URLs) as your only source.
Let the lecture slides define the structure, emphasis, and terminology—shape all web‐sourced
details to fit those slide‐based frameworks.  In other words, pull facts from URLs **only** to
illustrate slide‐prescribed methods, metrics, or priorities, and never stray outside the slide
guidance.

┌───────────────────────────────────────────────────────────────┐
│  Slides commonly cited                                        │
│  • Lecture 1  – course intro & final-project guidance         │
│  • Lecture 4  – sustainable-manufacturing overview, 6 R’s     │
│  • Lecture 5  – unit-process KPIs & material I/O tables       │
└───────────────────────────────────────────────────────────────┘


**Write these three sections (≈ 500 words each), each starting with exactly `## <section title>`:**
- Executive summary / Object overview 
    • Provide a concise description of the object, its primary purpose, and scope.  
    • Summarize the high-level manufacturing objectives and key expected outcomes.  
    • Highlight any unique design features or performance targets. 
- Sustainability & life-cycle considerations  
   • Frame the product within a triple-bottom-line perspective (environmental, social, economic).  
   • Outline the life-cycle stages (raw material sourcing, production, use, end-of-life) with a placeholder LCA flow diagram.  
   • List key sustainability KPIs (e.g., carbon footprint per unit, energy consumption, recyclability rate).
- Material selection & eco-alternatives  
   • Identify primary material properties required (strength, durability, chemical resistance).  
   • Compare candidate materials in a table: embodied energy, cost, recyclability, material toxicity.  
   • Propose eco-alternative materials or blends (bio-based polymers, recycled feedstocks) and substitution strategies.

Formatting rules  
• Start every section with `## <section title>` (no numbers, no bold).  
• Write in formal, technical style with bullets under each heading.  
• Cite every factual claim: `(Lecture #)` or `(URL n)`.  
• Never invent content or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

second_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Use the **context** block (course‐slide excerpts + web snippets/URLs) as your only source.
Let the lecture slides define the structure, emphasis, and terminology—shape all web‐sourced
details to fit those slide‐based frameworks.  In other words, pull facts from URLs **only** to
illustrate slide‐prescribed methods, metrics, or priorities, and never stray outside the slide
guidance.

┌───────────────────────────────────────────────────────────────┐
│  Key slide sources for these sections                         │
│  • Lectures 2-3 – additive & traditional process chains        │
│  • Lecture 5  – unit-process stage/KPI charts                  │
│  • Lecture 7  – industrial-robot anatomy, automation levels    │
│  • Lecture 1  – automation-level pyramid (overview)            │
│  • Lecture 9  – digital-manufacturing information models       │
│  • Lecture 10 – network-centric / smart-manufacturing decks    │
└───────────────────────────────────────────────────────────────┘

**Write the following sections — target ≈ 500 words EACH — with headings:**

- Manufacturing-process flow  
    • Provide a layer-by-layer or unit-operation flow chart description.  
    • Include placeholders for cycle-time/takt-time where slides give cues.

- Tooling, automation & industrial robotics  
    • Bill-of-tooling and robot-cell layout derived from Lecture 7 diagrams.  
    • Map each tool/robot to the step in Section 4; outline an automation-migration strategy.

- Digital & smart-manufacturing enablers  
    • Summarise digital-thread architecture and data-interoperability standards  
        (ISA-95, RAMI 4.0) referenced in Lectures 9 & 10.  
    • List IoT sensors or MES data points that close the loop.

Formatting rules  
• Begin every section with `## <section title>` (Markdown H-2).  
• Write in a formal, technical style, with bulleted points where appropriate.
• Cite every factual claim: `(Lecture #)` for slides, `(URL n)` for
  web snippets where *n* is the URL’s position in the context list.  
• Never invent content or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

third_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Use the **context** block (course‐slide excerpts + web snippets/URLs) as your only source.
Let the lecture slides define the structure, emphasis, and terminology—shape all web‐sourced
details to fit those slide‐based frameworks.  In other words, pull facts from URLs **only** to
illustrate slide‐prescribed methods, metrics, or priorities, and never stray outside the slide
guidance.

┌────────────────────────────────────────────────────────────────────────┐
│  Slide sets most relevant to these sections                           │
│  • Lecture 8  – simulation in manufacturing                            │
│  • Lecture 5  – KPI hierarchy & unit-process metrics                   │
│  • Lecture 4  – life-cycle costing & environmental impact              │
│  • Lecture 10 – smart-manufacturing “ten priority actions”             │
└────────────────────────────────────────────────────────────────────────┘

**Write the following sections — target ≈ 500 words EACH — with headings:**

- Simulation & virtual validation  
    • Explain the rationale for discrete-event simulation.  
    • Sketch a SIM-model swim-lane or text diagram.  
    • List KPIs to test and add a bullet list of risks/benefits.

- Quality & performance metrics  
    • Build a three-tier KPI table (process, product, sustainability) based on Lecture 5.  
    • Mention gauge R&R or similar validation where cited.

- Environmental & cost impact  
    • Provide an energy/CO₂-per-unit chart description and a cost-break-down structure.  
    • Summarise key life-cycle-cost drivers and note “win-win” trade-offs.

- Implementation roadmap / Smart-mfg priorities  
    • Draft a phased Gantt-style narrative with migration milestones.  
    • Align each phase to the “ten priority actions” slide (Lecture 10).  
    • Include a brief stakeholder-responsibility matrix.

Formatting rules  
• Begin every section with `## <section title>` (Markdown H-2).  
• Write in a formal, technical style, with bulleted points where appropriate.
• Cite every factual claim: `(Lecture #)` for slides, `(URL n)` for
  web snippets where *n* is the URL’s position in the context list.  
• Never invent content or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

section_batches: List[Tuple[str, PromptTemplate, str]] = [
    (
        "Sections 1–3",
        first_prompt,
        "\n".join([
            "1. **Executive summary / Object overview**",
            "2. **Sustainability & life-cycle considerations**",
            "3. **Material selection & eco-alternatives**",
            "",
            f"Object description: {object_desc}"
        ])
    ),
    (
        "Sections 4–6",
        second_prompt,
        "\n".join([
            "4. **Manufacturing-process flow**",
            "5. **Tooling, automation & industrial robotics**",
            "6. **Digital & smart-manufacturing enablers**",
            "",
            f"Object description: {object_desc}"
        ])
    ),
    (
        "Sections 7–10",
        third_prompt,
        "\n".join([
            "7. **Simulation & virtual validation**",
            "8. **Quality & performance metrics**",
            "9. **Environmental & cost impact**",
            "10. **Implementation roadmap / Smart-mfg priorities**",
            "",
            f"Object description: {object_desc}"
        ])
    ),
]

def run_batch(
    prompt_tpl: PromptTemplate,
    question: str
) -> str:
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=combined_retriever,
        chain_type="stuff",
        chain_type_kwargs={
            "prompt": prompt_tpl,
            "document_variable_name": "context",
        },
        input_key="question",
        return_source_documents=False,
    )
    out = qa({"question": question})
    return out["result"] if isinstance(out, dict) else out

def clean_to_first_h2(md: str) -> str:
    idx = md.find("## ")
    return md[idx:].strip() if idx != -1 else md.strip()

report_sections: Dict[str,str] = {}
for batch_name, prompt_tpl, question in section_batches:
    print(f"Generating {batch_name}…")
    raw = run_batch(prompt_tpl, question)
    report_sections[batch_name] = clean_to_first_h2(raw)

with open("manufacturing_report.md", "w") as md:
    for key in ["Sections 1–3", "Sections 4–6", "Sections 7–10"]:
        md.write(report_sections[key] + "\n\n")

print("✅ Done writing manufacturing_report.md")

Generating Sections 1–3…
Generating Sections 4–6…
Generating Sections 7–10…
✅ Done writing manufacturing_report.md


### Executive Overview & Sustainability Foundations

In [113]:
first_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Use ONLY the information in the **context** block (course-slide excerpts
and web snippets/URLs) to draft the first three sections of a manufacturing
report.  Let the slides guide the emphasis; enrich with web details when the
context includes them.

┌───────────────────────────────────────────────────────────────┐
│  Slides commonly cited                                        │
│  • Lecture 1  – course intro & final-project guidance         │
│  • Lecture 4  – sustainable-manufacturing overview, 6 R’s     │
│  • Lecture 5  – unit-process KPIs & material I/O tables       │
└───────────────────────────────────────────────────────────────┘

**Write these three sections — ~500 words EACH — and label them exactly:**

1. Executive summary / Object overview  
2. Sustainability & life-cycle considerations  
3. Material selection & eco-alternatives  

Formatting rules  
• Start each section with `## <section title>` (Markdown H-2).  
• Inline cite every factual claim: `(Lecture #)` for slides, `(URL n)` for
  web snippets where *n* is the URL’s position in the context list.  
• If a necessary fact is missing, omit it — never invent content or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

question = (
    "Task: Draft a detailed section 1,2,3 sections of a manufacturing guide (≈ 1000 words)\n\n"
    "1. **Executive summary / Object overview**\n\n"
    "2. **Sustainability & life-cycle considerations**\n\n"
    "3. **Material selection & eco-alternatives**\n\n"
    "Ground rules:\n"
    "1. The **Object description** below is the object being manufactured.\n"
    "2. All other statements MUST be supported by the CONTEXT block.\n"
    "3. Do not fabricate data or references. \n\n"
    f"Object description: {object_desc}"
)


qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=combined_retriever,
    chain_type="stuff",
    chain_type_kwargs={
        "prompt": first_prompt,
        "document_variable_name": "context",
    },
    input_key="question",          # Match the prompt
    return_source_documents=True,
)

result = qa_chain({"question": question})

In [115]:
print(result["result"])

Okay, here's a draft of the three sections you requested, based on the provided context. I've aimed for approximately 1000 words total, prioritizing detail and accuracy based on the provided sources.  I've included citations at the end for easy reference. **Please read the notes at the very end of this response – they are crucial for understanding the limitations and assumptions made in this draft.**

---

**1. Executive Summary / Object Overview: The Rubik's Cube – A Manufacturing Perspective**

The Rubik's Cube, initially known as the Magic Cube, is a globally recognized mechanical puzzle invented by Hungarian architect Ernő Rubik in 1974. It quickly transcended its origins as a teaching tool, becoming a cultural icon and a symbol of problem-solving and ingenuity. This manufacturing guide details the production process of a standard 3x3x3 Rubik’s Cube, focusing on material selection, assembly, and emerging sustainability considerations.  The core challenge in Rubik's Cube manufacturi

### Process Flow, Automation & Digital Enablers

In [78]:
second_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Using ONLY the **context** block (slide excerpts and web snippets/URLs),
draft **Sections 4-6** of the manufacturing report.  Prioritise the listed
lectures when choosing evidence; add web details only if they appear in the
context.

┌───────────────────────────────────────────────────────────────┐
│  Key slide sources for these sections                         │
│  • Lectures 2-3 – additive & traditional process chains        │
│  • Lecture 5  – unit-process stage/KPI charts                  │
│  • Lecture 7  – industrial-robot anatomy, automation levels    │
│  • Lecture 1  – automation-level pyramid (overview)            │
│  • Lecture 9  – digital-manufacturing information models       │
│  • Lecture 10 – network-centric / smart-manufacturing decks    │
└───────────────────────────────────────────────────────────────┘

**Write the following sections — target ≈ 500 words EACH — with headings:**

4. Manufacturing-process flow  
   • Provide a layer-by-layer or unit-operation flow chart description.  
   • Include placeholders for cycle-time/takt-time where slides give cues.

5. Tooling, automation & industrial robotics  
   • Bill-of-tooling and robot-cell layout derived from Lecture 7 diagrams.  
   • Map each tool/robot to the step in Section 4; outline an automation-migration strategy.

6. Digital & smart-manufacturing enablers  
   • Summarise digital-thread architecture and data-interoperability standards  
     (ISA-95, RAMI 4.0) referenced in Lectures 9 & 10.  
   • List IoT sensors or MES data points that close the loop.

Formatting rules  
• Begin every section with `## <section title>` (Markdown H-2).  
• Inline-cite every factual claim: `(Lecture #)` for slides, `(URL n)` for
  web snippets where *n* is the order of the URL in the context list.  
• If the context lacks specific details, leave them blank or mark “TBD”
  — never invent numbers, standards, or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

question = (
    "Task: Draft a detailed section 4,5,6 of a manufacturing guide (≈ 1000 words)\n\n"
    "4. **Manufacturing-process flow**  \n\n"
    "5. **Tooling, automation & industrial robotics**  \n\n"
    "6. **Digital & smart-manufacturing enablers**  \n\n"
    "Ground rules:\n"
    "1. The **Object description** below is the object being manufactured.\n"
    "2. All other statements MUST be supported by the CONTEXT block.\n"
    "3. Do not fabricate data or references. \n\n"
    f"Object description: {object_desc}"
)


qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=combined_retriever,
    chain_type="stuff",
    chain_type_kwargs={
        "prompt": second_prompt,
        "document_variable_name": "context",
    },
    input_key="question",          #  <<< match the prompt
    return_source_documents=True,
)

result = qa_chain({"question": question})


In [79]:
print(result["result"])

## Section 4: Manufacturing Process Flow – Rubik’s Cube Production

The production of a Rubik’s Cube is a surprisingly complex process, involving multiple stages of molding, assembly, and finishing. This section details the key steps involved, drawing directly from the provided context.

**4.1 Molding the Core & Center Cubes:**

The foundation of the Rubik’s Cube is the nylon core, followed by the ABS center cubes. The process begins with the injection molding of the nylon core. The context highlights the importance of material shrinkage, stating, “Different plastics will have a different shrink rate, and each tool must be specifically designed for the material that will be used.” This emphasizes the precision required to ensure consistent cube dimensions. The molding process utilizes ABS (Acrylonitrile Butadiene Styrene) for the center cubes, again acknowledging the need for tailored tooling based on material properties. The process involves injecting molten ABS into two-piece molds, 

### Simulation, Metrics & Smart-Manufacturing Roadmap

In [80]:
third_prompt = PromptTemplate.from_template("""
You are a senior manufacturing engineer.

Rely exclusively on the **context** block (course-slide excerpts + web snippets/URLs)
to draft **Sections 7-10** of the manufacturing report.  Give priority to the lecture
slides noted below; supplement with web data only when present in context.

┌────────────────────────────────────────────────────────────────────────┐
│  Slide sets most relevant to these sections                           │
│  • Lecture 8  – simulation in manufacturing                            │
│  • Lecture 5  – KPI hierarchy & unit-process metrics                   │
│  • Lecture 4  – life-cycle costing & environmental impact              │
│  • Lecture 10 – smart-manufacturing “ten priority actions”             │
└────────────────────────────────────────────────────────────────────────┘

**Write the following sections – aim for ≈ 400–500 words EACH – and label them
exactly as shown.  Start each with `## <section title>` (Markdown H-2).**

7. Simulation & virtual validation  
   • Explain the rationale for discrete-event simulation.  
   • Sketch a SIM-model swim-lane or text diagram.  
   • List KPIs to test and add a bullet list of risks/benefits.

8. Quality & performance metrics  
   • Build a three-tier KPI table (process, product, sustainability) based on Lecture 5.  
   • Mention gauge R&R or similar validation where cited.

9. Environmental & cost impact  
   • Provide an energy/CO₂-per-unit chart description and a cost-break-down structure.  
   • Summarise key life-cycle-cost drivers and note “win-win” trade-offs.

10. Implementation roadmap / Smart-mfg priorities  
    • Draft a phased Gantt-style narrative with migration milestones.  
    • Align each phase to the “ten priority actions” slide (Lecture 10).  
    • Include a brief stakeholder-responsibility matrix.

Citation rules  
• After every factual statement, cite its source: `(Lecture #)` or `(URL n)`  
  where *n* is the URL’s order in CONTEXT.  
• If a detail is missing, mark “TBD” — do **not** invent data or sources.

---
CONTEXT
{context}
---

Question → "{question}"

Answer:
""")

question = (
    "Task: Draft a detailed section 7,8,9,10 of a manufacturing guide (≈ 1000 words)\n\n"
    "7. **Simulation & virtual validation**   \n\n"
    "8. **Quality & performance metrics**  \n\n"
    "9. **Environmental & cost impact**  \n\n"
    "10. **Implementation roadmap / Smart-mfg priorities**  \n\n"
    "Ground rules:\n"
    "1. The **Object description** below is the object being manufactured.\n"
    "2. All other statements MUST be supported by the CONTEXT block.\n"
    "3. Do not fabricate data or references. \n\n"
    f"Object description: {object_desc}"
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=combined_retriever,
    chain_type="stuff",
    chain_type_kwargs={
        "prompt": third_prompt,
        "document_variable_name": "context",
    },
    input_key="question",          #  <<< match the prompt
    return_source_documents=True,
)

result = qa_chain({"question": question})


In [81]:
print(result["result"])

Okay, here’s a draft of sections 7-10 of a manufacturing guide for Rubik's Cubes, aiming for approximately 1000 words and incorporating the provided context.

---

**Manufacturing Guide: Rubik’s Cube Production**

**Section 7: Simulation & Virtual Validation (Approx. 250 words)**

Before commencing physical production, a rigorous simulation and virtual validation phase is crucial. This phase leverages CAD (Computer-Aided Design) software and finite element analysis (FEA) to identify potential weaknesses and optimize the cube’s structural integrity. The simulation process begins with a detailed 3D model of each component – the core, the side pieces, and the stickers – generated from the provided context.  Specifically, the simulation will focus on stress distribution under rotational forces, mimicking the cube's intended use. 

The simulation will test various rotational speeds and forces, mirroring the expected user experience.  The context highlights the use of Nylon for the core, ABS