In [9]:
# Requirements for this example (uncomment the next line to install them into the kernel's environment):
# %pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface langchain-milvus

In [13]:
# Debugging aid: print the interpreter's module search path to check which
# directories this kernel imports packages from.
import sys
print(sys.path)

['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/richw/.local/lib/python3.10/site-packages', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages']


In [2]:
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):
    """LangChain document loader that converts sources to Markdown with Docling.

    Each source is converted independently and yielded as one ``LCDocument``
    whose ``page_content`` is the Markdown export of the converted document.
    """

    def __init__(self, file_path: str | list[str] | tuple[str, ...]) -> None:
        """Store the source(s) to load and create the Docling converter.

        Args:
            file_path: A single source, or a list/tuple of sources. Each
                entry is passed unchanged to ``DocumentConverter.convert``.
        """
        # Lists and tuples are treated as collections of sources and copied,
        # so later mutation of the caller's sequence cannot change what gets
        # loaded. Any other value is treated as one single source.
        if isinstance(file_path, (list, tuple)):
            self._file_paths = list(file_path)
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the sources one at a time, yielding one document per source.

        Yields:
            An ``LCDocument`` holding the Markdown export of each source, in
            the order the sources were given.
        """
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            text = dl_doc.export_to_markdown()
            yield LCDocument(page_content=text)

ModuleNotFoundError: No module named 'docling'

In [3]:
# Alternative sample sources tried earlier, kept for reference (left disabled):
#FILE_PATH = "https://raw.githubusercontent.com/DS4SD/docling/main/tests/data/2206.01062.pdf"  # DocLayNet paper
#FILE_PATH = "https://resources.saylor.org/wwwresources/archived/site/wp-content/uploads/2010/11/The-Cardiovascular-System.pdf"
##FILE_PATH = "https://pmc.ncbi.nlm.nih.gov/articles/PMC8462890/"
# FILE_PATH =  "https://sedl.org/afterschool/toolkits/science/pdf/ast_sci_data_tables_sample.pdf"
import os

# Source document to load. The default is a machine-specific absolute path, so
# it can be overridden without editing the notebook by setting the
# DOCLING_FILE_PATH environment variable (same pattern as MILVUS_URI).
FILE_PATH = os.environ.get(
    "DOCLING_FILE_PATH",
    "/home/richw/Automatic_Taxonomy_Construction_NNs/data/raw/AlexNet.pdf",
)


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration handed to the text splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Loader bound to the configured source document.
loader = DoclingPDFLoader(file_path=FILE_PATH)

# Splitter that later cuts the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

In [5]:
# Load the source through the Docling loader, then cut the resulting
# documents into chunks with the splitter configured above.
docs = loader.load()
splits = text_splitter.split_documents(docs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


In [6]:
# Interactive Hugging Face Hub login; as the cell output shows, this renders
# a widget, so the cell needs user input when it is run.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face model ID of the embedding model; the resulting `embeddings`
# object is what the vector store uses to embed the document chunks.
HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

In [10]:
from tempfile import TemporaryDirectory
import os

from langchain_milvus import Milvus

# The temporary directory is created unconditionally, even when MILVUS_URI is
# set in the environment; binding it to `tmp_dir` keeps the object alive for
# the rest of the session. It only backs the default database file.
tmp_dir = TemporaryDirectory()
MILVUS_URI = os.environ.get("MILVUS_URI", f"{tmp_dir.name}/milvus_demo.db")

# Index configuration passed through to Milvus.
# NOTE(review): "FLAT" or "AUTOINDEX" are the alternatives named by the
# original author; `nlist` may need tuning.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=1024),
)

# Embed the chunks and store them, dropping any previous collection first.
vectorstore = Milvus.from_documents(
    splits,
    embeddings,
    connection_args=dict(uri=MILVUS_URI),
    drop_old=True,
    index_params=index_params,
)

In [12]:
from langchain_huggingface import HuggingFaceEndpoint

# API token is read from the environment; `os.environ.get` yields None when
# HF_API_KEY is unset. NOTE(review): this cell relies on `os` having been
# imported by an earlier cell.
HF_API_KEY = os.environ.get("HF_API_KEY")
# Hugging Face repo ID of the LLM used to answer queries.
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# LLM client for the model above, authenticated with the token read above.
llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

In [13]:
from typing import Iterable

from langchain_core.documents import Document as LCDocument
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs: Iterable[LCDocument]):
    """Concatenate the documents' text into one string.

    Args:
        docs: Documents whose ``page_content`` should be combined.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# Retriever interface over the vector store.
retriever = vectorstore.as_retriever()

# Prompt with two placeholders, {context} and {question}. The adjacent string
# literals are joined at compile time into one template string.
prompt = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {question}\n"
    "Answer:\n"
)

# "context" is the retrieved documents flattened to text by format_docs;
# "question" is the chain input passed through as-is.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build prompt variables -> fill the prompt -> call the LLM -> parse to str.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [14]:
# Ask the RAG chain a question; as the last expression in the cell, the
# returned answer string is shown as the cell output.
rag_chain.invoke("What methods does this architecture use to combat overfitting?")
# Alternative queries, left disabled. They mention "Example 4: Automobile Land
# Speed Records", which presumably belongs to a different source document than
# the one currently configured — verify before re-enabling.
#rag_chain.invoke("display the table in Example 4:Automobile Land Speed Records (GR 5-10)")
#rag_chain.invoke("from Example 4:Automobile Land Speed Records (GR 5-10), what is the Speed (mph) for Driver Richard Noble")

'1. Data Augmentation\n2. Dropout in the first two fully-connected layers\n3. Small amount of weight decay (0.0005)\n\nThe architecture uses data augmentation, dropout in the first two fully-connected layers, and a small amount of weight decay (0.0005) to combat overfitting.'

In [15]:
# Dump the full text of every loaded document (`docs` comes from
# loader.load()), printing a "---" separator line after each one.
for doc in docs:
    print(doc.page_content, "---", sep="\n")

## ImageNet Classification with Deep Convolutional Neural Networks

| Alex Krizhevsky       | Ilya Sutskever        | Geoffrey E. Hinton    |
|-----------------------|-----------------------|-----------------------|
| University of Toronto | University of Toronto | University of Toronto |
| kriz@cs.utoronto.ca   | ilya@cs.utoronto.ca   | hinton@cs.utoronto.ca |

## Abstract

We trained a large, deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 37.5% and 17.0% which is considerably better than the previous state-of-the-art. The neural network, which has 60 million parameters and 650,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and three fully-connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU 