# Web scraping

In [1]:
import logging
import os
import re
from parser import langchain_docs_extractor

import weaviate
from bs4 import BeautifulSoup, SoupStrainer
from langchain.document_loaders import RecursiveUrlLoader, SitemapLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
from langchain.vectorstores import Weaviate

from constants import WEAVIATE_DOCS_INDEX_NAME
import datetime

# Notebook-wide logger. The only logger calls in the visible cells are
# commented out; progress is reported with print() instead.
logger = logging.getLogger(__name__)

# Connection settings are read from the environment so no secrets live in the
# notebook. Indexing with [] (rather than .get) makes a missing variable fail
# right here with a KeyError that names it.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]
RECORD_MANAGER_DB_URL = os.environ["RECORD_MANAGER_DB_URL"]

# FIXME(review): the statement below was indented by two spaces at top level,
# which is an IndentationError that stops this whole cell (imports included)
# from running, and `declarative_base` is not imported anywhere in this
# notebook. No visible cell uses `Base`, so it is disabled. If it is needed,
# import declarative_base first (presumably from SQLAlchemy - confirm) and
# restore the line at column 0:
# Base = declarative_base()


In [2]:
# https://github.com/langchain-ai/langchain/issues/8494
!pip install nest-asyncio
import nest_asyncio
nest_asyncio.apply()



In [3]:
from recursive_url_loader import RecursiveUrlLoader2

In [44]:
def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    """Build the metadata dict for one loaded page.

    Args:
        meta: Metadata already known for the page. It must contain a
            ``"loc"`` key, whose value becomes ``"source"``.
        soup: Parsed page, searched for its ``title`` tag, its
            ``meta name="description"`` tag and its ``html`` tag.

    Returns:
        A dict holding ``"source"``, ``"title"``, ``"description"`` and
        ``"language"`` (each page-derived value is ``""`` when its tag is
        not found), followed by every entry of ``meta``. Because ``meta`` is
        merged last, a key in ``meta`` with one of those four names
        overrides the extracted value.

    Raises:
        KeyError: If ``meta`` has no ``"loc"`` entry.
    """
    title_tag = soup.find("title")
    description_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    page_metadata = {"source": meta["loc"]}

    # Each lookup may have returned nothing; fall back to an empty string.
    page_metadata["title"] = title_tag.get_text() if title_tag else ""
    page_metadata["description"] = (
        description_tag.get("content", "") if description_tag else ""
    )
    page_metadata["language"] = html_tag.get("lang", "") if html_tag else ""

    # Merged last on purpose: entries of `meta` win over the extracted ones.
    page_metadata.update(meta)
    return page_metadata
                                
                                
def load_langchain_docs():
    """Load the pages listed in the LangChain Python documentation sitemap.

    A ``SitemapLoader`` is pointed at
    ``https://python.langchain.com/sitemap.xml`` with ``filter_urls`` set to
    the site root. Page content is produced by ``langchain_docs_extractor``
    and per-page metadata by ``metadata_extractor``.

    Returns:
        Whatever ``SitemapLoader.load()`` returns.
    """
    # Passed to the parser as `parse_only`, restricting it to these names.
    strainer = SoupStrainer(name=("article", "title", "html", "lang", "content"))

    loader = SitemapLoader(
        "https://python.langchain.com/sitemap.xml",
        filter_urls=["https://python.langchain.com/"],
        parsing_function=langchain_docs_extractor,
        default_parser="lxml",
        bs_kwargs={"parse_only": strainer},
        meta_function=metadata_extractor,
    )
    return loader.load()
                                
                                
def simple_extractor(html: str) -> str:
    """Return the text of an HTML string with blank-line runs collapsed.

    The markup is parsed with the ``lxml`` parser. In the resulting text,
    every run of two or more newline characters is replaced by exactly two,
    and leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

def simple_extractor2(html: str) -> str:
    """Return the text of an HTML string, one text fragment per line.

    Differs from ``simple_extractor`` in two ways: the markup is parsed with
    ``html.parser``, and text fragments are joined with ``"\\n"`` via
    ``get_text(separator="\\n")``. Runs of two or more newlines are then
    collapsed to exactly two and the result is stripped.
    """
    parsed = BeautifulSoup(html, "html.parser")
    page_text = parsed.get_text(separator="\n")
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()
                                
                                
def load_api_docs():
    """Load the LangChain API reference with ``RecursiveUrlLoader``.

    The crawl starts at the ``api_reference.html`` page with ``max_depth=1``,
    extracts text with ``simple_extractor``, runs asynchronously with a
    600 second timeout, and excludes the ``_sources`` and ``_modules``
    directories.

    Returns:
        Whatever ``RecursiveUrlLoader.load()`` returns.
    """
    # Drop trailing / to avoid duplicate pages.
    link_pattern = (
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    )
    skipped_dirs = (
        "https://api.python.langchain.com/en/latest/_sources",
        "https://api.python.langchain.com/en/latest/_modules",
    )

    loader = RecursiveUrlLoader(
        url="https://api.python.langchain.com/en/latest/api_reference.html",
        max_depth=1,
        extractor=simple_extractor,
        prevent_outside=True,
        use_async=True,
        timeout=600,
        link_regex=link_pattern,
        check_response_status=True,
        exclude_dirs=skipped_dirs,
    )
    return loader.load()

def load_wiki_docs():
    """Load the wiki page with the project-local ``RecursiveUrlLoader2``.

    Uses the same settings as ``load_api_docs`` except for the start URL,
    the loader class, and ``use_async=False``.

    Returns:
        Whatever ``RecursiveUrlLoader2.load()`` returns.
    """
    # Drop trailing / to avoid duplicate pages.
    link_pattern = (
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    )
    # NOTE(review): these directories are on api.python.langchain.com while
    # `url` is on wiki.skplanet.com - looks like a leftover from
    # load_api_docs; confirm whether it has any effect here.
    skipped_dirs = (
        "https://api.python.langchain.com/en/latest/_sources",
        "https://api.python.langchain.com/en/latest/_modules",
    )

    loader = RecursiveUrlLoader2(
        url="http://wiki.skplanet.com/pages/viewpage.action?pageId=295656385",
        max_depth=1,
        extractor=simple_extractor,
        prevent_outside=True,
        use_async=False,
        timeout=600,
        link_regex=link_pattern,
        check_response_status=True,
        exclude_dirs=skipped_dirs,
    )
    return loader.load()

In [45]:
def ingest_docs(docs_from_documentation, docs_from_api):
    """Split the given documents into chunks and index the chunks.

    Args:
        docs_from_documentation: Documents loaded from the documentation.
        docs_from_api: Documents loaded from the API reference. The two
            arguments are concatenated with ``+`` before splitting.

    Note:
        ``record_manager`` and ``vectorstore`` are not created in this
        function (the code that would build them is commented out below),
        so both names must already exist in the notebook's global namespace
        when this is called.

    Prints progress messages and the statistics returned by ``index``;
    returns ``None``.
    """
    # docs_from_documentation = load_langchain_docs()
    # logger.info(f"Loaded {len(docs_from_documentation)} docs from documentation")
    # docs_from_api = load_api_docs()
    # logger.info(f"Loaded {len(docs_from_api)} docs from API")

    print("Start docs transform")
    splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)
    combined_docs = docs_from_documentation + docs_from_api
    chunks = splitter.split_documents(combined_docs)

    # We try to return 'source' and 'title' metadata when querying vector store and
    # Weaviate will error at query time if one of the attributes is missing from a
    # retrieved document.
    for chunk in chunks:
        chunk.metadata.setdefault("source", "")
        chunk.metadata.setdefault("title", "")

    # Disabled setup, kept for reference:
    # client = weaviate.Client(
    #     url=WEAVIATE_URL,
    #     auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    # )
    # embedding = OpenAIEmbeddings(
    #     chunk_size=200,
    # )  # rate limit
    # vectorstore = Weaviate(
    #     client=client,
    #     index_name=WEAVIATE_DOCS_INDEX_NAME,
    #     text_key="text",
    #     embedding=embedding,
    #     by_text=False,
    #     attributes=["source", "title"],
    # )

    # record_manager = SQLRecordManager(
    #     f"weaviate/{WEAVIATE_DOCS_INDEX_NAME}", db_url=RECORD_MANAGER_DB_URL
    # )
    # record_manager.create_schema()

    print("Start index")
    # `record_manager` and `vectorstore` resolve to notebook globals here.
    indexing_stats = index(
        chunks,
        record_manager,
        vectorstore,
        cleanup="full",
        source_id_key="source",
    )

    print("Indexing stats: ", indexing_stats)
    # print(
    #     "LangChain now has this many vectors: ",
    #     client.query.aggregate(WEAVIATE_DOCS_INDEX_NAME).with_meta_count().do(),
    # )

In [None]:
# Preview crawl of the wiki page using simple_extractor2 (newline-separated
# text) instead of simple_extractor; prints the text of the first document.
docs = RecursiveUrlLoader2(
    url="http://wiki.skplanet.com/pages/viewpage.action?pageId=295656385",
    max_depth=1,
    extractor=simple_extractor2,
    prevent_outside=True,
    use_async=False,
    timeout=600,
    # Drop trailing / to avoid duplicate pages.
    link_regex=(
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    ),
    check_response_status=True,
    exclude_dirs=(
        "https://api.python.langchain.com/en/latest/_sources",
        "https://api.python.langchain.com/en/latest/_modules",
    ),
).load()
print(docs[0].page_content)

In [23]:
# Load the documentation pages via the sitemap loader and report the count.
docs_from_documentation = load_langchain_docs()
print(f"Loaded {len(docs_from_documentation)} docs from documentation")

Fetching pages: 100%|#################################################################################################################################################################| 1053/1053 [00:23<00:00, 43.95it/s]


Loaded 1053 docs from documentation


In [24]:
# Load the API reference via the recursive URL loader and report the count.
docs_from_api = load_api_docs()
print(f"Loaded {len(docs_from_api)} docs from API")

Loaded 1 docs from API


In [25]:
# Print the first API document object as a whole.
print(docs_from_api[0])



In [27]:
# Print only the extracted text of the first API document.
print(docs_from_api[0].page_content)

langchain API Reference — 🦜🔗 LangChain 0.0.339rc1

API

Core

Experimental

Python Docs

Toggle Menu

Prev
Up
Next

LangChain 0.0.339rc1

langchain API Reference
langchain.adapters
Classes
Functions

langchain.agents
Classes
Functions

langchain.agents.format_scratchpad
Functions

langchain.agents.output_parsers
Classes
Functions

langchain.cache
Classes
Functions

langchain.callbacks
Classes
Functions

langchain.chains
Classes
Functions

langchain.chat_loaders
Classes
Functions

langchain.chat_models
Classes
Functions

langchain.docstore
Classes

langchain.document_loaders
Classes
Functions

langchain.document_transformers
Classes
Functions

langchain.embeddings
Classes
Functions

langchain.evaluation
Classes
Functions

langchain.graphs
Classes
Functions

langchain.hub
Functions

langchain.indexes
Classes
Functions

langchain.llms
Classes
Functions

langchain.memory
Classes
Functions

langchain.model_laboratory
Classes

langchain.output_parsers
Classes
Functions

langchain.prompts
Cla

In [18]:
# Crawl the wiki page and report how many documents were loaded.
docs_from_wiki = load_wiki_docs()
print(f"Loaded {len(docs_from_wiki)} docs from wiki")

###load() start
###requests.get start
Loaded 1 docs from API


In [28]:
# Print the extracted text of the first wiki document.
print(docs_from_wiki[0].page_content)

03. 검색 인프라 - 추천 검색 플랫폼 - Global Site

Skip to main content
assistive.skiplink.to.breadcrumbs
assistive.skiplink.to.header.menu
assistive.skiplink.to.action.menu
assistive.skiplink.to.quick.search

윤태형(TaeHyoung Yun)/커머스플랫폼개발팀/SKP

			                                Personal space
                    

			                                Recently viewed
                    

			                                Recently worked on
                    

			                                User dashboard
                    

			                                Profile
                    

			                                Tasks
                    

			                                Saved for later
                    

			                                Watches
                    

			                                Drafts
                    

			                                Network
                    

			                                Settings
                    

			            

In [121]:
# NOTE(review): `_clear` is not defined in any visible cell, so this fails
# with NameError on a fresh kernel. Presumably it resets the index before
# the ingest run below - confirm and define it earlier in the notebook.
_clear()

In [122]:
# Print the current local time; this cell sits just before the ingest run.
now = datetime.datetime.now()
print(now.strftime("%Y-%m-%d %H:%M:%S"))

2023-11-28 15:00:28


In [123]:
# Use the first four documentation pages as a small sample for a quick
# indexing run. The slice also tolerates fewer than four loaded pages.
docs_selected = docs_from_documentation[:4]

In [124]:
# Index the sampled documentation pages together with the API documents.
ingest_docs(docs_selected, docs_from_api)

Start docs transform
Start index
Indexing stats:  {'num_added': 20, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [125]:
# Print the current local time again; this cell sits just after the ingest run.
now = datetime.datetime.now()
print(now.strftime("%Y-%m-%d %H:%M:%S"))

2023-11-28 15:00:34


In [126]:
# Count the keys held by the record manager; the bare last expression is the
# cell's displayed output.
# NOTE(review): `record_manager` is not defined in any visible cell.
keys = record_manager.list_keys()
len(keys)

20

In [127]:
# Query the vector store for up to 30 results matching "initial".
# NOTE(review): `vectorstore` is not defined in any visible cell, and `docs`
# is also assigned by the wiki preview cell earlier in the notebook.
docs = vectorstore.similarity_search("initial", k=30)

In [128]:
# Show how many results came back and the text of the first one.
print(len(docs))
print(docs[0].page_content)

21
Initial document
