In [2]:
import requests

In [3]:
res = requests.get("https://python.langchain.com/en/latest/")
res

<Response [200]>

In [4]:
from bs4 import BeautifulSoup
import urllib.parse
import html
import re

In [5]:
domain = "https://python.langchain.com/"
domain_full = domain+"en/latest/"

In [6]:
soup = BeautifulSoup(res.text, 'html.parser')

# Find all links to local pages on the website

local_links = []
for link in soup.find_all('a',href=True):
    href=link['href']
    if href.startswith(domain) or href.startswith('./') \
        or href.startswith('/') or href.startswith('modules') \
        or href.startswith('user_cases'):
        local_links.append(urllib.parse.urljoin(domain_full,href))

# Find the main content using CSS selectors
main_content = soup.select('body main')[0]

# Extract the HTML code of the main content
main_content_html = str(main_content)

# Extract the plaintext of the main content
main_content_text = main_content.get_text()

# Remove all HTML tags
main_content_text = re.sub(r'<[^>]+>','',main_content_text)

# Remove extract white space
main_content_text = ' '.join(main_content_text.split())

# Replace HTML entities with their corresponding characters
main_content_text = html.unescape(main_content_text)

print(main_content_text)

.rst .pdf Welcome to LangChain Contents Getting Started Modules Use Cases Reference Docs LangChain Ecosystem Additional Resources Welcome to LangChain# LangChain is a framework for developing applications powered by language models. We believe that the most powerful and differentiated applications will not only call out to a language model via an API, but will also: Be data-aware: connect a language model to other sources of data Be agentic: allow a language model to interact with its environment The LangChain framework is designed with the above principles in mind. This is the Python specific portion of the documentation. For a purely conceptual guide to LangChain, see here. For the JavaScript documentation, see here. Getting Started# Checkout the below guide for a walkthrough of how to get started using LangChain to create an Language Model application. Getting Started Documentation Modules# There are several main modules that LangChain provides support for. For each module we provid

In [7]:
def scrape(url: str):
    res = requests.get(url)
    if res.status_code != 200:
        print(f"{res.status_code} for '{url}'")
        return None
    soup = BeautifulSoup(res.text, 'html.parser')

    # Find all links to local pages on the website
    local_links = []
    for link in soup.find_all('a',href=True):
        href=link['href']
        if href.startswith(domain) or href.startswith('./') \
            or href.startswith('/') or href.startswith('modules') \
            or href.startswith('user_cases'):
            local_links.append(urllib.parse.urljoin(domain_full,href))

    # Find the main content using CSS selectors
    main_content = soup.select('body main')[0]

    # Extract the HTML code of the main content
    main_content_html = str(main_content)

    # Extract the plaintext of the main content
    main_content_text = main_content.get_text()

    # Remove all HTML tags
    main_content_text = re.sub(r'<[^>]+>','',main_content_text)

    # Remove extract white space
    main_content_text = ' '.join(main_content_text.split())

    # Replace HTML entities with their corresponding characters
    main_content_text = html.unescape(main_content_text)

    # Return as JSON
    return {
        "url":url,
        "text":main_content_text
    }, local_links

In [8]:
links = ["https://python.langchain.com/en/latest/"]
scraped = set()
data = []

while True:
    if len(links) == 0:
        print("Complete")
        break
    url = links[0]
    print(url)
    res = scrape(url)
    scraped.add(url)
    if res is not None:
        page_content, local_links = res
        data.append(page_content)
        # add new links to links list
        links.extend(local_links)
        # remove duplicates
        links = list(set(links))
    # remove links 
    links = [link for link in links if link not in scraped]

https://python.langchain.com/en/latest/
https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/powerpoint.html
https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/dataframe.html
https://python.langchain.com/en/latest/use_cases/personal_assistants.html
https://python.langchain.com/en/latest/modules/chains/examples/constitutional_chain.html
https://python.langchain.com/en/latest/modules/agents/tools/examples/python.html
https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html
https://python.langchain.com/en/latest/modules/models/llms/examples/streaming_llm.html
https://python.langchain.com/en/latest/modules/models/llms/examples/llm_serialization.html
https://python.langchain.com/en/latest/modules/indexes/text_splitters.html
https://python.langchain.com/en/latest/modules/models/llms/examples/llm_caching.html
https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/youtube.html
ht

In [9]:
import tiktoken

In [10]:
tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 20,
    length_function = tiktoken_len,
    separators=["\n\n", "\n", " ",""]
)

Process the data into more chunks using this approach.

In [12]:
from uuid import uuid4
from tqdm.auto import tqdm

chunks = []

for idx, record in enumerate(tqdm(data)):
    texts = text_splitter.split_text(record['text'])
    chunks.extend([{
        'id': str(uuid4()),
        'text': texts[i],
        'chunk': i,
        'url': record ['url']
    } for i in range(len(texts))])

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 269/269 [00:01<00:00, 153.27it/s]


Our chunks are ready so now we move onto embedding and indexing everything

# Initialize Embedding Model

We use text-embedding-ada-002 as the embedding model. We can embed text like so.

In [13]:
import openai
import os
from dotenv import load_dotenv

load_dotenv()
# environment variables

True

In [14]:
# Initialize openai API key
openai.api_key = os.getenv("OPENAI_API_KEY")

embed_model = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], engine=embed_model
)

In the response res we will find a JSON-like object containing our new embeddings within the 'data' field.

In [15]:
res.keys()

dict_keys(['object', 'data', 'model', 'usage'])

Inside 'data' we will find two records, one for each of the two sentences we just embedded. Each vector embedding contains 1536 dimensions - we may just switch to another method of embedding from the other video

In [16]:
len(res['data'])

2

In [17]:
len(res['data'][0]['embedding']), len(res['data'][1]['embedding'])

(1536, 1536)

 We will apply this same embedding logic to the langchain docs dataset we've just scraped. But before doing so we must create a place to store the embeddings.

# Initializing the Index

Now we need to place to store these embeddings and enable a effecient vector search through them all. To do that we use Pinecone, we can get a free API key and enter it below where we will initialize our connection to Pinecone and create a new index.

In [19]:
import pinecone

In [None]:
index_name = 'gpt-4-langchain-docs'

# Initialize connection to pinecone
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY")
    environment="us-central1-gcp"
)

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='dotproduct'
    )
# Connect to index
index = pinecone.GRPCIndex(index_name)
# view index stats
index.describe_index_state()