# LangChain: Chat with your data

## Document loaders

In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv("secrets.env", raise_error_if_not_found=True))

openai.api_key = os.environ["OPENAI_API_KEY"]

### PDF loader

In [2]:
# import the loader
from langchain.document_loaders import PyPDFLoader

# load the document
loader = PyPDFLoader(file_path="Data/Generative AI design for building structures.pdf")
pages = loader.load()
firstPage = pages[0]

# see what's inside
print(firstPage.page_content[:500])
print("")

# see the metadata of this first page
for k, v in firstPage.metadata.items():
	print(f"{k}: {v}") 

Automation in Construction 157 (2024) 105187
Available online 11 November 2023
0926-5805/© 2023 Elsevier B.V. All rights reserved.Review 
Generative AI design for building structures 
Wenjie Liaoa, Xinzheng Lua,*, Yifan Feib, Yi Gub, Yuli Huanga 
aKey Laboratory of Civil Engineering Safety and Durability of Ministry of Education, Tsinghua University, Beijing 100084, China 
bBeijing Engineering Research Center of Steel and Concrete Composite Structures, Tsinghua University, Beijing 100084, China 

source: Data/Generative AI design for building structures.pdf
page: 0


### YouTube Loader

Something is wrong with yt_dlp

In [None]:
'''
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# url of the video
# url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
url="https://youtu.be/eHzoTLwx01E?si=xLkpwI4rvuW1FEpJ"

# local path to save audio
path = "Data/youtube/"

# build the generic loader with the youtube loader and the parser
loader = GenericLoader(
	blob_loader=YoutubeAudioLoader(
		urls=[url], #NB: urls argument must be a list
		save_dir=path
	),
	blob_parser=OpenAIWhisperParser()
)

# load the youtube audio
audioDoc = loader.load()

# see what it understood
print(audioDoc[:500])
'''

In [3]:
from langchain.document_loaders import WebBaseLoader

# url for the loader
url = "https://github.com/basecamp/handbook/blob/master/37signals-is-you.md"

# create the loader
loader = WebBaseLoader(web_path=url)

# load the page content
webDoc = loader.load()

# print the page content
# print(webDoc[0].page_content[0:500])


## Document splitting

In [4]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv("secrets.env", raise_error_if_not_found=True))

openai.api_key = os.environ["OPENAI_API_KEY"]

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

chunk_size = 26
chunk_overlap = 10

recursiveSplitter = RecursiveCharacterTextSplitter(
	chunk_size = chunk_size,
	chunk_overlap = chunk_overlap,
)

characterSplitter = CharacterTextSplitter(
	chunk_size = chunk_size,
	chunk_overlap = chunk_overlap,
	separator = ""
)

# first example
text1 = 'abcdefghijklmnopqrstuvwxyz'
print(f"recursive splitting from text1 = {recursiveSplitter.split_text(text1)}")
print(f"character splitting from text1 = {characterSplitter.split_text(text1)}\n")

# second example
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
print(f"recursive splitting from text2 = {recursiveSplitter.split_text(text2)}")
print(f"character splitting from text2 = {characterSplitter.split_text(text2)}\n")

# third example
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
print(f"recursive splitting from text3 = {recursiveSplitter.split_text(text3)}")
print(f"character splitting from text3 = {characterSplitter.split_text(text3)}")


recursive splitting from text1 = ['abcdefghijklmnopqrstuvwxyz']
character splitting from text1 = ['abcdefghijklmnopqrstuvwxyz']

recursive splitting from text2 = ['abcdefghijklmnopqrstuvwxyz', 'qrstuvwxyzabcdefg']
character splitting from text2 = ['abcdefghijklmnopqrstuvwxyz', 'qrstuvwxyzabcdefg']

recursive splitting from text3 = ['a b c d e f g h i j k l m', 'i j k l m n o p q r s t u', 'q r s t u v w x y z']
character splitting from text3 = ['a b c d e f g h i j k l m', 'i j k l m n o p q r s t u', 'q r s t u v w x y z']


In [7]:
# More detailled regarding document splitting

some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

recursiveSplitter = RecursiveCharacterTextSplitter(
	chunk_size = 150,
	chunk_overlap = 0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
    # separators=["\n\n", "\n", ". ", " ", ""]
)

[print(f"Block: {txt}\n") for txt in recursiveSplitter.split_text(some_text)]


Block: When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,

Block: closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.

Block: Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this

Block: string. Sentences have a period at the end, but also, have a space.and words are separated by space.


[None, None, None, None]

## Token splitting

In [12]:
from langchain.text_splitter import TokenTextSplitter

textSplitter1 = TokenTextSplitter(
	chunk_size=1,
	chunk_overlap=0
)

text1 = "coucou c'est moi!"

textSplitter1.split_text(text1)

textSplitter2 = TokenTextSplitter(
	chunk_size = 10,
	chunk_overlap = 0
)

textSplitter2.split_text(text1)

["coucou c'est moi!"]

In [15]:
docs = textSplitter2.split_documents(pages)
docs[0].metadata

{'source': 'Data/Generative AI design for building structures.pdf', 'page': 0}

## Context aware splitting

In [3]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdownDoc = '''# Artificial Intelligence

## Interesting Articles

### AI for Architecture

- [7 Top AI Tools for Generating Smart Architectural Plans](https://architizer.com/blog/practice/tools/top-ai-tools-for-generating-architectural-plans/)

## Interesting AI tools

- perplexity.ai : give concise answers to question while citing source
- lumalabs: create 3d représentation based on a video
- scispace: use to get answer only based on research papers content. can also be used to help understand paper (like summarize complex topics)
- google SGE: similar to perplexity, but from google
- '''

headerToSplittOn = [
	("#", "Header 1"),
	("##", "Header 2"),
	("###", "Header 3"),
]

markdownSplitter = MarkdownHeaderTextSplitter(
	headers_to_split_on = headerToSplittOn
)

splittedMD = markdownSplitter.split_text(markdownDoc)