# Document Transformers and Embeddings

In [1]:
import os
from langchain.text_splitter import CharacterTextSplitter

In [2]:
# Load the full speech into memory as one string.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, and the speech contains non-ASCII punctuation
# (an em dash) that would be mis-decoded on a non-UTF-8 locale.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('some_data/FDR_State_of_Union_1944.txt', encoding='utf-8') as file:
    speech_text = file.read()

In [5]:
# Quick size check of the loaded speech: total characters, and words obtained
# by splitting on whitespace.
print(
    f"Number of Characters - {len(speech_text)}",
    f"Number of Words - {len(speech_text.split())}",
    sep="\n",
)

Number of Characters - 21927
Number of Words - 3750


## Split by Character

In [6]:
# Break the speech at blank lines ("\n\n"); chunk_size=1000 is measured in
# characters for this splitter.
text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n\n")

# create_documents takes a list of input strings, hence the one-element list.
texts = text_splitter.create_documents([speech_text])

# Container type, a blank spacer, then the printed form of the first chunk.
print(type(texts), "\n", texts[0], sep="\n")

<class 'list'>


page_content='This Nation in the past two years has become an active partner in the world's greatest war against human slavery.

We have joined with like-minded people in order to defend ourselves in a world that has been gravely threatened with gangster rule.

But I do not think that any of us Americans can be content with mere survival. Sacrifices that we and our allies are making impose upon us all a sacred obligation to see to it that out of this war we and our children will gain something better than mere survival.

We are united in determination that this war shall not be followed by another interim which leads to new disaster- that we shall not repeat the tragic errors of ostrich isolationism—that we shall not repeat the excesses of the wild twenties when this Nation went for a joy ride on a roller coaster which ended in a tragic crash.'


In [13]:
# Summary of the character-based split: how many chunks were produced, the
# character length and text of the first one, and the type of each element.
print(
    f"Total Number of Chunks - {len(texts)}",
    f"Length of the first chunk - {len(texts[0].page_content)}",
    "\n",
    texts[0].page_content,
    "\n",
    f"Type of each of the chunk - {type(texts[0])}",
    sep="\n",
)

Total Number of Chunks - 28
Length of the first chunk - 841


This Nation in the past two years has become an active partner in the world's greatest war against human slavery.

We have joined with like-minded people in order to defend ourselves in a world that has been gravely threatened with gangster rule.

But I do not think that any of us Americans can be content with mere survival. Sacrifices that we and our allies are making impose upon us all a sacred obligation to see to it that out of this war we and our children will gain something better than mere survival.

We are united in determination that this war shall not be followed by another interim which leads to new disaster- that we shall not repeat the tragic errors of ostrich isolationism—that we shall not repeat the excesses of the wild twenties when this Nation went for a joy ride on a roller coaster which ended in a tragic crash.


Type of each of the chunk - <class 'langchain_core.documents.base.Document'>


## Split by Token

In [16]:
# Token-based sizing: chunk_size=500 is now counted in tiktoken tokens rather
# than characters.
# NOTE(review): presumably a target rather than a hard cap — a piece the
# splitter cannot break at its separator may still exceed it; confirm against
# the LangChain text-splitter docs.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)

# split_text is expected to return plain strings, so len() below would count
# characters, not tokens — TODO confirm.
texts = text_splitter.split_text(speech_text)

print(
    f"Total Number of Chunks - {len(texts)}",
    f"Length of the first chunk - {len(texts[0])}",
    "\n",
    texts[0],
    "\n",
    f"Type of each of the chunk - {type(texts[0])}",
    sep="\n",
)

Total Number of Chunks - 15
Length of the first chunk - 2332


This Nation in the past two years has become an active partner in the world's greatest war against human slavery.

We have joined with like-minded people in order to defend ourselves in a world that has been gravely threatened with gangster rule.

But I do not think that any of us Americans can be content with mere survival. Sacrifices that we and our allies are making impose upon us all a sacred obligation to see to it that out of this war we and our children will gain something better than mere survival.

We are united in determination that this war shall not be followed by another interim which leads to new disaster- that we shall not repeat the tragic errors of ostrich isolationism—that we shall not repeat the excesses of the wild twenties when this Nation went for a joy ride on a roller coaster which ended in a tragic crash.

When Mr. Hull went to Moscow in October, and when I went to Cairo and Teheran in November, we 

The key difference between using CharacterTextSplitter.from_tiktoken_encoder() and CharacterTextSplitter in LangChain lies in how they determine chunk size:

**`CharacterTextSplitter`**

- Splits text into chunks based on the number of characters.
- The `chunk_size` parameter sets the maximum number of characters in each chunk (above, `chunk_size=1000` produced a first chunk of 841 characters).
- Simple and straightforward, but might not be the most efficient for language models.

**`CharacterTextSplitter.from_tiktoken_encoder()`**

- Splits text into chunks based on the number of tokens, as determined by the specified tiktoken encoder.
- The `chunk_size` parameter now refers to the maximum number of tokens allowed in a chunk.
- More accurate for language models, as they operate on tokens, not characters. This keeps chunks more consistent with model limitations.

**In essence:**

- Use `CharacterTextSplitter` if you want to limit chunk size by number of characters.
- Use `CharacterTextSplitter.from_tiktoken_encoder()` if you want to limit chunk size by number of tokens, which is generally more suitable for optimal performance with language models.

## Text Embeddings

In [20]:
import os
from langchain_openai import OpenAIEmbeddings

# Look up the OpenAI key in the environment; the result is None when the
# variable is unset. Nothing in the visible cells passes `api_key` on
# explicitly.
api_key = os.environ.get("OPENAI_API_KEY")

In [22]:
# Sample sentence to embed.
text = "Some normal text to send to OpenAI to be embedded into a N dimensional vector"

# Embedding client for the `text-embedding-3-small` model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# embed_query turns the single string into one vector; the cell's final
# expression displays that vector's Python type.
embedded_text = embeddings.embed_query(text)
type(embedded_text)

list

In [25]:
# Number of components in the query embedding.
vector_size = len(embedded_text)
print(f"Length of the embedded text - {vector_size}")

Length of the embedded text - 1536


## Embed Documents

In [26]:
from langchain.document_loaders import CSVLoader

# Read the penguins CSV through LangChain's CSV loader.
# NOTE(review): presumably yields one Document per CSV row — confirm against
# the loader documentation.
loader = CSVLoader(file_path='some_data/penguins.csv')
data = loader.load()

In [27]:
# Inspect what the loader returned: the container type and the type of its
# first element.
print(
    f"Type of data - {type(data)}",
    f"Type of every element in the data - {type(data[0])}",
    sep="\n",
)

Type of data - <class 'list'>
Type of every element in the data - <class 'langchain_core.documents.base.Document'>


In [28]:
# Gather the text of every loaded document, then embed them all with one
# embed_documents call.
page_texts = [doc.page_content for doc in data]
embedded_docs = embeddings.embed_documents(page_texts)

In [31]:
# Shape check on the document embeddings: number of vectors, size of the
# first vector, and the container type.
print(
    f"Length of the embeded_docs - {len(embedded_docs)}",
    f"Length of each of the item in the embedded docs - {len(embedded_docs[0])}",
    f"Type of the embedded_docs - {type(embedded_docs)}",
    sep="\n",
)

Length of the embeded_docs - 344
Length of each of the item in the embedded docs - 1536
Type of the embedded_docs - <class 'list'>
