In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is how the OpenAI API key reaches
# OpenAIEmbeddings() later in the notebook — confirm against your .env.
load_dotenv()

True

## Load document

In [2]:
# Read the speech transcript from disk, decoding it as UTF-8.
loader = TextLoader('state_of_the_union.txt', encoding='utf-8')
documents = loader.load()

# Show the loaded Document objects and then how many there are
# (only one file was given to the loader, so the count is 1).
print(documents, len(documents), sep="\n")

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citize

## Split document into chunks

In [3]:
# Configure the splitter: chunk size 1000 with an overlap of 20 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)

# Break the loaded document(s) into chunk-sized Document objects.
texts = text_splitter.split_documents(documents)

# Show every chunk, then the number of chunks produced.
print(texts, len(texts), sep="\n")

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source'

In [4]:
# Inspect the first chunk on its own; the printed form shows its
# page_content followed by its metadata.
print(texts[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.' metadata={'source': 'state_of

## Embed a query with OpenAIEmbeddings

In [5]:
# Create the OpenAI embeddings client with its default configuration.
embeddings = OpenAIEmbeddings()

# Embed a short test sentence as a quick check that the embedding call works.
vector = embeddings.embed_query('Testing the embedding model')

print(len(vector))  # length of the embedding vector: 1536 dimensions

1536


## Embed document chunks

In [6]:
# Embed the page_content of the first five chunks only.
doc_vectors = embeddings.embed_documents([t.page_content for t in texts[:5]])

print(len(doc_vectors))  # 5 vectors in the output, one per embedded chunk
print(doc_vectors[0])    # this will output the first chunk's 1536-dimensional vector

5
[-0.003596715552129443, -0.010378713756902016, -0.01855427425725525, -0.01791721800612337, 0.005879501771392206, 0.020080555363391726, 0.014944286661088789, -0.009562485027685707, -0.00304095001775157, -0.0065862363119260844, 0.0153159029628028, 0.008898884222818546, -0.020996324772131605, 0.0008809298565852889, 0.0019493268986360386, 0.006334068173714612, 0.0182622897541021, -0.014758478510231784, 0.029623131510114722, -0.022336797727411074, 0.009841197253971215, -0.01312602012047667, 0.015368991138950872, 0.01243587569319672, -0.0005752588126774586, 0.011128582964425107, 0.03694928491738878, -0.03405598630223755, 0.019151514609106697, -0.024513408292869566, -0.01239605979391629, -0.029702765171320575, -0.02395598384029855, -0.004754698425768818, -0.024924840493864005, -0.021049412016957186, -0.0133516451019366, -0.012303155718487786, 0.0036431675898436944, -0.018527729703519968, 0.007372603116682071, -0.002374031376498122, -0.011115310687557466, 0.0057865976959637026, -0.0247257591

## Connect to PostgreSQL and save the embedding vectors

In [7]:
import os
from urllib.parse import quote_plus

from langchain.vectorstores.pgvector import PGVector

# Database settings come from environment variables (for example a local
# .env file read by load_dotenv()) so that no password is stored in the
# notebook. Non-secret settings fall back to the values this notebook was
# originally written against; the password has no fallback on purpose.
PG_HOST = os.environ.get("PGVECTOR_HOST", "localhost")
PG_PORT = os.environ.get("PGVECTOR_PORT", "5432")
PG_DATABASE = os.environ.get("PGVECTOR_DATABASE", "vector_db")
PG_USER = os.environ.get("PGVECTOR_USER", "postgres")
PG_PASSWORD = os.environ.get("PGVECTOR_PASSWORD")
if not PG_PASSWORD:
    raise RuntimeError(
        "PGVECTOR_PASSWORD is not set; define it in the environment or in "
        "your .env file before running this cell."
    )

# quote_plus() escapes characters such as '@' or '/' that would otherwise
# break the URL when they appear in the user name or password.
CONNECTION_STRING = (
    f"postgresql+psycopg2://{quote_plus(PG_USER)}:{quote_plus(PG_PASSWORD)}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DATABASE}"
)
COLLECTION_NAME = 'state_of_union_vectors'

# Embed every chunk with the embeddings client and store the results in the
# named collection of the PostgreSQL database.
db = PGVector.from_documents(
    embedding=embeddings,
    documents=texts,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [8]:
# Display the vector store object to confirm it was created.
db

<langchain_community.vectorstores.pgvector.PGVector at 0x26dffee1780>

## Similarity search

In [11]:
# Natural-language question to look up in the stored chunks.
query = "What did the president say about Russia"

# Ask the vector store for the two closest matches, returned with scores.
similar = db.similarity_search_with_score(query, k=2)

# Print each result followed by a blank line to keep them visually apart.
for doc in similar:
    print(doc)
    print()

(Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source'

In [9]:
# Embed the search question itself to see the raw vector behind the query.
# NOTE(review): this relies on `query` from the similarity-search cell, so
# that cell must run first; it also rebinds `vector`, replacing the test
# embedding created earlier in the notebook.
vector = embeddings.embed_query(query)
print(vector)

[-0.020203065393189588, -0.01177435647779177, 0.005228382587756116, -0.003704110333406159, -0.0036911928903170708, -0.01228460013642327, -0.013795954947684137, -0.019117988997836143, -0.010876585801197611, -0.005964683363091666, 0.03903686691777578, 0.006071253199899147, -0.00693027108156479, -0.03014958597874782, 0.013692614471648929, -0.00288061588589676, 0.027617744781546452, -0.03456739063076112, 0.05487379743130842, -0.015953187501334436, -0.004385511975613083, -0.012807761238143858, 0.003213243072238917, -0.007317798099527448, -0.016301960327384825, -0.0010180651802684836, 0.026067636709695818, -0.024853385883451488, 0.0007601177184846517, -0.012956312764990876, 0.008357662047085332, -0.02808277645804364, -0.02155940809341389, -0.006281163978403087, -0.020616427297330424, -0.027281887535940148, -0.00739530322372323, -0.020500168446883626, 0.009610665202596926, -0.012930477878812699, 0.022476555865964185, 0.0070013179498772775, 0.01636654940547527, -0.0038914151208429442, -0.01063

## Run SQL in notebook

In [None]:
pip install ipython-sql
pip install sqlalchemy