In [4]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

In [26]:
# Read the table-definition file and cut it into small chunks for embedding.
SCHEMA_PATH = 'database_tables.json'
SPLIT_OPTIONS = dict(chunk_size=200, chunk_overlap=0)  # 200-char chunks, no overlap

loader = TextLoader(SCHEMA_PATH)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(**SPLIT_OPTIONS)
docs = text_splitter.split_documents(documents)

# Number of chunks produced by the splitter.
print(len(docs))


In [27]:
# Embedding client built with default settings.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings = OpenAIEmbeddings()
# Build a FAISS vector store from the chunked documents using that client.
db = FAISS.from_documents(docs, embeddings)

In [28]:
# Query the index and compare the corpus size with the number of hits returned.
# The hits are stored under their own name rather than rebinding `docs`:
# overwriting `docs` made this cell non-idempotent (a second run printed the hit
# count twice) and would make a re-run of the index-building cell embed only the
# search hits instead of the full chunk list.
print(len(docs))  # number of chunks in the corpus
query = "camareros"
matches = db.similarity_search(query)
print(len(matches))  # number of hits returned for the query

58
4


In [29]:
# Run the same query again, this time getting (document, score) pairs back.
# NOTE(review): for FAISS the score is presumably a distance (lower = closer) — confirm against the LangChain docs.
docs_and_scores = db.similarity_search_with_score(query)

# Show the first pair as a (document, score) tuple.
best_doc, best_score = docs_and_scores[0]
print((best_doc, best_score))

(Document(page_content='"camareros": "(\\"ID\\" integer NOT NULL PRIMARY KEY AUTOINCREMENT, \\"Nombre\\" varchar(100) NOT NULL, \\"Apellidos\\" varchar(100) NOT NULL, \\"Email\\" varchar(50) NULL, \\"Pass\\" varchar(100) NULL,', metadata={'source': 'database_tables.json'}), 0.27137944)


In [30]:
# Persist the index to disk, load it back, and repeat the query to check that
# the reloaded store still answers it.
# NOTE(review): LangChain's FAISS.load_local restores the docstore with pickle,
# so only load index folders that you created or otherwise trust.
INDEX_DIR = "faiss_index"  # single source of truth for the save/load location

db.save_local(INDEX_DIR)
new_db = FAISS.load_local(INDEX_DIR, embeddings)

# Use a dedicated name instead of rebinding the shared `docs` variable, so
# running this cell never changes what other cells see in `docs`.
reloaded_matches = new_db.similarity_search(query)
print(reloaded_matches[0])

page_content='"camareros": "(\\"ID\\" integer NOT NULL PRIMARY KEY AUTOINCREMENT, \\"Nombre\\" varchar(100) NOT NULL, \\"Apellidos\\" varchar(100) NOT NULL, \\"Email\\" varchar(50) NULL, \\"Pass\\" varchar(100) NULL,' metadata={'source': 'database_tables.json'}


In [11]:
# Build two single-text stores ("foo" first, then "bar") with the same embedding client.
db1, db2 = (FAISS.from_texts([text], embeddings) for text in ("foo", "bar"))

# Inspect the first store's id -> Document mapping.
# NOTE: `_dict` is a private attribute of the docstore; used here for inspection only.
print(db1.docstore._dict)



{'649197f4-94b4-4e8f-8406-61f58c962d9c': Document(page_content='foo', metadata={})}


In [12]:
# Print the docstore object itself to see which docstore class backs the FAISS store.
print(db1.docstore)


<langchain.docstore.in_memory.InMemoryDocstore object at 0x7f120e4a5930>


In [3]:
%pip install -U scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Create the embedding client.
embeddings = OpenAIEmbeddings()

# Embed the Spanish word "vaca" and the English word "cow".
embedding_vector1 = embeddings.embed_query("vaca")
embedding_vector2 = embeddings.embed_query("cow")

# The pairwise helpers are called with each vector wrapped as a one-row matrix,
# so each result comes back as a 1x1 array.
first_row = [embedding_vector1]
second_row = [embedding_vector2]
cosine_sim = cosine_similarity(first_row, second_row)
euclidean_dist = euclidean_distances(first_row, second_row)

print("Cosine Similarity: ", cosine_sim)
print("Euclidean Distance: ", euclidean_dist)


Cosine Similarity:  [[0.82063711]]
Euclidean Distance:  [[0.59893719]]
