### Embedding Techniques - Convert text into vectors
* OpenAI (paid service)
* Ollama (open source)
* Hugging Face (open source)

In [14]:
### Load environment variables from a local .env file

import os
from dotenv import load_dotenv

load_dotenv()  # last expression of the cell, so its return value is displayed

True

In [15]:
# Make sure the OpenAI key is present before any client is created.
# os.getenv() returns None when the variable is missing, and assigning None
# into os.environ fails with an opaque "TypeError: str expected, not NoneType",
# so fail fast with an actionable message instead.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = _openai_api_key
del _openai_api_key  # do not leave the secret lying around in a notebook variable

In [16]:
# Create an OpenAI embeddings client that uses the "text-embedding-3-large" model.
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)
embeddings  # display the configured client


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000002054E6EC8C0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000002054E6ECD40>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [17]:
# Embed a sample sentence and display the resulting vector.
text = "Hello World"
query_result = embeddings.embed_query(text)
query_result

[0.0007678096299059689,
 -0.013973488472402096,
 0.002776175504550338,
 0.04117957875132561,
 0.015369114466011524,
 0.002935552503913641,
 0.0007559640216641128,
 0.0741576999425888,
 0.027378389611840248,
 0.03475280851125717,
 -0.0010838714661076665,
 0.00421487633138895,
 -0.014231937937438488,
 -0.015170970000326633,
 -0.010441348887979984,
 0.03273690491914749,
 -0.02710271067917347,
 -0.004884690511971712,
 -0.010811792686581612,
 -0.014395622536540031,
 0.03797480836510658,
 0.009915836155414581,
 -0.02844664640724659,
 0.0454181469976902,
 -0.004755465779453516,
 0.016514906659722328,
 -0.0033749162685126066,
 0.0069393618032336235,
 -0.0207793191075325,
 -0.03273690491914749,
 0.004354869481176138,
 0.013206755742430687,
 0.00889065396040678,
 0.020003970712423325,
 0.03387407958507538,
 0.0011630215449258685,
 0.036941010504961014,
 0.01482637133449316,
 0.004165340214967728,
 0.0039198133163154125,
 0.033839620649814606,
 0.023760098963975906,
 -0.011432071216404438,
 0.034

In [18]:
# Length of the vector produced by `embeddings`, which was created without an explicit `dimensions`.
len(query_result)

3072

In [19]:
### Request a custom vector size (dimensions=1024) instead of the model's default

embeddings_1024 = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
)

In [20]:
# Embed the same sentence with the 1024-dimension client and check the vector length.
# Note: this reassigns `text` and `query_result` from the earlier cell.
text = "Hello World"
query_result = embeddings_1024.embed_query(text)
len(query_result)

1024

In [None]:
### Prepare the documents for the vector store: load the PDF, then split it into chunks

## 1. Load the PDF document with PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('attention_is_all_you_need.pdf')
docs = loader.load()

## 2. Split the document with RecursiveCharacterTextSplitter

from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
final_documents = text_splitter.split_documents(docs)

# The Chroma vector store is built from `final_documents` in the next cell.
# It is deliberately not built here as well: calling Chroma.from_documents in
# both cells would embed every chunk twice on "Run All" and may store the
# chunks twice in Chroma's default collection.

# Only the last expression of a cell is displayed, so the former mid-cell
# `docs` / `final_documents` lines showed nothing; preview a few chunks here.
final_documents[:3]




[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention_is_all_you_need.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu'),
 Document(metadata=

In [26]:
# Embed every chunk with the 1024-dimension client and index the vectors in Chroma.
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(
    documents=final_documents,
    embedding=embeddings_1024,
)
db

<langchain_community.vectorstores.chroma.Chroma at 0x2055f4032f0>

In [None]:
### Retrieve results from the vector store by running a similarity search

query = "Ashish Vaswani"
retrieved_results = db.similarity_search(query)
print(retrieved_results)

[Document(metadata={'source': 'attention_is_all_you_need.pdf', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'moddate': '2024-04-10T21:11:43+00:00', 'author': '', 'creator': 'LaTeX with hyperref', 'trapped': '/False', 'creationdate': '2024-04-10T21:11:43+00:00', 'keywords': '', 'title': '', 'total_pages': 15, 'producer': 'pdfTeX-1.40.25', 'page': 0, 'page_label': '1', 'subject': ''}, page_content='the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and\nhas been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head\nattention and the parameter-free position representation and became the other person involved in nearly every\ndetail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and'), Document(metadata={'page_label': '1', 'subject': '', 'keywords': '', 'creationdate': 