# Embeddings

### Top Down approach.
> We will see the bigger picture of embeddings and then dive into details of each of the modules

In [1]:
# if any import fails, pip install <module>
import os
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings

### A very simple Document to search. 
> The document has details about the investment policy for bank ICICI

In [2]:
#Load data
import json
# Just read about json structure - https://www.w3schools.com/js/js_json_intro.asp
with open('icici_policy.json') as f:
   data = json.load(f)

In [3]:
data

{'RateOfReturns': 'ICICI Investment Company aims to achieve an average annual rate of return between 6% and 8% for clients over the long term.',
 'InvestmentPortfolio': 'We invest in equity stocks, fixed-income securities, and alternative investments to provide diversification and growth opportunities.',
 'ChargesAndFees': 'ManagementFees: Annual fee based on a percentage of assets under management (AUM). PerformanceFees: Fees based on gains above a specified benchmark for certain investment strategies.',
 'Eligibility': 'Any person over the age of 18 with a minimum initial investment of $5,000 is eligible. Corporations and legal entities must meet our corporate account application requirements. Institutional clients may engage our services based on their specific needs and investment guidelines.'}

In [4]:
#one can access the heading of the above document as
list(data.keys())

['RateOfReturns', 'InvestmentPortfolio', 'ChargesAndFees', 'Eligibility']

In [5]:
#one can access the content of each document by
list(data.values())

['ICICI Investment Company aims to achieve an average annual rate of return between 6% and 8% for clients over the long term.',
 'We invest in equity stocks, fixed-income securities, and alternative investments to provide diversification and growth opportunities.',
 'ManagementFees: Annual fee based on a percentage of assets under management (AUM). PerformanceFees: Fees based on gains above a specified benchmark for certain investment strategies.',
 'Any person over the age of 18 with a minimum initial investment of $5,000 is eligible. Corporations and legal entities must meet our corporate account application requirements. Institutional clients may engage our services based on their specific needs and investment guidelines.']

### Load model
> In AI/ML "model" is defined as something that takes some input -> Process/Analyse -> Give some insights/predictions output.


In [11]:
#Dont worry about this whole section, we will come back to such models in detail when we revisit back
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "BAAI/bge-small-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} 
bge_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    multi_process=True
)
model = bge_embeddings.embed_documents

> The above is a model, that takes text input and gives out some numbers.
> Note: AI (ML) can only process numbers, so we need to convert them.  
> But these numbers have special meaning, which we will cover next. 

In [14]:
# if you pass the text dog to the model it outputs a array of 384 numbers
model(['dog'])

[[-0.4804138243198395,
  -0.6583849191665649,
  0.43524760007858276,
  -0.23052063584327698,
  -0.1805112063884735,
  0.10275222361087799,
  0.31086474657058716,
  0.26580098271369934,
  0.2078389674425125,
  0.3304010033607483,
  0.025775734335184097,
  -1.023865818977356,
  0.1734735667705536,
  0.32454586029052734,
  0.1383545696735382,
  -0.13046517968177795,
  0.573422908782959,
  0.3792475759983063,
  -1.0564801692962646,
  0.17084535956382751,
  0.261990487575531,
  -0.33145102858543396,
  -0.18109270930290222,
  -0.5667657852172852,
  -0.12710076570510864,
  0.27342069149017334,
  -0.16510488092899323,
  -0.017760008573532104,
  -0.1707301139831543,
  -0.9235773682594299,
  0.088602714240551,
  -0.15321002900600433,
  0.08092811703681946,
  0.09473992884159088,
  0.055353812873363495,
  -0.12070296704769135,
  0.17661041021347046,
  0.22277523577213287,
  -0.18887118995189667,
  0.34006938338279724,
  0.5352292060852051,
  -0.07180988043546677,
  -0.5368436574935913,
  -0.47078

In [19]:
len(model(['dog'])[0])

384

In [20]:
# same thing but with a different input
model(['puppy'])

[[-0.3395482897758484,
  -0.5758752226829529,
  0.6420542001724243,
  -0.16036367416381836,
  -0.05526722967624664,
  0.36452025175094604,
  0.4063061773777008,
  0.2534646689891815,
  0.17506824433803558,
  0.33891987800598145,
  0.01211346685886383,
  -0.8953868746757507,
  0.24685680866241455,
  0.43662571907043457,
  -0.048000723123550415,
  -0.2218220978975296,
  0.2969718277454376,
  0.31473803520202637,
  -0.9335151314735413,
  0.14872023463249207,
  0.48798367381095886,
  -0.3315374255180359,
  -0.37211742997169495,
  -0.5851240158081055,
  -0.1619105339050293,
  0.17509710788726807,
  -0.443093866109848,
  -0.10572026669979095,
  -0.336054265499115,
  -0.8408039212226868,
  0.0241989828646183,
  -0.14137671887874603,
  0.3090711832046509,
  0.09900540113449097,
  -0.016530081629753113,
  -0.04060970991849899,
  0.09887705743312836,
  0.4382137060165405,
  -0.18836233019828796,
  0.11868980526924133,
  0.3019285798072815,
  0.019770776852965355,
  -0.6287757754325867,
  -0.5603

> We said these numbers are special. Why?. Because we know dog and puppy means similar.
> So the numbers also should be similar.
> But how to tell numbers are similar? We introduce similarity metric

#### Similarity metric is a tool that can tell two vectors are similar. 
> We will study this in detail the next class, but just assume there is some tool that wil tell you two numbers are similar   
> We will use cosine similarity - It ranges from 0 to 1. Close to 1 means similar


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
#Now lets check dog and puppy
cosine_similarity(model(['puppy']), model(['dog']))

array([[0.94284609]])

> The above is close to 1, so the model is saying the two text are very similar

In [34]:
#but for other things like pizza they are not close
cosine_similarity(model(['pizza']), model(['dog']))

array([[0.84226208]])

### Now lets go back to the document

> We will pass the 4 content of the documents to the above model and get numbers for each content

In [35]:
embeddings = model(list(data.values()))

In [36]:
len(embeddings)

4

> Lets say a customer comes to ICICI website and wants to know about the investment policy, he will say type:


In [37]:
query = 'Can you tell me the investment returns of your policy'

In [38]:
#Now the query needs to be converted to numbers and then comapred with the 4 numbers of the document.
# Which ever number is closer to 1, that is the selected

In [43]:
query_embed = model([query])

In [51]:
#We check which 4 section in the document is matching the query
for idx, section in enumerate(embeddings):
    print(idx, cosine_similarity([section], query_embed))

0 [[0.83240203]]
1 [[0.82541288]]
2 [[0.81347079]]
3 [[0.81501587]]


> We see that the first one is the max and it points to RateOfReturns which is what the user asked

In [52]:
list(data.keys())

['RateOfReturns', 'InvestmentPortfolio', 'ChargesAndFees', 'Eligibility']

In [53]:
query = 'I have 10,000 to invest, can i invest?'

In [54]:
query_embed = model([query])
for idx, section in enumerate(embeddings):
    print(idx, cosine_similarity([section], query_embed))

0 [[0.77379655]]
1 [[0.81254974]]
2 [[0.75742801]]
3 [[0.83468744]]


> The last on is the max and it points to 'Elibility'.
> So based on what the user typed, it picks the right section, even though the words are not exact. 

### TO BE CONTINUED