In [9]:
import os
import openai
import pandas as pd
import scipy
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from openai.embeddings_utils import cosine_similarity

# Load environment variables from a .env file (expects OPENAI_API_BASE and OPENAI_API_KEY).
# override=True lets values from .env replace variables already set in the process.
load_dotenv(override=True)

# Fail fast with an explicit message: os.getenv returns None for an unset variable,
# which would otherwise only surface as an unclear error on the first API call.
_missing_env = [name for name in ("OPENAI_API_BASE", "OPENAI_API_KEY") if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Configure Azure OpenAI Service API
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = os.getenv("OPENAI_API_BASE")  # looks like https://********.openai.azure.com/
openai.api_key = os.getenv("OPENAI_API_KEY")

# Initialize embedding model.
# NOTE(review): chunk_size=1 presumably limits each request to a single text, as
# required by the Azure deployment — confirm against the deployment's limits.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', deployment='text-embedding-ada-002', chunk_size=1)

In [3]:
# Sample text used to inspect what an embedding looks like
text = "Testo di prova"

# Ask the embedding model for the vector of this single string
e = embeddings.embed_query(text)

# Show the raw vector first, then its length on the following line
print(e, f'lunghezza vettore: {len(e)}', sep='\n')

[-0.011196190477539919, -0.011130600222658195, -0.001703696788985937, -0.022392380955079838, -0.013196680678577776, 0.021356061074677576, -0.016935303786416885, -0.0029613824990454814, -0.01810280231393745, -0.008611949150096634, 0.01070426589423342, 0.009845038863821496, -0.01567597871579739, -0.007083705617710444, 0.01528243904915219, -0.0010600961964569698, 0.02193325010223034, -0.0028629975823841816, 0.012488310209938989, -0.014036231656980012, 0.004076410079946141, 0.01911288683744508, -0.01288184987658419, 0.014652775582519869, -0.01062555796090438, -0.03200785439247659, 0.004332210397604235, -0.01832580750415468, 0.015505444239369422, -0.01833892611392457, 0.03137819092584426, -0.025370160086523177, -0.03061734881209364, -0.007175531229486799, 0.002446501978456179, -0.012940880826580969, -0.01861440388057621, -0.007451008530477153, 0.02109370005515068, 0.001679100559820612, 0.010507496060910818, 0.028334822470841774, 0.017132073619739483, -0.0367303229416384, -0.0126260490932648

In [4]:
def get_similarity_from_text(text1, text2):
    """Print the cosine similarity between the embeddings of two texts.

    Each text is embedded with the notebook-level ``embeddings`` model
    (``text1`` first, then ``text2``) and the two vectors are compared with
    ``cosine_similarity``. The score is printed together with both texts;
    the function returns nothing.
    """
    e1, e2 = (embeddings.embed_query(txt) for txt in (text1, text2))
    s = cosine_similarity(e1, e2)
    print(f"Somiglianza tra '{text1}' e '{text2}': {s}")

# Pairs to compare: single Italian words first, then country/state vs. city names.
# NOTE(review): "Switzerland " carries a trailing space; it is kept unchanged so the
# call matches the recorded output — confirm whether it is intentional.
word_pairs = [
    ("ragazzo", "ragazza"),
    ("ragazzo", "bambino"),
    ("ragazzo", "donna"),
    ("uomo", "cane"),
    ("Italia", "Roma"),
    ("Switzerland ", "Zurich"),
    ("California", "Los Angeles"),
]

for first, second in word_pairs:
    get_similarity_from_text(first, second)


Somiglianza tra 'ragazzo' e 'ragazza': 0.9371455305554139
Somiglianza tra 'ragazzo' e 'bambino': 0.8620923331020662
Somiglianza tra 'ragazzo' e 'donna': 0.8434832832512844
Somiglianza tra 'uomo' e 'cane': 0.815288305936189
Somiglianza tra 'Italia' e 'Roma': 0.891014763666595
Somiglianza tra 'Switzerland ' e 'Zurich': 0.911240100712202
Somiglianza tra 'California' e 'Los Angeles': 0.8623730743290187


In [5]:
# Two narrative passages about the same character (Sarah)
story1 = "Once upon a time, there was a little girl named Sarah. She lived with her family in a small village near the woods. Every morning Sarah would wake up early, get dressed, and go outside to play."
story2 = "One day, while Sarah was playing in the woods, she noticed a small rabbit hopping around in the grass. She decided to follow it, hoping to see where it would go. The rabbit kept hopping until it reached the entrance of a small cave."
# A legal-style text on an unrelated topic, used as a contrast
insurance_clause = "In the event of any losses or damages incurred by either party due to unforeseen circumstances, both parties agree to be liable for their respective liabilities and hold the other harmless for any and all damages and losses sustained."

# Compare the first passage with the second passage, then with the clause
for candidate in (story2, insurance_clause):
    get_similarity_from_text(story1, candidate)

Somiglianza tra 'Once upon a time, there was a little girl named Sarah. She lived with her family in a small village near the woods. Every morning Sarah would wake up early, get dressed, and go outside to play.' e 'One day, while Sarah was playing in the woods, she noticed a small rabbit hopping around in the grass. She decided to follow it, hoping to see where it would go. The rabbit kept hopping until it reached the entrance of a small cave.': 0.9084378968909292
Somiglianza tra 'Once upon a time, there was a little girl named Sarah. She lived with her family in a small village near the woods. Every morning Sarah would wake up early, get dressed, and go outside to play.' e 'In the event of any losses or damages incurred by either party due to unforeseen circumstances, both parties agree to be liable for their respective liabilities and hold the other harmless for any and all damages and losses sustained.': 0.7223016948630127


In [7]:
import pandas as pd

# Load the libraries dataset (semicolon-separated CSV, first row is the header)
df = pd.read_csv('data/libraries_dataset.csv', header=0, sep=';')

# Keep only the library_name and description columns. The explicit .copy() makes
# df an independent frame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning on a column-subset slice.
df = df[['library_name', 'description']].copy()

# Add an embedding column: embed_query is called once per description
df['embedding'] = df['description'].apply(embeddings.embed_query)

print('DataFrame popolato')


DataFrame popolato


In [8]:
# Free-text request; every library description in df is ranked against it.
request = "I need a python library to parse html code retrived with an HTTP request"
# Alternative request to try:
# "I need a python library to make graphs and visualizations"

# Embed the request with the same model used for the descriptions
request_embedding = embeddings.embed_query(request)

# Cosine similarity between the request and each stored description embedding.
# The resulting Series keeps the name 'embedding', so that is the score column below.
similarities = df['embedding'].apply(lambda vec: cosine_similarity(vec, request_embedding))

# Put each library name next to its score, most similar first, and show the top 10
recommendations = (
    df[['library_name']]
    .assign(embedding=similarities)
    .sort_values(by='embedding', ascending=False)
)
recommendations.head(10)

Unnamed: 0,library_name,embedding
9,BeautifulSoup,0.819023
8,Scrapy,0.817715
6,Scikit-learn,0.761493
3,Pandas,0.758129
7,PyTorch,0.755811
0,TensorFlow,0.7557
1,SciPy,0.746985
4,Matplotlib,0.724728
5,Keras,0.711553
2,Numpy,0.703839


In [None]:
import pandas as pd

# City names to embed, listed in three groups (Asia, United States, Europe)
cities = [
    'Jakarta', 'Hong Kong', 'Tokyo', 'Bangkok', 'Shanghai', 'Ho Chi Minh City', 'Beijing',
    'New York', 'Los Angeles', 'San Francisco', 'Chicago',
    'Paris', 'Rome', 'Barcelona', 'Madrid', 'Amsterdam', 'Berlin',
]

# One row per city: its name plus its embedding vector (one embed_query call per city)
cities_df = pd.DataFrame({
    "city": cities,
    "cities_embeddings": [embeddings.embed_query(name) for name in cities],
})


In [None]:
# use PCA to reduce dimensionality to 3
from sklearn.decomposition import PCA

# Fix the seed so the projection is reproducible across runs: with the default
# svd_solver='auto', scikit-learn may select its randomized SVD solver for a
# matrix with few rows and many columns.
pca = PCA(n_components=3, random_state=42)
# t has one row per city and one column per principal component
t = pca.fit_transform(cities_df['cities_embeddings'].tolist())


In [None]:
# draw t in a scatter plot and put names on each point
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(t[:, 0], t[:, 1], t[:, 2])

# Label every point with its city name; rows of t follow the order of `cities`
for (x, y, z), name in zip(t, cities):
    ax.text(x, y, z, name)

# Title and axis labels so the figure can be read on its own
ax.set_title('City embeddings projected onto 3 principal components')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
plt.show()
