In [45]:
import pandas as pd
import pickle

# Name of the OpenAI embedding model; it is also part of the cache file name below.
EMBEDDING_MODEL = "text-embedding-3-small"
# Pickle file holding the stored text embeddings for that model.
SAVE_EMBDEDED_PKL   = f'Saved/textembed ({EMBEDDING_MODEL}).pkl'
# Read the API key inside a context manager so the file handle is closed
# deterministically. strip() removes the trailing newline as well as any stray
# carriage return or spaces that would otherwise end up in the key.
with open("key.txt", "r") as key_file:
    API_KEY = key_file.read().strip()

def save_data(data, filename):
    """Pickle ``data`` into the file at ``filename`` and print a confirmation.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as out_file:
        pickle.dump(data, out_file)
    print(f"Data saved to {filename}")

def load_data(filename):
    """Return the object unpickled from the file at ``filename``.

    The file is read in binary mode and closed before the value is returned.
    """
    with open(filename, mode='rb') as in_file:
        payload = pickle.load(in_file)
    return payload
 
# Load the pickled records and lay them out as a table with one named column per field.
data = load_data(SAVE_EMBDEDED_PKL)
df = pd.DataFrame(
    data=data,
    columns=['Document', 'Page', 'Block', "WordCount", 'Text', 'Embedding'],
)

In [46]:
from openai import OpenAI
# OpenAI API client for this notebook, authenticated with the key held in API_KEY.
client = OpenAI(api_key=API_KEY)

def get_embedding(text, model=None):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: Value passed as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to use. When omitted, the
            module-level ``EMBEDDING_MODEL`` is looked up at call time, so
            existing ``get_embedding(text)`` calls behave exactly as before.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    response = client.embeddings.create(
        input=text,
        model=EMBEDDING_MODEL if model is None else model,
    )
    # For a single input string the response carries one result; its vector
    # is the first (and only) entry of `data`.
    return response.data[0].embedding

# Embed the user's question and pickle the resulting vector to disk.
user_question = "Jaką biżuterie kupują ludzie na mazowszu?"
user_embedded = get_embedding(text=user_question)
save_data(data=user_embedded, filename="Saved/Q1.pkl")


Data saved to Saved/Q1.pkl


In [47]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle


# Reload the question embedding that was pickled to disk.
user_embedded = load_data("Saved/Q1.pkl")

# cosine_similarity works on 2-D inputs: one row per stored text block, and a
# single-row matrix for the question.
text_embeddings = np.array(df['Embedding'].to_list())
question_embedding = np.expand_dims(np.array(user_embedded), axis=0)

# The result has one row (the question); take it to get one score per text block.
similarity_scores = cosine_similarity(X=question_embedding, Y=text_embeddings)[0]

# Store the scores on the frame as percentages, then rank the best matches first.
df['Similarity'] = 100 * similarity_scores
df_sorted = df.sort_values('Similarity', ascending=False)

In [48]:
from html import escape

from IPython.display import display, HTML

# Keep only the five best matches; this relies on 'df_sorted' already being
# ordered by 'Similarity' in descending order.
top_5 = df_sorted.head(5)

# HTML template used to render a single answer.
html_template = """
<div style="margin-bottom: 20px; background-color: #000; color: #fff; padding: 10px;">
    <p><strong>{similarity:.2f}% similarity</strong> in Document {document}, Page {page}, Block {block}</p>
    <blockquote style="margin: 10px 0; padding: 10px; background-color: #333; border-left: 5px solid #ccc; color: #fff;">
        {text}
    </blockquote>
</div>
"""

# Rendered HTML fragment for each answer, in ranking order.
formatted_answers = []

for index, row in top_5.iterrows():
    formatted_html = html_template.format(
        similarity=row['Similarity'],
        document=escape(str(row['Document'])),
        page=row['Page'],
        block=row['Block'],
        # Escape first, then add the <br> tags: the stored text is arbitrary
        # document content, so a literal '<' or '&' must be displayed as-is
        # rather than parsed as markup.
        text=escape(str(row['Text'])).replace("\n", "<br>"),
    )
    formatted_answers.append(formatted_html)

# Join the fragments once instead of growing a string inside the loop.
html_content = "".join(formatted_answers)

# Show the question, then the rendered answers beneath it.
print(user_question)
display(HTML(html_content))




Jaką biżuterie kupują ludzie na mazowszu?
