### Importy i ustawienia

In [1]:
import html
import pickle

import numpy as np
import pandas as pd
from IPython.display import display, HTML
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

import ourfuncs as funk


# Name of the embedding model; it is also embedded in the pickle file name below.
EMBEDDING_MODEL     = "text-embedding-3-small"
SAVE_EMBDEDED_PKL   = f'Saved/textembed ({EMBEDDING_MODEL}).pkl'
# Read the API key from key.txt. The context manager closes the file handle,
# and strip() removes any surrounding whitespace (including "\r\n" line endings),
# not just "\n".
with open("key.txt", "r") as key_file:
    API_KEY         = key_file.read().strip()
TOP_X               = 5  # how many of the best matches to return

client  = OpenAI(api_key=API_KEY)
# NOTE(review): the file has a .pkl extension, so funk.load_data presumably
# unpickles it -- only load files from a trusted source. TODO confirm.
data    = funk.load_data(SAVE_EMBDEDED_PKL)
df      = pd.DataFrame(data, columns=['Document', 'Page', 'Block', "WordCount", 'Text', 'Embedding'])

### Pytanie i za-embedowanie pytania

In [2]:
def get_embedding(text):
    """Return the embedding of ``text``.

    Calls ``client.embeddings.create`` with ``text`` as the input and
    ``EMBEDDING_MODEL`` as the model, then returns the ``embedding``
    attribute of the first element of the response's ``data``.
    """
    request_args = {"model": EMBEDDING_MODEL, "input": text}
    api_response = client.embeddings.create(**request_args)
    # Only one input is sent, so only the first result is read.
    first_item = api_response.data[0]
    return first_item.embedding

# The question to search for in the embedded texts.
user_question = "za co odpowiada zarząd wspólnoty?"
# Embed the question with get_embedding so it can be compared with the text embeddings.
user_embedded = get_embedding(user_question)
# Persist the question embedding through the project helper.
funk.save_data(user_embedded, "Saved/Q1.pkl")

Data saved to Saved/Q1.pkl


### Kalkulacja trafności tekstu
Czyli porównanie embeddingu pytania do embeddingu każdego z tekstów

In [3]:
# Put the embeddings into array form for sklearn's cosine_similarity:
# one row per text, and the question as a single-row array.
text_embeddings     = np.array(list(df['Embedding']))
question_embedding  = np.array(user_embedded)[np.newaxis, :]

# Compare the question against every text; row 0 holds the question's scores.
similarity_matrix   = cosine_similarity(question_embedding, text_embeddings)
similarity_scores   = similarity_matrix[0]

# Store the scores on the DataFrame as percentages.
df['Similarity']    = 100 * similarity_scores

### Formatowanie odpowiedzi

In [4]:
# Keep only the TOP_X rows with the highest similarity.
df_sort             = df.sort_values(by='Similarity', ascending=False)
top_X               = df_sort.head(TOP_X)
formatted_answers   = []  # not used in this cell; kept in case other code reads it
html_content        = ""

# HTML template used to render a single result (nicer than plain text).
html_template = """
<div style="margin-bottom: 20px; background-color: #000; color: #fff; padding: 10px;">
    <p><strong>{similarity:.2f}% similarity</strong> in Document {document}, Page {page}, Block {block}</p>
    <blockquote style="margin: 10px 0; padding: 10px; background-color: #333; border-left: 5px solid #ccc; color: #fff;">
        {text}
    </blockquote>
</div>
"""

# Build one combined HTML answer out of the top results.
for index, row in top_X.iterrows():
    # Fill the template with the current row's details. The document name and
    # the text are HTML-escaped so characters such as "<" or "&" coming from
    # the source documents are shown literally instead of being parsed as markup.
    formatted_html  = html_template.format(
        similarity  =row['Similarity'],
        document    =html.escape(str(row['Document'])),
        page        =row['Page'],
        block       =row['Block'],
        # Escape first, then turn newlines into <br> so the line breaks survive.
        text        =html.escape(row['Text']).replace("\n", "<br>"))
    # Append this result to the combined output.
    html_content    += formatted_html

### Odpowiedź

In [5]:
# Print the question as plain text, then render the HTML accumulated in
# html_content as rich output in the notebook.
print(user_question)
display(HTML(html_content))

za co odpowiada zarząd wspólnoty?
