In [None]:
!pip install openai==1.31.0
!pip install streamlit==1.32.2

In [None]:
import os

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable when it is set, so the
# real secret does not have to be saved inside the notebook. The placeholder string
# stays as the fallback, so editing it by hand keeps working as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your api key"))

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval

# Path of the source recipe CSV (under /content/drive/MyDrive).
datafile_path = "/content/drive/MyDrive/OpenAIAPIApp_Lecture/data/embeddings_data/157recipes.csv"

# The file is decoded as Shift-JIS when it is read.
df = pd.read_csv(datafile_path, encoding='shift-jis')
# Bare expression: shows the loaded table as the cell output.
df

In [None]:
# Build one combined text per recipe from the first three columns of df
# (dish name, ingredients, cooking steps, per the labels used below).
# Iterating over the columns themselves covers every row, however many the CSV has,
# instead of assuming exactly 157. Non-string cells are formatted with str().
texts = [
    f'料理名: {name}, 材料: {ingredients}, 調理手順: {steps}'
    for name, ingredients, steps in zip(df.iloc[:, 0], df.iloc[:, 1], df.iloc[:, 2])
]

# Reuse df's index so the new column lines up row-for-row in the concat below.
text_col = pd.Series(texts, index=df.index, name='結合後')
con_df = pd.concat([df, text_col], axis=1)
# Bare expression: shows the combined table as the cell output.
con_df

In [None]:
# Request one embedding per combined recipe text and store it next to the recipe.
embeddings = []

# Iterate over the column values directly: this covers every row regardless of the
# row count and does not depend on the index labels being 0..N-1.
for text in con_df['結合後']:
  response = client.embeddings.create(
    model="text-embedding-3-small",
    input=text
  )
  embeddings.append(response.data[0].embedding)

# Reuse con_df's index so each embedding stays attached to its own recipe row.
embedding_col = pd.Series(embeddings, index=con_df.index, name='embedding')
embedding_df = pd.concat([con_df, embedding_col], axis=1)
# Persist the table (including the embedding column) for the Streamlit app to load.
embedding_df.to_csv('/content/drive/MyDrive/OpenAIAPIApp_Lecture/data/embeddings_data/157recipes_embedding.csv', encoding='utf8', index_label=False)

In [None]:
%%writefile app.py
from openai import OpenAI
import streamlit as st
import numpy as np
import pandas as pd
from ast import literal_eval
from drive.MyDrive.OpenAIAPIApp_Lecture.utils.embeddings_utils import get_embedding, cosine_similarity

import os

# Take the key from the OPENAI_API_KEY environment variable when it is set, so the
# real secret does not have to be written into app.py. The placeholder string stays
# as the fallback, so editing it by hand keeps working as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your api key"))
datafile_path = '/content/drive/MyDrive/OpenAIAPIApp_Lecture/data/embeddings_data/157recipes_embedding.csv'
embedding_df = pd.read_csv(datafile_path)
# Each cell of the embedding column is parsed with literal_eval and then converted
# to a NumPy array so it can be used in similarity calculations.
embedding_df['embedding'] = embedding_df.embedding.apply(literal_eval).apply(np.array)

def search_recipes(embedding_df, recipe_description, n=3):
    """Return the rows of ``embedding_df`` most similar to a text description.

    Args:
        embedding_df: DataFrame with an ``embedding`` column holding one vector
            per row.
        recipe_description: Query text; it is embedded with ``get_embedding``
            using the ``text-embedding-3-small`` model.
        n: Maximum number of rows to return.

    Returns:
        A new DataFrame containing the columns of ``embedding_df`` plus a
        ``similarity`` column, sorted by descending similarity and limited to
        the first ``n`` rows. ``embedding_df`` itself is not modified.
    """
    recipe_embedding = get_embedding(
        recipe_description,
        model="text-embedding-3-small"
    )
    similarities = embedding_df["embedding"].apply(
        lambda x: cosine_similarity(x, recipe_embedding)
    )

    # assign() works on a copy, so a search never leaves a "similarity" column
    # behind on the caller's DataFrame.
    results = (
        embedding_df.assign(similarity=similarities)
        .sort_values("similarity", ascending=False)
        .head(n)
    )
    return results

# Streamlit UI: a title, a keyword box, and a button that triggers the search.
st.title("レシピ検索アプリケーション")
user_input = st.text_input("検索したいレシピのキーワードを入力してください:")

if st.button("送信"):
    if not user_input.strip():
        # An empty query cannot be embedded, so ask for input instead of calling the API.
        st.warning("検索キーワードを入力してください。")
    else:
        recipe = search_recipes(embedding_df, user_input, n=3)
        # Loop over the rows actually returned; there may be fewer than three.
        for i in range(len(recipe)):
            # Columns 0-2 are read by position as dish name, ingredients, cooking steps.
            st.text_area(f"レシピ{i+1}", f"料理名: {recipe.iloc[i, 0]}\n材料: {recipe.iloc[i, 1]}\n調理手順: {recipe.iloc[i, 2]}")

In [None]:
# Launch app.py with `streamlit run`, then expose the app publicly via localtunnel
!streamlit run app.py & sleep 3 && npx localtunnel --port 8501

In [None]:
from sklearn.cluster import KMeans

# Group the recipe embeddings into a fixed number of clusters with k-means.
n_clusters = 4

# Stack the per-row embedding vectors into a single 2-D array (one row per recipe).
matrix = np.vstack(embedding_df["embedding"].values)

# Construct and fit in one expression; fit() returns the estimator itself.
kmeans = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    random_state=42,
).fit(matrix)

# Store each recipe's cluster id alongside its row.
labels = kmeans.labels_
embedding_df["Cluster"] = labels

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Project the embedding matrix down to two dimensions with t-SNE for plotting.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(matrix)

# Split the projected points into x and y coordinate lists.
x = list(vis_dims2[:, 0])
y = list(vis_dims2[:, 1])
x_coords = np.array(x)
y_coords = np.array(y)

# One colour per cluster id, in cluster-id order.
cluster_colors = ["purple", "green", "red", "blue"]
for category, color in enumerate(cluster_colors):
    in_cluster = embedding_df.Cluster == category
    xs = x_coords[in_cluster]
    ys = y_coords[in_cluster]

    # Faint dots for the cluster members, then an "x" marker at their mean position.
    plt.scatter(xs, ys, color=color, alpha=0.3)
    plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)
plt.title("Clusters identified visualized in language 2d using t-SNE")

In [None]:
# For each cluster, ask the chat model for a common theme and print a few sample recipes.
rec_per_cluster = 5

for i in range(n_clusters):
    print(f"Cluster {i} Theme:", end=" ")

    # Rows assigned to this cluster. The "Cluster" column lives on embedding_df
    # (the original df never receives it), so both the prompt text and the samples
    # below must be taken from embedding_df.
    cluster_rows = embedding_df[embedding_df.Cluster == i]

    recipes = "\n".join(
        cluster_rows["結合後"].values
    )

    messages = [
        {"role": "user", "content": f'以下の料理に共通する要素は何？\n\n料理の概要:\n"""\n{recipes}\n"""\n\nテーマ:'}
    ]

    response = client.chat.completions.create(
        model="gpt-4",
        messages=messages,
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0)
    print(response.choices[0].message.content.replace("\n", ""))

    # sample() raises when asked for more rows than exist, so cap the sample size
    # at the cluster size for small clusters.
    sample_size = min(rec_per_cluster, len(cluster_rows))
    sample_cluster_rows = cluster_rows.sample(sample_size, random_state=0)
    for j in range(sample_size):
        print(sample_cluster_rows['料理名'].values[j], end=": ")
        print(sample_cluster_rows['材料'].values[j], end=": ")
        # Only the first 70 characters of the cooking steps are printed.
        print(sample_cluster_rows['調理手順'].str[:70].values[j])

    print("-" * 100)