# In [2]:
import pandas as pd
import openai
from sklearn.metrics.pairwise import cosine_similarity

def add_similarity(df, given_embedding):
    """
    Add a "similarity" column with the cosine similarity of each row's embedding to a query.

    The DataFrame is modified in place and is also returned.

    :param df: The DataFrame containing an "embedding" column. Each cell is either a sequence
        of floats or its text form such as "[0.1, 0.2, ...]" (e.g. after a round trip through CSV).
    :param given_embedding: The query embedding (a sequence of floats) every row is compared with.
    :return: The same DataFrame with the "similarity" column added or overwritten.
    """
    def parse_embedding(embedding):
        # Embeddings stored as text look like "[0.1, 0.2, ...]"; convert them to floats.
        if isinstance(embedding, str):
            return [float(x) for x in embedding.strip('[]').split(',')]
        return embedding

    vectors = [parse_embedding(embedding) for embedding in df['embedding']]

    if vectors:
        # Score all rows with a single pairwise call instead of one scikit-learn call
        # per row. The result has shape (n_rows, 1), so keep its only column.
        scores = cosine_similarity(vectors, [given_embedding])[:, 0]
    else:
        # cosine_similarity rejects empty input; an empty frame just gets an empty column.
        scores = []

    df['similarity'] = pd.Series(scores, index=df.index, dtype=float)
    return df

def top_similar_entries(df, x=3):
    """
    Build one text block from the x most similar "Synthesis Information" entries.

    Rows are ranked by the "similarity" column, highest first. When x is 2 or more, every
    entry is preceded by a header line "--- SECTION <n>: <name> ---", where <name> is the
    entry's first line with the text "MOF Name: " removed. When x is below 2 the entries
    are returned without headers.

    :param df: The DataFrame containing the "similarity" and "Synthesis Information" columns.
    :param x: The number of top entries to return. Default is 3.
    :return: A single string with the selected entries joined by new lines.
    """
    ranked = df.sort_values(by="similarity", ascending=False)
    best_entries = ranked["Synthesis Information"].head(x).tolist()

    # Fewer than two requested entries: nothing to tell apart, so no section headers.
    if x < 2:
        return "\n".join(best_entries)

    labelled_entries = []
    for section_number, text in enumerate(best_entries, start=1):
        first_line = text.split("\n")[0]
        mof_name = first_line.replace("MOF Name: ", "")
        header = f"--- SECTION {section_number}: {mof_name} ---"
        labelled_entries.append(header + "\n" + text)

    return "\n".join(labelled_entries)


def chatbot(question, past_user_messages=None, initial_context=None):
    """
    Answer a MOF synthesis question with gpt-3.5-turbo, using retrieved synthesis records as context.

    When no context is supplied, the first question in the history is embedded, compared
    against the embeddings in "Synthesis_Embedding.csv", and the three most similar
    "Synthesis Information" entries are printed and used as the context. When a context is
    supplied, it is used as given and the CSV file is not read.

    Only user questions are sent to the model after the system prompt; earlier answers are
    not part of the request.

    :param question: The new user question.
    :param past_user_messages: Earlier user questions, oldest first. The new question is
        appended to this list in place. Default is a new empty list.
    :param initial_context: Context text returned by an earlier call, or None to look it up.
    :return: A tuple (answer, initial_context, past_user_messages) that can be passed back
        in for the next turn.
    """
    if past_user_messages is None:
        past_user_messages = []

    past_user_messages.append(question)

    if initial_context is None:
        # The embedding database is only needed to build the context, so it is loaded
        # here rather than on every call.
        file_name = "Synthesis_Embedding.csv"  # synthesis information database with embedding
        df_with_emb = pd.read_csv(file_name)

        # Find the context based on the first question
        first_question = past_user_messages[0]
        question_return = openai.Embedding.create(model="text-embedding-ada-002", input=first_question)
        question_emb = question_return['data'][0]['embedding']

        df_with_emb_sim = add_similarity(df_with_emb, question_emb)
        num_paper = 3
        top_n_synthesis_str = top_similar_entries(df_with_emb_sim, num_paper)

        print("I have found below synthesis conditions and paper information based on your first question:")
        print("\n" + top_n_synthesis_str)
        initial_context = top_n_synthesis_str

    message_history = [
        {
            "role": "system",
            "content": "You are a chemistry assistant that specifically handles questions related to MOF synthesis conditions based on the papers you have reviewed. Answer the question using the provided context. If the question is not relevant to the context or the MOF is not mentioned in the context, respond with 'Based on the information available from the MOF paper I have read so far, I cannot provide a reliable answer to this question. Please revise your question.'\n\nContext:\n" + initial_context
        },
    ]

    # Replay every user question so far, the new one last.
    message_history.extend(
        {"role": "user", "content": user_question} for user_question in past_user_messages
    )

    # No sampling parameters are passed, so the API defaults apply.
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=message_history
    )

    answer = response.choices[0].message["content"]
    return answer, initial_context, past_user_messages




# NOTE(review): placeholder credential; replace it before running. Prefer loading the
# key from configuration or the environment over committing it to source.
openai.api_key = "Add Your OpenAI API KEY Here."

# Example usage: a two-turn conversation.
# The first call passes only the question, so chatbot() looks up the context itself and
# returns it together with the list of user questions.
first_question = "What is the linker used to synthesize MOF-520?"
answer, initial_context, past_user_messages = chatbot(first_question)
print(answer)

# The follow-up passes the returned history and context back in, so the context found
# for the first question is reused instead of being looked up again.
follow_up_question = "Well, so how to make this MOF?"
answer, _, past_user_messages = chatbot(follow_up_question, past_user_messages, initial_context)
print(answer)