In [10]:
import ast
import os

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

In [11]:
# Load the pre-computed embeddings table; the first CSV column is used as the index.
df = pd.read_csv('processed/embeddings.csv', index_col=0)
print("start apply")
# The CSV stores each embedding as the text of a Python list. ast.literal_eval
# parses literals only, so unlike eval() it cannot execute code that happens to
# be present in the file. Each parsed list is then converted to a numpy array.
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)
print("end apply")
df.head()

start apply
end apply


Unnamed: 0,text,n_tokens,embeddings
0,atalog.utdallas.edu 2013 graduate admission re...,464,"[-0.006768117658793926, -0.018370606005191803,..."
1,Academic Good Standing Registration in the gra...,490,"[0.017427491024136543, -0.014574022963643074, ..."
2,Paying Fees as a Part of Registration A studen...,465,"[9.117217996390536e-05, -0.014734090305864811,..."
3,The following guidelines describe whether or n...,501,"[0.0068955812603235245, -0.008433591574430466,..."
4,Such courses with an earned grade of 'B' or be...,493,"[-0.005029195919632912, -0.016063299030065536,..."


In [None]:
# Read the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be saved in the notebook; the 'APIKEY' placeholder remains only
# as the fallback when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'APIKEY')

In [None]:

def create_context(
        question, df, max_len=1800, size="ada"
):
    """
    Build a context string for a question from the closest texts in the dataframe.

    The question is embedded with 'text-embedding-ada-002', every row of `df` is
    ranked by cosine distance to that embedding, and row texts are collected
    closest-first until the running token budget would exceed `max_len`.

    Parameters:
        question: text to embed and match against.
        df: DataFrame with 'embeddings', 'n_tokens' and 'text' columns. It is
            not modified; ranking is done on a copy.
        max_len: token budget for the returned context.
        size: not used by this function; kept so existing callers keep working.

    Returns:
        The selected texts joined by "\\n\\n###\\n\\n", or "" if even the closest
        text does not fit in the budget.
    """

    # Get the embedding for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Cosine distance from the question to every stored embedding (smaller = closer)
    distances = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    # Rank on a copy: writing a 'distances' column into the caller's dataframe
    # would leave hidden state behind that changes with every question asked.
    ranked = df.assign(distances=distances).sort_values('distances', ascending=True)

    returns = []
    cur_len = 0

    # Walk the rows closest-first and add texts until the context is too long
    for _, row in ranked.iterrows():

        # Each text costs its token count plus 4 (presumably an allowance for
        # the separator between texts -- TODO confirm)
        cur_len += row['n_tokens'] + 4

        # Stop at the first text that would overflow the budget
        if cur_len > max_len:
            break

        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
        df,
        model="text-davinci-003",
        question="What classes do I need to take to graduate?",
        max_len=1800,
        size="ada",
        debug=False,
        max_tokens=150,
        stop_sequence=None
):
    """
    Answer a question with a completion model, grounded in the closest texts of `df`.

    The context comes from create_context(question, df, max_len=..., size=...)
    and is placed in a prompt that tells the model to fall back to a fixed
    apology when the context does not contain the answer.

    Parameters:
        df: dataframe handed to create_context.
        model: completion model name.
        question: the question to answer.
        max_len, size: forwarded to create_context.
        debug: when true, print the retrieved context before querying the model.
        max_tokens: completion length limit.
        stop_sequence: forwarded as the completion `stop` argument.

    Returns:
        The stripped text of the first choice, or "" if building the prompt,
        the request, or reading the response raises (the exception is printed,
        not re-raised).
    """
    context = create_context(question, df, max_len=max_len, size=size)

    # Debug mode echoes the retrieved context (not the model response)
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Deterministic sampling settings: temperature 0, no penalties
        request = dict(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"Unfortunately, I have not been trained to answer that question yet (If you would like to help finacially support the text embedding process, please contact the creators after this presentation)\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        completion = openai.Completion.create(**request)
        first_choice = completion["choices"][0]
        return first_choice["text"].strip()
    except Exception as err:
        # Best effort: report the failure and hand back an empty answer
        print(err)
        return ""


################################################################################
### Step 13
################################################################################

def askGPT(question, messages):
    """
    Run one chat turn: retrieve an embeddings-based answer, then ask the chat model.

    The question is first answered with answer_question against the module-level
    `df`. That answer is attached to the user message after a
    "### text embeddings:" marker, the whole history is sent to gpt-3.5-turbo,
    and the model's reply is recorded in the history.

    Parameters:
        question: the user's question.
        messages: chat history as a list of {"role", "content"} dicts. It is
            extended in place with the user message and the model's reply.

    Returns:
        The reply text as a plain string.
    """
    textcompletions = answer_question(df, question=question, debug=False)
    messages.append({"role": "user", "content": f"{question} ### text embeddings: {textcompletions}"})
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    reply = response['choices'][0]['message']
    messages.append({"role": reply['role'], "content": reply['content']})
    # Return the string itself: wrapping it in braces builds a one-element set,
    # which prints as {'...'} instead of the reply.
    return reply['content']
#
# print(answer_question(df, question="How can I start the graduation process?", debug=False))
#
# print(answer_question(df, question="What are the core courses that I have to take?"))

In [None]:
# Conversation history, seeded with the system prompt that sets the advisor persona.
messages = [{"role": "system", "content": "You are an AI academic advisor named GradGuide that has been trained on text embeddings of the UT Dallas catalogs. Use text embeddings to inform your responses"}]
print("Hello, I am GradGuide, your personal AI academic advisor. Can I help you with anything today?")
print('(Type "exit" or "quit" to end the session.)')
while True:
    try:
        myQn = input()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt while waiting for input ends the
        # session cleanly instead of with a traceback.
        break

    stripped = myQn.strip()
    if stripped.lower() in ("exit", "quit"):
        break
    if not stripped:
        # Nothing was asked; wait for the next line rather than querying the API.
        continue

    print("")
    print(askGPT(myQn, messages))