In [1]:
import os
import textwrap

from dotenv import load_dotenv
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.schema.messages import SystemMessage
from langchain.vectorstores.pgvector import PGVector

from question_generator_model import SingleSelection, Code, AnyQuestion, FillInBlank, MultipleSelection

# Load environment variables (API keys etc.) from the project's .env file.
load_dotenv("/home/jupyteach-msda/jupyteach-ai/.env")

# Name of the PGVector collection queried by get_vectorstore().
COLLECTION_NAME = "documents"

# SECURITY: this connection string embeds a database password. Supply it via
# the DB_CONNECTION environment variable (for example in the .env file loaded
# above) instead of relying on the literal. The literal is kept only as a
# fallback so existing setups keep working; remove it and rotate the password
# before sharing or committing this notebook.
DB_CONNECTION = os.getenv(
    "DB_CONNECTION",
    "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres",
)


def get_vectorstore():
    """Return a PGVector store for COLLECTION_NAME at DB_CONNECTION.

    Documents are embedded with a default-configured ``OpenAIEmbeddings``
    instance created on every call.
    """
    return PGVector(
        embedding_function=OpenAIEmbeddings(),
        connection_string=DB_CONNECTION,
        collection_name=COLLECTION_NAME,
    )

In [2]:
#Function that builds the conversational retrieval agent from a system prompt and model settings
def create_chain(
        system_message_text,
        temperature=0,
        model_name="gpt-3.5-turbo-1106",
        model_kwargs=None,
        verbose=False,
    ):
    """Build a conversational retrieval agent over the course vector store.

    Parameters
    ----------
    system_message_text : str
        Text used as the content of the agent's system message.
    temperature : float, optional
        Sampling temperature passed to ``ChatOpenAI``.
    model_name : str, optional
        Model name passed to ``ChatOpenAI``.
    model_kwargs : dict or None, optional
        Extra arguments forwarded to ``ChatOpenAI`` as ``model_kwargs``.
        ``None`` (the default) means
        ``{"response_format": {"type": "json_object"}}``.
    verbose : bool, optional
        Passed to both the chat model and the agent.

    Returns
    -------
    The object returned by ``create_conversational_retrieval_agent``.
    """
    if model_kwargs is None:
        # Build a fresh dict per call; a dict literal in the signature would be
        # one shared object that any callee could mutate for all later calls.
        model_kwargs = {"response_format": {"type": "json_object"}}

    # step 1: create the retriever backed by the vector store
    retriever = get_vectorstore().as_retriever()

    # step 2: create llm
    llm = ChatOpenAI(temperature=temperature, model_name=model_name, model_kwargs=model_kwargs, verbose=verbose)

    # step 3: wrap the retriever as a tool the agent can call
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )
    tools = [tool]

    # step 4: create system message from the text passed in as an argument
    system_message = SystemMessage(content=system_message_text)

    # return the chain
    return create_conversational_retrieval_agent(
        llm=llm,
        tools=tools,
        verbose=verbose,
        system_message=system_message
    )

In [4]:
#Function to check if the retrieval is happening
def report_on_message(msg):
    """Print a short summary of an agent response.

    ``msg`` must support ``msg["intermediate_steps"]`` (anything with a
    length) and ``msg["output"]``. The first printed line says whether any
    intermediate steps were recorded; the output follows, then blank lines.
    """
    steps_taken = msg["intermediate_steps"]
    print("any intermediate_steps?: ", len(steps_taken) > 0)

    answer = msg["output"]
    print("output:\n", answer)

    # Trailing blank lines separate this report from whatever is printed next.
    print("\n\n")

In [12]:
#Function that returns the system prompt, including the format instructions for the requested question type
def create_system_prompt(pydantic_object):
    """Return the system prompt text for the given question model.

    The shared instructions below end with a ``{format_instructions}``
    placeholder, which is filled with the format instructions that
    ``PydanticOutputParser`` produces for ``pydantic_object``.
    """
    # Format instructions for the requested question model.
    format_instructions = PydanticOutputParser(
        pydantic_object=pydantic_object
    ).get_format_instructions()

    prompt_template = textwrap.dedent("""
    You are a smart, helpful teaching assistant chatbot named AcademiaGPT.

    You are an expert Python programmer with 15+ year experience and have used all the most popular
    libraries for data analysis, machine learning, and artificial intelligence.

    You assist professors that teach courses about Python, data science, and machine learning
    to college students.

    Your task is to help professors produce practice questions to help students solidify 
    their understanding of specific topics

    In your conversations with a professor you  will be given a topic (string) and an
    expected difficulty level (integer)
    
    The difficulty will be a number between 1 and 3, with 1 corresponding to a request 
    for an easy question, and 3 for the most difficult question.

    If the difficulty or the topic is not mentioned by the professor in the message for you to generate a question, use the difficulty or topic or both from the previous question generated.

    You must always generate questions that have more than one option as solution for the MultipleSelection question type.

    
    Occasionaly the professor may ask you to do something like produce a similar question,
    or try again and make it more difficult or easy. You need to assist the professor with the same.

    You are encouraged to use any tools available to look up relevant informati on, onlyif necessary.

    If the professor ask for more than one question in a single message, you need to apologize and inform that you can only generate one 
    question at a time. You need to also ask the professor to put in a new message with the topic and difficulty to generate a new question.

    Your responses must always exactly match the specified JSON format with no extra words or content.

    You must always produce exactly one JSON object.

    Your responses should always be consistent.
    
    {format_instructions}
    """)

    return prompt_template.format(format_instructions=format_instructions)

In [6]:
#Function that takes the input, calls the retriever agent, and returns the parsed output
def generate_and_parse_question(
    pydantic_model,
    query,
    temperature=0.1,
    model_name="gpt-4-1106-preview",
    verbose=True,
    rag_chain=None,
):
    """Ask the retrieval agent for a question and parse it into ``pydantic_model``.

    Parameters
    ----------
    pydantic_model
        Question model class; it drives both the system prompt's format
        instructions and the parsing of the agent's output.
    query : str
        The professor's request (topic and difficulty).
    temperature, model_name, verbose
        Forwarded to ``create_chain`` when a new agent is built. The defaults
        are the values this function previously hard-coded.
    rag_chain : optional
        An agent previously returned by ``create_chain`` to reuse. When
        ``None`` (the default) a new agent is built for this call, so it starts
        without any earlier conversation. Pass the same agent on successive
        calls to let follow-up requests ("make it more difficult") see the
        previous exchange. The caller must make sure a reused agent was created
        with a system prompt for the same ``pydantic_model``.

    Returns
    -------
    The result of ``PydanticOutputParser.parse`` on the agent's ``"output"``.

    Raises
    ------
    Whatever ``PydanticOutputParser.parse`` raises when the output does not
    match the model; this function does not catch it.
    """
    if rag_chain is None:
        rag_chain = create_chain(
            create_system_prompt(pydantic_model),
            temperature=temperature,
            verbose=verbose,
            model_name=model_name,
        )

    response = rag_chain(query)
    report_on_message(response)  # print a summary of what was produced

    parser = PydanticOutputParser(pydantic_object=pydantic_model)
    return parser.parse(response["output"])

In [13]:
# Ask for a MultipleSelection question on pandas groupby at difficulty 2.
# NOTE: in the recorded output below the model appears to have echoed the JSON
# schema rather than a question instance, so parsing raised a ValidationError.
generate_and_parse_question(MultipleSelection, "pandas groupby with difficulty 2")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "description": "Question where user is presented a prompt in `question_text` and \na list of `choices`. They are supposed to provide all answers that\napply (`solution`)\n\nAll questions must have a minimum of 3 options\n\nExamples\n--------\n{\n  \"question_text\": \"Using the pandas `groupby` method, which of the following operations can you perform after grouping a DataFrame by one or more columns?\",\n  \"difficulty\": 2,\n  \"topics\": [\"pandas\", \"data analysis\", \"groupby\"],\n  \"choices\": [\n    \"Calculate the sum of each group\",\n    \"Find the maximum value in each group\",\n    \"Perform a linear regression on each group\",\n    \"Apply a custom function to each group\"\n  ],\n  \"solution\": [0, 1, 3]\n}",
  "properties": {
    "question_text": {
      "description": "The main text of the question. Markdown formatted",
      "title": "Question Text",
      "type": "string"
    },
    "difficulty": {
   

ValidationError: 5 validation errors for MultipleSelection
question_text
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
difficulty
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
topics
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
choices
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
solution
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing

In [11]:
# Ask for an easy (difficulty 1) MultipleSelection question on scale-free
# networks. In the recorded run this parsed successfully without any tool call.
generate_and_parse_question(MultipleSelection, "topic scale-free network, difficulty 1 ")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "question_text": "What are the characteristics of a scale-free network?",
  "difficulty": 1,
  "topics": ["scale-free network"],
  "choices": [
    "The network follows a power-law degree distribution",
    "All nodes have the same number of connections",
    "The network is immune to random failures",
    "Most nodes have a few connections, while a few nodes have many connections"
  ],
  "solution": [0, 3]
}[0m

[1m> Finished chain.[0m
any intermediate_steps?:  False
output:
 {
  "question_text": "What are the characteristics of a scale-free network?",
  "difficulty": 1,
  "topics": ["scale-free network"],
  "choices": [
    "The network follows a power-law degree distribution",
    "All nodes have the same number of connections",
    "The network is immune to random failures",
    "Most nodes have a few connections, while a few nodes have many connections"
  ],
  "solution": [0, 3]
}





What are the characteristics of a scale-free network?

- [x] The network follows a power-law degree distribution
- [ ] All nodes have the same number of connections
- [ ] The network is immune to random failures
- [x] Most nodes have a few connections, while a few nodes have many connections


In [98]:
# Ask for a hard (difficulty 3) SingleSelection question on convex optimization.
# In the recorded run the agent invoked the search_course_content tool first.
generate_and_parse_question(SingleSelection, "convex optimization, difficulty 3")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_course_content` with `convex optimization`


[0m[36;1m[1;3m[Document(page_content="these model these problems are that they use the same statistical model F of a given data. There are some tools that allow us to solve the direct problem which again relates to simulating artificial data and these tools in fact turn out to be helpful when solving the inverse problem or learning something about the parameter vector theta. We're going to be continuing in this general framework of a statistical model the direct problem in the inverse problem as we talk today about the lake model of unemployment that we've been working with and we want to make sure that as we work through the materials in this lecture that you keep this overall framework in mind and our goal today will be to use the lake model and its parameters and inferences we can make about its parameters given real world data from the BLS and it will help

In the context of convex optimization, which of the following statements are true regarding the use of a loss function?

- [x] A loss function allows us to move from a qualitative notion of how good a model fits the data to a very quantitative version.
- [ ] A loss function is only useful for qualitative comparisons between models.
- [ ] The mean squared error (MSE) is an example of a loss function that can be used to evaluate the performance of a model.


In [100]:
# Ask for two questions in one message, with no difficulty given.
# NOTE: in the recorded run the parsed payload lacked every SingleSelection
# field, so parsing raised a ValidationError.
generate_and_parse_question(SingleSelection, "2 more questions on convex optimization")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_course_content` with `convex optimization`


[0m[36;1m[1;3m[Document(page_content="these model these problems are that they use the same statistical model F of a given data. There are some tools that allow us to solve the direct problem which again relates to simulating artificial data and these tools in fact turn out to be helpful when solving the inverse problem or learning something about the parameter vector theta. We're going to be continuing in this general framework of a statistical model the direct problem in the inverse problem as we talk today about the lake model of unemployment that we've been working with and we want to make sure that as we work through the materials in this lecture that you keep this overall framework in mind and our goal today will be to use the lake model and its parameters and inferences we can make about its parameters given real world data from the BLS and it will help

ValidationError: 5 validation errors for SingleSelection
question_text
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
difficulty
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
topics
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
choices
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
solution
  Field required [type=missing, input_value={'description': 'Question... 'choices', 'solution']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing

In [87]:
# Follow-up style request with no explicit difficulty.
# NOTE(review): generate_and_parse_question builds a new agent for this call,
# so presumably no earlier question is available for "more difficult" to refer to.
generate_and_parse_question(MultipleSelection, "Give me a more difficult question on the convex optimization")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_course_content` with `convex optimization`


[0m[36;1m[1;3m[Document(page_content="these model these problems are that they use the same statistical model F of a given data. There are some tools that allow us to solve the direct problem which again relates to simulating artificial data and these tools in fact turn out to be helpful when solving the inverse problem or learning something about the parameter vector theta. We're going to be continuing in this general framework of a statistical model the direct problem in the inverse problem as we talk today about the lake model of unemployment that we've been working with and we want to make sure that as we work through the materials in this lecture that you keep this overall framework in mind and our goal today will be to use the lake model and its parameters and inferences we can make about its parameters given real world data from the BLS and it will help

Which of the following statements are true regarding the use of loss functions in convex optimization?

- [x] Loss functions allow for a quantitative comparison between models.
- [ ] Loss functions are only applicable for linear models.
- [x] The mean squared error (MSE) is an example of a loss function used in convex optimization.
- [ ] Loss functions can only be used when the model parameters are known with certainty.


In [109]:
# Ask for two questions in one message.
# NOTE: in the recorded run the model replied with an {"apology": ...} object
# (the system prompt tells it to apologize), which has none of the
# MultipleSelection fields, so parsing raised a ValidationError.
generate_and_parse_question(MultipleSelection, "Give me two more questions on convex optimization, difficulty 1.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "apology": "I apologize, but I can only generate one question at a time. Please provide the topic and difficulty for the next question you would like to generate."
}[0m

[1m> Finished chain.[0m
any intermediate_steps?:  False
output:
 {
  "apology": "I apologize, but I can only generate one question at a time. Please provide the topic and difficulty for the next question you would like to generate."
}





ValidationError: 5 validation errors for MultipleSelection
question_text
  Field required [type=missing, input_value={'apology': 'I apologize,...ould like to generate.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
difficulty
  Field required [type=missing, input_value={'apology': 'I apologize,...ould like to generate.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
topics
  Field required [type=missing, input_value={'apology': 'I apologize,...ould like to generate.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
choices
  Field required [type=missing, input_value={'apology': 'I apologize,...ould like to generate.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing
solution
  Field required [type=missing, input_value={'apology': 'I apologize,...ould like to generate.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.4/v/missing