In [1]:
from dotenv import load_dotenv
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.messages import SystemMessage
from langchain.vectorstores.pgvector import PGVector
import textwrap
from langchain.output_parsers import PydanticOutputParser
from question_generator_model import SingleSelection, Code, AnyQuestion, FillInBlank, MultipleSelection

import os

# Load environment variables from the project's .env file before any of them are read below.
# NOTE(review): presumably this supplies the OpenAI API key used by the LangChain clients -- confirm.
load_dotenv("/home/jupyteach-msda/jupyteach-ai/.env")

# Collection name handed to PGVector in get_vectorstore().
COLLECTION_NAME = "documents"

# Postgres connection string handed to PGVector. It is read from the
# PGVECTOR_CONNECTION_STRING environment variable (which may itself come from the
# .env file loaded above) so credentials do not have to live in the notebook.
# The literal is kept only as a fallback so existing setups keep working.
# TODO: move the value into .env, rotate the password, and drop the fallback.
DB_CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres",
)


def get_vectorstore():
    """Return a PGVector store configured with OpenAI embeddings.

    The store uses the module-level ``COLLECTION_NAME`` and ``DB_CONNECTION``
    settings. A new ``OpenAIEmbeddings`` instance and a new ``PGVector``
    instance are created on every call.
    """
    embedder = OpenAIEmbeddings()
    return PGVector(
        connection_string=DB_CONNECTION,
        collection_name=COLLECTION_NAME,
        embedding_function=embedder,
    )

In [2]:
def create_chain(
        system_message_text,
        temperature=0,
        model_name="gpt-3.5-turbo-1106",
        model_kwargs=None,
        verbose=False,
    ):
    """Build a conversational retrieval agent over the course-content vector store.

    Parameters
    ----------
    system_message_text : str
        Text used as the agent's system message.
    temperature : float, optional
        Sampling temperature passed to ``ChatOpenAI``.
    model_name : str, optional
        Chat model name passed to ``ChatOpenAI``.
    model_kwargs : dict, optional
        Extra keyword arguments passed to ``ChatOpenAI``. When omitted, JSON
        mode is requested via ``{"response_format": {"type": "json_object"}}``.
    verbose : bool, optional
        Verbosity flag, applied to both the LLM and the agent.

    Returns
    -------
    The object returned by ``create_conversational_retrieval_agent``. Elsewhere
    in this notebook it is called with a query string, and its result is read
    through the ``"output"`` and ``"intermediate_steps"`` keys.
    """
    # Use a fresh dict per call: a dict literal as the default would be shared
    # between calls, and a caller-supplied dict should not be handed over by
    # reference to the LLM wrapper.
    if model_kwargs is None:
        model_kwargs = {"response_format": {"type": "json_object"}}
    else:
        model_kwargs = dict(model_kwargs)

    # step 1: create the retriever over the vector store, then the llm
    retriever = get_vectorstore().as_retriever()
    llm = ChatOpenAI(temperature=temperature, model_name=model_name, model_kwargs=model_kwargs, verbose=verbose)

    # step 2: wrap the retriever as a tool the agent can call
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )
    tools = [tool]

    # step 3: create system message from the text passed in as an argument
    system_message = SystemMessage(content=system_message_text)

    # return the chain; honour the caller's verbosity choice instead of always
    # forcing the agent to be silent
    return create_conversational_retrieval_agent(
        llm=llm,
        tools=tools,
        verbose=verbose,
        system_message=system_message
    )

In [3]:
#Function to check if the retrieval is happening
def report_on_message(msg):
    """Print a short summary of an agent response.

    ``msg`` must provide the keys ``"intermediate_steps"`` and ``"output"``.
    The first printed line says whether any intermediate steps were recorded,
    followed by the raw output and some blank lines as a separator.
    """
    has_steps = len(msg["intermediate_steps"]) > 0
    print("any intermediate_steps?: ", has_steps)

    final_text = msg["output"]
    print("output:\n", final_text)

    print("\n\n")

In [12]:
from pydantic import ValidationError
import json
from json.decoder import JSONDecodeError

# Function that takes the input, calls the retriever agent, and returns the parsed output
def generate_and_parse_question(pydantic_model, query):
    """Ask the retrieval agent for a question and parse its answer.

    A new agent is built on every call from ``create_chain`` and
    ``create_system_prompt``; both must already be defined in the kernel when
    this function is called.

    Parameters
    ----------
    pydantic_model
        Model class used both to build the system prompt and to parse the
        agent's output.
    query : str
        Request sent to the agent.

    Returns
    -------
    The object produced by ``PydanticOutputParser.parse`` when parsing
    succeeds; otherwise the result of ``json.loads`` on the raw output.

    Raises
    ------
    Exception
        Any error raised while running the agent is propagated unchanged.
    json.JSONDecodeError
        If parsing fails and the raw output is not valid JSON either.
    """
    rag_chain = create_chain(
        create_system_prompt(pydantic_model),
        temperature=0.1,
        verbose=True,
        model_name="gpt-4-1106-preview",
    )

    # Run the agent outside the try block. If it fails there is no output to
    # fall back on, so the real error should surface instead of being replaced
    # by an UnboundLocalError from a handler that reads `response`.
    response = rag_chain(query)
    raw_output = response["output"]
    #report_on_message(response)  # print a summary of what was produced

    try:
        parser = PydanticOutputParser(pydantic_object=pydantic_model)
        return parser.parse(raw_output)
    except ValidationError as ve:
        print(f"Pydantic validation error: {ve}")
    except JSONDecodeError as json_error:
        print(f"JSON decoding error: {json_error}")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Single shared fallback: hand back the plain decoded JSON. If the output
    # is not valid JSON this raises JSONDecodeError to the caller.
    return json.loads(raw_output)

In [5]:
# Function that returns the system prompt, including the format instructions for the requested question type
def create_system_prompt(pydantic_object):
    """Return the system prompt for the question-generation agent.

    The prompt consists of a fixed instruction text with three example
    questions, followed by the format instructions that
    ``PydanticOutputParser`` produces for ``pydantic_object``.

    Parameters
    ----------
    pydantic_object
        Model class passed to ``PydanticOutputParser`` to obtain the format
        instructions.

    Returns
    -------
    str
        The fully formatted system prompt.
    """
    # The template is a raw string on purpose: the JSON examples contain `\n`
    # escapes that must reach the model as backslash-n (valid JSON), not as
    # real line breaks. Real line breaks would also introduce unindented lines
    # and make textwrap.dedent a no-op.
    # Literal braces are doubled because the template goes through str.format.
    common_system_prompt = textwrap.dedent(r"""
    You are a smart, helpful teaching assistant chatbot named AcademiaGPT.

    You are an expert Python programmer and have used all the most popular
    libraries for data analysis, machine learning, and artificial intelligence.

    You assist professors that teach courses about Python, data science, and machine learning
    to college students.

    Your task is to help professors produce practice questions to help students solidify 
    their understanding of specific topics

    In your conversations with a professor, you  will be given a topic and an
    expected difficulty level (integer) or (string). 

    If the difficulty is not given assume the difficulty level to be the previously used difficulty level.

    

    Here is an example question with difficulty 1

      {{
        "question_text":"How would you reverse the order of the following list in python\n\n```python\na = [1, 'hi', 3, 'there']\n```\n\nand save the result in an object `b`",
        "starting_code":"a = [1, 'hi', 3, 'there']\n# Reverse the order of the list and save the result in an object called b",
        "solution":"a = [1, 'hi', 3, 'there']\nb = a[::-1]",
        "topics":["python","programming","lists"],
        "difficulty":1,
        "setup_code":"# none",
        "test_code":"assert b == ['there', 3, 'hi', 1]"
        }}


    Here is an example question with difficulty 2

    {{"question_text": "Given a list of stock prices `prices` for consecutive days, write a for loop that calculates the total return of the stock over the period. The total return is defined as the percentage change from the first day to the last day. Store the result in a variable named `total_return`.",
    "starting_code": "prices = [100, 102, 105, 110, 108]\n# Calculate the total return and store it in total_return",
    "solution": "prices = [100, 102, 105, 110, 108]\nfirst_price = prices[0]\nlast_price = prices[-1]\ntotal_return = ((last_price - first_price) / first_price) * 100",
    "topics": ["for loops", "asset pricing"],
    "difficulty": 2,
    "setup_code": "# No setup code required",
    "test_code": "assert abs(total_return - ((prices[-1] - prices[0]) / prices[0]) * 100) < 1e-6"
    }}



    Here is an example question with difficulty 3

    {{
        "question_text":"You are given a 3 dimensional numpy array as specified below:\n\n```\nA = np.array([[[0.0, 1.0], [2.0, 3.0]], [[4.0, 5.0], [6.0, 7.0]]])\n```\n\nCreate a variable `idx` (define as a tuple) that you could use to select the `4.0` element of this array.\n\nFor example,\n\n```\nidx = (0, 0, 0)\n```\n\nwould select the `0.0` element of the array.",
        "starting_code":"idx = (0, 0, 0)  # Fill this in with the correct index",
        "solution":"idx = (1, 0, 0)",
        "topics":["numpy"],
        "difficulty":3,
        "setup_code":"import numpy as np\n\nA = np.array([[[0.0, 1.0], [2.0, 3.0]], [[4.0, 5.0], [6.0, 7.0]]])",
        "test_code":"assert A[idx] == A[1, 0, 0]"
      }}
    
    If you are asked to give similar, easier, or another question, the user wants you to use the same topic and difficulty 
    level that you used to generate the previous question.

    You are encouraged to use any tools available to look up relevant information, only
    if necessary.

    Your responses must always exactly match the specified JSON format with no extra words or content.

    You must always produce exactly one JSON object.
    
    {format_instructions}
    """)

    parser = PydanticOutputParser(pydantic_object=pydantic_object)
    return common_system_prompt.format(format_instructions=parser.get_format_instructions())

In [14]:
# Request a `Code` question on pandas groupby at difficulty 2.
generate_and_parse_question(Code, "topic: pandas groupby and difficulty:2")

Given a DataFrame `df` with columns `['year', 'month', 'sales']`, write a Pandas command to group the data by `year` and calculate the total `sales` for each year. Store the result in a DataFrame named `yearly_sales`.

```python
import pandas as pd

# Assume df is predefined
# Group by 'year' and calculate the total sales for each year
yearly_sales = ...
```

**Solution**

```python
import pandas as pd

df = pd.DataFrame({'year': [2018, 2019, 2018, 2019, 2020], 'month': [1, 1, 2, 2, 1], 'sales': [200, 250, 150, 300, 400]})
yearly_sales = df.groupby('year')['sales'].sum().reset_index()
```

**Test Suite**

```python
import pandas as pd
df = pd.DataFrame({'year': [2018, 2019, 2018, 2019, 2020], 'month': [1, 1, 2, 2, 1], 'sales': [200, 250, 150, 300, 400]})

import pandas as pd

df = pd.DataFrame({'year': [2018, 2019, 2018, 2019, 2020], 'month': [1, 1, 2, 2, 1], 'sales': [200, 250, 150, 300, 400]})
yearly_sales = df.groupby('year')['sales'].sum().reset_index()

assert yearly_sales.equals(pd.DataFrame({'year': [2018, 2019, 2020], 'sales': [350, 550, 400]}))
```

In [15]:
# Request a `MultipleSelection` question on for loops and asset pricing at difficulty 1.
generate_and_parse_question(MultipleSelection, "topic: for loops and asset pricing and difficulty 1")

Consider a list `prices` representing the closing stock prices for a company over 5 consecutive days. Write a for loop to calculate the sum of all price changes between each consecutive day. Store the result in a variable named `price_changes_sum`.

- [x] The sum is calculated by subtracting each day's price from the previous day's price.
- [ ] The sum is the difference between the last and first price in the list.
- [ ] The sum is calculated by adding up all the prices in the list.


In [8]:
# Request a `SingleSelection` question on probability at difficulty 3.
generate_and_parse_question(SingleSelection, "topic: probability and difficulty 3")

any intermediate_steps?:  False
output:
 {
  "question_text": "You are given the probability distribution of a discrete random variable X, represented by the following Python dictionary:\n\n```python\nP_X = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}\n```\n\nWhich of the following expressions would correctly calculate the expected value (mean) of X?",
  "difficulty": 3,
  "topics": ["probability", "expected value", "python"],
  "choices": [
    "sum([x * prob for x, prob in P_X.items()])",
    "sum([prob / x for x, prob in P_X.items() if x != 0])",
    "sum([x * P_X[x] for x in P_X])",
    "sum([P_X[x] / x for x in P_X if x != 0])"
  ],
  "solution": 2
}





You are given the probability distribution of a discrete random variable X, represented by the following Python dictionary:

```python
P_X = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
```

Which of the following expressions would correctly calculate the expected value (mean) of X?

- [ ] sum([x * prob for x, prob in P_X.items()])
- [ ] sum([prob / x for x, prob in P_X.items() if x != 0])
- [x] sum([x * P_X[x] for x in P_X])
- [ ] sum([P_X[x] / x for x in P_X if x != 0])


In [16]:
# Request a `FillInBlank` question on probability at difficulty 1.
generate_and_parse_question(FillInBlank, "topic: probability and difficulty 1")

Suppose you have a list of outcomes from a fair six-sided die roll: `[1, 2, 3, 4, 5, 6]`. Write a Python function `calculate_probability` that takes a list of outcomes and a specific outcome, and returns the probability of that outcome. Use the function to calculate the probability of rolling a `4`.

```python
def calculate_probability(outcomes, outcome):
    # Your code here
    probability = ___X
    return probability

# Calculate the probability of rolling a 4
outcomes = [1, 2, 3, 4, 5, 6]
result = calculate_probability(outcomes, ___X)
```

**Solution**

[1 / len(outcomes), 4]
```

**Rendered Solution**

```python
def calculate_probability(outcomes, outcome):
    # Your code here
    probability = 1 / len(outcomes)
    return probability

# Calculate the probability of rolling a 4
outcomes = [1, 2, 3, 4, 5, 6]
result = calculate_probability(outcomes, 4)
```

**Test Suite**

```python
# No setup code required

def calculate_probability(outcomes, outcome):
    # Your code here
    probability = 1 / len(outcomes)
    return probability

# Calculate the probability of rolling a 4
outcomes = [1, 2, 3, 4, 5, 6]
result = calculate_probability(outcomes, 4)

assert calculate_probability([1, 2, 3, 4, 5, 6], 4) == 1/6
```

In [14]:
# Follow-up request for a harder `FillInBlank` question, with no topic given.
# NOTE(review): generate_and_parse_question builds a new agent on every call, so this
# request presumably has no access to the previous question's topic or difficulty -- confirm.
generate_and_parse_question(FillInBlank, "Give more difficult question")

any intermediate_steps?:  False
output:
 {
  "question_text": "Given a pandas DataFrame `df` with a column 'A' containing integers, write a function `calculate_stats` that takes the DataFrame as input and returns a dictionary with keys 'mean', 'median', and 'std' corresponding to the mean, median, and standard deviation of the values in column 'A'. Use the appropriate pandas DataFrame methods to calculate these statistics.",
  "difficulty": 4,
  "topics": ["pandas", "data analysis", "statistics"],
  "starting_code": "import pandas as pd\n\ndef calculate_stats(df):\n    # Your code here\n    stats = {'mean': ___X, 'median': ___X, 'std': ___X}\n    return stats",
  "solution": ["df['A'].mean()", "df['A'].median()", "df['A'].std()"],
  "setup_code": "import pandas as pd\n\ndf = pd.DataFrame({'A': [1, 2, 3, 4, 5]})",
  "test_code": "assert calculate_stats(df) == {'mean': df['A'].mean(), 'median': df['A'].median(), 'std': df['A'].std()}"
}





Given a pandas DataFrame `df` with a column 'A' containing integers, write a function `calculate_stats` that takes the DataFrame as input and returns a dictionary with keys 'mean', 'median', and 'std' corresponding to the mean, median, and standard deviation of the values in column 'A'. Use the appropriate pandas DataFrame methods to calculate these statistics.

```python
import pandas as pd

def calculate_stats(df):
    # Your code here
    stats = {'mean': ___X, 'median': ___X, 'std': ___X}
    return stats
```

**Solution**

[df['A'].mean(), df['A'].median(), df['A'].std()]
```

**Rendered Solution**

```python
import pandas as pd

def calculate_stats(df):
    # Your code here
    stats = {'mean': df['A'].mean(), 'median': df['A'].median(), 'std': df['A'].std()}
    return stats
```

**Test Suite**

```python
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3, 4, 5]})

import pandas as pd

def calculate_stats(df):
    # Your code here
    stats = {'mean': df['A'].mean(), 'median': df['A'].median(), 'std': df['A'].std()}
    return stats

assert calculate_stats(df) == {'mean': df['A'].mean(), 'median': df['A'].median(), 'std': df['A'].std()}
```

In [15]:
# Request a `FillInBlank` question on regression at difficulty 2.
generate_and_parse_question(FillInBlank, "Give me a difficulty 2 question on topic regression")

any intermediate_steps?:  False
output:
 {
  "question_text": "Suppose you have a dataset with one feature `X` and a target variable `Y`. You want to fit a simple linear regression model `Y = a * X + b`. Given the numpy arrays `X` and `Y`, fill in the blanks to calculate the slope `a` and intercept `b` of the regression line using the least squares method.",
  "difficulty": 2,
  "topics": ["regression", "linear algebra", "numpy"],
  "starting_code": "import numpy as np\n\nX = np.array(___X)\nY = np.array(___X)\n\n# Add a column of ones to X to account for the intercept\nX_ones = np.column_stack((np.ones(X.shape[0]), X))\n\n# Calculate the coefficients using the least squares method\na, b = np.linalg.lstsq(X_ones, Y, rcond=None)[0]",
  "solution": ["[1, 2, 3, 4, 5]", "[2, 3, 5, 7, 11]"],
  "setup_code": "import numpy as np",
  "test_code": "assert np.allclose(a, 2.2) and np.allclose(b, -0.6)"
}





Suppose you have a dataset with one feature `X` and a target variable `Y`. You want to fit a simple linear regression model `Y = a * X + b`. Given the numpy arrays `X` and `Y`, fill in the blanks to calculate the slope `a` and intercept `b` of the regression line using the least squares method.

```python
import numpy as np

X = np.array(___X)
Y = np.array(___X)

# Add a column of ones to X to account for the intercept
X_ones = np.column_stack((np.ones(X.shape[0]), X))

# Calculate the coefficients using the least squares method
a, b = np.linalg.lstsq(X_ones, Y, rcond=None)[0]
```

**Solution**

[[1, 2, 3, 4, 5], [2, 3, 5, 7, 11]]
```

**Rendered Solution**

```python
import numpy as np

X = np.array([1, 2, 3, 4, 5])
Y = np.array([2, 3, 5, 7, 11])

# Add a column of ones to X to account for the intercept
X_ones = np.column_stack((np.ones(X.shape[0]), X))

# Calculate the coefficients using the least squares method
a, b = np.linalg.lstsq(X_ones, Y, rcond=None)[0]
```

**Test Suite**

```python
import numpy as np

import numpy as np

X = np.array([1, 2, 3, 4, 5])
Y = np.array([2, 3, 5, 7, 11])

# Add a column of ones to X to account for the intercept
X_ones = np.column_stack((np.ones(X.shape[0]), X))

# Calculate the coefficients using the least squares method
a, b = np.linalg.lstsq(X_ones, Y, rcond=None)[0]

assert np.allclose(a, 2.2) and np.allclose(b, -0.6)
```

In [11]:
# Request a `SingleSelection` question on regression at difficulty 3.
generate_and_parse_question(SingleSelection, "Give me a difficulty 3 question on regression")

any intermediate_steps?:  True
output:
 {
  "question_text": "In the context of regression analysis, what does the Mean Squared Error (MSE) loss function measure?",
  "difficulty": 3,
  "topics": ["regression", "loss functions", "MSE"],
  "choices": [
    "MSE measures the average squared difference between the estimated values and the actual value.",
    "MSE measures the average absolute difference between the estimated values and the actual value.",
    "MSE measures the proportion of variance explained by the model.",
    "MSE measures the correlation between the estimated values and the actual values."
  ],
  "solution": 0
}





In the context of regression analysis, what does the Mean Squared Error (MSE) loss function measure?

- [x] MSE measures the average squared difference between the estimated values and the actual value.
- [ ] MSE measures the average absolute difference between the estimated values and the actual value.
- [ ] MSE measures the proportion of variance explained by the model.
- [ ] MSE measures the correlation between the estimated values and the actual values.


In [18]:
# Follow-up request for a harder `SingleSelection` question, with no topic given.
# NOTE(review): generate_and_parse_question builds a new agent on every call, so this
# request presumably has no access to the previous question's topic or difficulty -- confirm.
generate_and_parse_question(SingleSelection, "More difficult question")

any intermediate_steps?:  False
output:
 {
    "question_text": "Given a DataFrame `df` with a column 'A' containing integers, write a lambda function that will be applied to column 'A' using the `apply` method to find the square of each number. Store the result in a new column 'B'.",
    "difficulty": 3,
    "topics": ["pandas", "dataframe", "lambda functions"],
    "choices": [
        "df['B'] = df['A'].apply(lambda x: x ** 2)",
        "df['B'] = df['A'].apply(lambda x: x * x)",
        "df['B'] = df.apply(lambda x: x['A'] ** 2, axis=1)",
        "df['B'] = df['A'] * df['A']"
    ],
    "solution": 0
}





Given a DataFrame `df` with a column 'A' containing integers, write a lambda function that will be applied to column 'A' using the `apply` method to find the square of each number. Store the result in a new column 'B'.

- [x] df['B'] = df['A'].apply(lambda x: x ** 2)
- [ ] df['B'] = df['A'].apply(lambda x: x * x)
- [ ] df['B'] = df.apply(lambda x: x['A'] ** 2, axis=1)
- [ ] df['B'] = df['A'] * df['A']
