In [1]:
import os
from typing import List

from dotenv import load_dotenv
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema.messages import SystemMessage
from langchain.vectorstores.pgvector import PGVector

In [2]:
## Load Vector Store
# Load variables from a local .env file into the process environment before any
# setting below is read (presumably also where the OpenAI key lives -- confirm).
load_dotenv()

# Connection string for the Postgres/pgvector database.
# Set PGVECTOR_CONNECTION_STRING (e.g. in .env) to point at another database
# without editing the notebook.
# NOTE(review): the fallback keeps the notebook runnable as before, but it embeds a
# password in source; move it to .env and drop the literal before sharing this file.
DB_CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres",
)
# Name of the pgvector collection holding the embedded course documents.
COLLECTION_NAME = "documents"

def get_vectorstore():
    """Return a PGVector store bound to the configured database and collection.

    The store is constructed with a fresh ``OpenAIEmbeddings`` instance as its
    embedding function, ``COLLECTION_NAME`` as the collection and
    ``DB_CONNECTION`` as the connection string.
    """
    return PGVector(
        connection_string=DB_CONNECTION,
        collection_name=COLLECTION_NAME,
        embedding_function=OpenAIEmbeddings(),
    )

In [3]:
# Build the vector store once for the notebook and expose it as a retriever.
# `retriever` is read as a module-level global inside `create_chain`.
db = get_vectorstore()
retriever = db.as_retriever()
def create_chain(
    system_message_text,
    model_name="gpt-3.5-turbo-1106",
    temperature=0,
    verbose=False,
):
    """Build a conversational retrieval agent over the course-content retriever.

    Parameters
    ----------
    system_message_text : str
        Text used as the content of the agent's system message.
    model_name : str, optional
        Chat model passed to ``ChatOpenAI``. Defaults to the model this
        notebook was originally written against.
    temperature : float, optional
        Sampling temperature passed to ``ChatOpenAI``. Defaults to 0.
    verbose : bool, optional
        Forwarded to ``create_conversational_retrieval_agent``. Defaults to False.

    Returns
    -------
    The object returned by ``create_conversational_retrieval_agent``; the
    notebook calls it with a query string and reads ``result["output"]``.

    Notes
    -----
    Uses the module-level ``retriever``, which must be defined before this
    function is called.
    """
    ## Step 1: Create LLM
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)

    ## Step 2: Create Retriever Tool
    # The name and description are what the agent sees when deciding whether to search.
    search_tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )

    ## Step 3: Create System Message from the Text Passed in as an Argument
    system_message = SystemMessage(content=system_message_text)

    ## Return the Chain
    return create_conversational_retrieval_agent(
        llm=llm,
        tools=[search_tool],
        verbose=verbose,
        system_message=system_message,
        handle_parsing_errors=True,
    )

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory

# System prompt for the question-generating agent. It names the three per-message
# inputs (topic, difficulty, type) and, for each supported question type
# (SingleSelection, MultipleSelection, Code, FillInBlank), the JSON schema to follow.
# NOTE(review): this is a regular (non-raw) string, so the `\n` and `\"` sequences in
# the pasted schemas are processed by Python; the model receives real newlines and
# bare quotes rather than JSON escapes. Confirm whether a raw string was intended.
system_prompt = """You are a smart, helpful teaching assistant chatbot named AcademiaGPT.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given three inputs

topic: string
difficulty: integer
type: string
Your task is to produce different practice questions to help students solidify their understanding of the given python and/or pandas topic.

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

If type is SingleSelection, then the below instructions should be followed to display the output

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"question_text": {"title": "Question Text", "description": "The text of a question. Markdown formatted. Can include Python snippets for reference", "type": "string"}, "choices": {"title": "Choices", "description": "Set of possible answers to this question. Exactly one should be correct", "type": "array", "items": {"type": "string"}}, "correct_answer": {"title": "Correct Answer", "description": "Index of the correct choice", "type": "integer"}}, "required": ["question_text", "choices", "correct_answer"]}
```
If type is MultipleSelection, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "Question where user is presented a prompt in `question_text` and \na list of `choices`. They are supposed to provide all answers that\napply (`solution`)\n\nAll questions must have a minimum of 3 options\n\nExamples\n--------\n{\n  \"question_text\": \"What are some possible consequences of a learning rate that is too large?\",\n  \"difficulty\": 2,\n  \"topics\": [\"optimization\", \"gradient descent\"],\n  \"choices\": [\n    \"The algorithm never converges\",\n    \"The algorithm becomes unstable\",\n    \"Learning is stable, but very slow\"\n  ],\n  \"solution\": [0, 1]\n}", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "choices": {"description": "A list of markdown formatted strings representing the options for the student. Minimum length of 3.", "items": {"type": "string"}, "title": "Choices", "type": "array"}, "solution": {"description": "List of indices into choices representing correct answers. Zero based", "items": {"type": "integer"}, "title": "Solution", "type": "array"}}, "required": ["question_text", "difficulty", "topics", "choices", "solution"]}
```
If type is Code, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "    Question where user is presented a prompt in `question_text` and \n    given `starting_code`. They are then supposed to modify the `starting_code`\n    to complete the question. After doing so the code will be verified by running\n    the following template as if it were python code:\n\n    ```python\n    {setup_code}\n\n    {student_response}\n\n    {test_code}\n    ```\n\n    The test code should have `assert` statements that verify the correctness of\n    the `student_response`\n\n    Examples\n    --------\n    {\n      \"question_text\": \"How would you create a `DatetimeIndex` starting on January 1, 2022 and ending on June 1, 2022 with the values taking every hour in between?\n\nSave this to a variable called `dates`\",\n      \"difficulty\": 2,\n      \"topics\": [\"pandas\", \"dates\"],\n      \"starting_code\": \"dates = ...\",\n      \"solution\": \"dates = pd.date_range(\"2022-01-01\", \"2022-06-01\", freq=\"h\")\",\n      \"setup_code\": \"import pandas as pd\",\n      \"test_code\": \"assert dates.sort_values()[0].strftime(\"%Y-%m-%d\") == \"2022-01-01\"\nassert dates.sort_values()[-1].strftime(\"%Y-%m-%d\") == \"2022-06-01\"\nassert dates.shape[0] == 3625\"\n    }\n    ", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "starting_code": {"description": "Starting code that will be the initial contents of the student's text editor. 
Used to provide scaffold/skeleton code", "title": "Starting Code", "type": "string"}, "solution": {"description": "The correct code", "title": "Solution", "type": "string"}, "setup_code": {"description": "Any code that needs to execute prior to the student code to ensure any libraries are imported and any variables are set up", "title": "Setup Code", "type": "string"}, "test_code": {"description": "Code containing `assert` statements that verifies the correctness of the student's response", "title": "Test Code", "type": "string"}}, "required": ["question_text", "difficulty", "topics", "starting_code", "solution", "setup_code", "test_code"]}
```

If type is FillInBlank, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "    Question where user is presented a prompt in `question_text` and then\n    \"blanks\" (represented by `___X` in the source).\n    The student must provide one string per blank. Correctness is evaluated\n   \n\n    \n   \n    \n\n    \n\n    \n    ```\n\n    There must be at least one `___X` (one blank) in `question`\n\n\n    Examples\n    --------\n    {\n      \"question_text\": \"Suppose you have already executed the following code:\n\n```python\nimport numpy as np\n\nA = np.array([[1, 2], [3, 4]])\nb = np.array([10, 42])\n```\n\nFill in the blanks below to solve the matrix equation $Ax = b$ for $x$\n\",\n      \"difficulty\": 2,\n      \"topics\": [\"linear algebra\", \"regression\", \"numpy\"],\n      \"starting_code\": \"from scipy.linalg import ___X\n\nx = ___X(A, ___X)\",\n      \"solution\": [\"solve\", \"solve\", \"b\"],\n      \"setup_code\": \"import numpy as np\n\nA = np.array([[1, 2], [3, 4]])\nb = np.array([10, 42])\n\",\n      \"test_code\": \"assert np.allclose(x, [22, -6])\"\n    }\n    ", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "starting_code": {"description": " The starting code for the student. Must contain at least one `___X` (three underscores and capital `X`), which represents a blank that will be filled in by the student.", "title": "Starting Code", "type": "string"}, "solution": {"description": "A list of strings representing the correct code to place in the blanks. 
Length must match number of `___X` that appear in `starting_code`.", "items": {"type": "string"}, "title": "Solution", "type": "array"}, "setup_code": {"description": "Any code that needs to execute prior to the student code to ensure any libraries are imported and any variables are set up", "title": "Setup Code", "type": "string"}, "test_code": {"description": "Code containing `assert` statements that verifies the correctness of the student's response", "title": "Test Code", "type": "string"}}, "required": ["question_text", "difficulty", "topics", "starting_code", "solution", "setup_code", "test_code"]}
```
You should display the solution to the questions as well. If the solution is a python code, ensure that only the code content is displayed and the json content is hidden.
Occasionally the user may ask you to do something like produce a similar question, or try again and make it more difficult, or produce 5 more on that topic. 

You should follow the user request, but make sure you always output in a consistent way.

"""

# Unused alternative, kept for reference: a hand-built chat prompt with an explicit
# history placeholder plus a buffer memory. Nothing below is executed.
#prompt = ChatPromptTemplate.from_messages([
#    SystemMessage(content=system_prompt), # The persistent system prompt 
#    MessagesPlaceholder(variable_name="chat_history"), # Where the memory will be stored.
#    HumanMessagePromptTemplate.from_template("{human_input}"), # Where the human input will be injected
#])

#memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Build the agent queried by the following cells, using `system_prompt` as its system message.
example_chat = create_chain(system_prompt)

In [5]:
# Ask for a difficulty-2 single-selection question on pandas groupby and show the reply text.
result = example_chat(
    "For the topic pandas groupby, give me question of difficulty 2 which is single selection"
)
print(result["output"])


### Question
What is the most basic and often most used form of the groupby method in pandas to split on the values of a single column?

1. Splitting on the values of a single column
2. Aggregating certain groups of values
3. Constructing groups of data using the values from multiple columns

### Solution
1. Splitting on the values of a single column

This question tests the understanding of the basic usage of the groupby method in pandas to split data based on the values of a single column.


In [6]:
# Ask for a difficulty-2 code question on plotting with Python and show the reply text.
result = example_chat(
    "For the topic plotting with python, give me question of difficulty 2 which is code"
)
print(result["output"])

### Question
Create a plot using the matplotlib library to visualize the relationship between two variables x and y. Use blue color for the plot line and label the x-axis as "X-axis" and the y-axis as "Y-axis".

### Starting Code
```python
import matplotlib.pyplot as plt
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 3, 5, 7, 11])

# Your code here
```

### Solution
```python
import matplotlib.pyplot as plt
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 3, 5, 7, 11])

plt.plot(x, y, color='blue')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.show()
```


In [7]:
# Ask for a difficulty-1 fill-in-the-blank question on SQL and show the reply text.
result = example_chat(
    "For the topic sql, give me question of difficulty 1 which is FillInTheBlank"
)
print(result["output"])

### Question
Suppose we have a table named "employees" with columns "name", "age", and "department". Write an SQL query to select all the columns from the "employees" table.

### Solution
```sql
SELECT * FROM employees;
```


In [8]:
# Ask for a difficulty-1 multiple-selection question on SQL and show the reply text.
result = example_chat(
    "For the topic sql, give me question of difficulty 1 which is Multipleselection"
)
print(result["output"])

### Question
Which of the following statements about SQL are true?
- SQL stands for Structured Query Language.
- SQL is a programming language used to communicate with databases.
- SQL is a standard for a programming language to communicate with databases.

### Choices
1. SQL stands for Structured Query Language.
2. SQL is a programming language used to communicate with databases.
3. SQL is a standard for a programming language to communicate with databases.

### Solution
- SQL stands for Structured Query Language.
- SQL is a programming language used to communicate with databases.
- SQL is a standard for a programming language to communicate with databases.


In [9]:
# Ask for a difficulty-2 code question on the "lake model plus data" topic and show the reply text.
result = example_chat(
    "For the topic lake model plus data, give me question of difficulty 2 which is Code"
)
print(result["output"])

### Question
How would you use the Lake Model to tie its parameters to US data?

### Difficulty: 2

### Starting Code
```python
# Use the Lake Model to tie its parameters to US data
# Your code here
```


### Solution
```python
# Use the Lake Model to tie its parameters to US data
# Your solution code here
```


In [11]:
# Ask for a difficulty-2 fill-in-the-blanks question on pandas groupby and show the reply text.
result = example_chat(
    "For the topic pandas groupby, give me question of difficulty 2 which is fill in the blanks"
)
print(result["output"])

### Question
Suppose you have a DataFrame `df` and you want to group the data using the values from the column "A". Which method of the DataFrame would you use to achieve this?

### Difficulty: 2

### Starting Code
```python
# Suppose you have a DataFrame df and you want to group the data using the values from the column "A". 
# Which method of the DataFrame would you use to achieve this?
df._______("A")
```


### Solution
```python
# Suppose you have a DataFrame df and you want to group the data using the values from the column "A". 
# Which method of the DataFrame would you use to achieve this?
df.groupby("A")
```
