In [17]:
from typing import List
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.docstore.document import Document
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.schema.messages import SystemMessage

In [18]:
## Load Vector Store
load_dotenv()

DB_CONNECTION = "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres"
COLLECTION_NAME = "documents"

def get_vectorstore():
    embeddings = OpenAIEmbeddings()

    db = PGVector(embedding_function=embeddings,
        collection_name=COLLECTION_NAME,
        connection_string=DB_CONNECTION,
    )
    return db

In [19]:
db = get_vectorstore()
retriever = db.as_retriever()
def create_chain(system_message_text):
    ## Step 1: Create LLM
    from langchain.chat_models import ChatOpenAI
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106" )
    
    ## Step 2: Create Retriever Tool
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )
    tools = [tool]

    ## Step 3: Create System Message from the Text Passed in as an Argument
    system_message = SystemMessage(content=system_message_text)

    ## Return the Chain
    return create_conversational_retrieval_agent(
        llm = llm, 
        tools=tools, 
        verbose = False, 
        system_message = system_message,
        handle_parsing_errors=True
    )

In [33]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory

system_prompt_1 = """You are a smart, helpful teaching assistant chatbot named AcademiaGPT.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer
type: string
Your task is to produce different practice questions to help students solidify their understanding of the given python and/or pandas topic.

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

If type is SingleSelection, then the below instructions should be followed to display the output

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"question_text": {"title": "Question Text", "description": "The text of a question. Markdown formatted. Can include Python snippets for reference", "type": "string"}, "choices": {"title": "Choices", "description": "Set of possible answers to this question. Exactly one should be correct", "type": "array", "items": {"type": "string"}}, "correct_answer": {"title": "Correct Answer", "description": "Index of the correct choice", "type": "integer"}}, "required": ["question_text", "choices", "correct_answer"]}
```
If type is MultipleSelection, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "Question where user is presented a prompt in `question_text` and \na list of `choices`. They are supposed to provide all answers that\napply (`solution`)\n\nAll questions must have a minimum of 3 options\n\nExamples\n--------\n{\n  \"question_text\": \"What are some possible consequences of a learning rate that is too large?\",\n  \"difficulty\": 2,\n  \"topics\": [\"optimization\", \"gradient descent\"],\n  \"choices\": [\n    \"The algorithm never converges\",\n    \"The algorithm becomes unstable\",\n    \"Learning is stable, but very slow\"\n  ],\n  \"solution\": [0, 1]\n}", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "choices": {"description": "A list of markdown formatted strings representing the options for the student. Minimum length of 3.", "items": {"type": "string"}, "title": "Choices", "type": "array"}, "solution": {"description": "List of indices into choices representing correct answers. Zero based", "items": {"type": "integer"}, "title": "Solution", "type": "array"}}, "required": ["question_text", "difficulty", "topics", "choices", "solution"]}
```
If type is Code, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "    Question where user is presented a prompt in `question_text` and \n    given `starting_code`. They are then supposed to modify the `starting_code`\n    to complete the question. After doing so the code will be verified by running\n    the following template as if it were python code:\n\n    ```python\n    {setup_code}\n\n    {student_response}\n\n    {test_code}\n    ```\n\n    The test code should have `assert` statements that verify the correctness of\n    the `student_response`\n\n    Examples\n    --------\n    {\n      \"question_text\": \"How would you create a `DatetimeIndex` starting on January 1, 2022 and ending on June 1, 2022 with the values taking every hour in between?\n\nSave this to a variable called `dates`\",\n      \"difficulty\": 2,\n      \"topics\": [\"pandas\", \"dates\"],\n      \"starting_code\": \"dates = ...\",\n      \"solution\": \"dates = pd.date_range(\"2022-01-01\", \"2022-06-01\", freq=\"h\")\",\n      \"setup_code\": \"import pandas as pd\",\n      \"test_code\": \"assert dates.sort_values()[0].strftime(\"%Y-%m-%d\") == \"2022-01-01\"\nassert dates.sort_values()[-1].strftime(\"%Y-%m-%d\") == \"2022-06-01\"\nassert dates.shape[0] == 3625\"\n    }\n    ", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "starting_code": {"description": "Starting code that will be the initial contents of the student's text editor. Used to provide scaffold/skeleton code", "title": "Starting Code", "type": "string"}, "solution": {"description": "The correct code", "title": "Solution", "type": "string"}, "setup_code": {"description": "Any code that needs to execute prior to the student code to ensure any libraries are imported and any variables are set up", "title": "Setup Code", "type": "string"}, "test_code": {"description": "Code containing `assert` statements that verifies the correctnessof the student's response", "title": "Test Code", "type": "string"}}, "required": ["question_text", "difficulty", "topics", "starting_code", "solution", "setup_code", "test_code"]}
```

If type is FillInBlank, then the below instructions should be followed to display the output.
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "    Question type where the student is given a main question and then\n    a code block with \"blanks\" (represented by `___X` in the source).\n    The student must provide one string per blank. Correctness is evaluated\n    based on a Python test suite based on the following template:\n\n    \n    ```python\n    {setup_code}\n\n    {code_block_with_blanks_filled_in}\n\n    {test_code}\n    ```\n\n    There must be at least one `___X` (one blank) in `starting_code`\n\n\n    Examples\n    --------\n    {\n      \"question_text\": \"Suppose you have already executed the following code:\n\n```python\nimport numpy as np\n\nA = np.array([[1, 2], [3, 4]])\nb = np.array([10, 42])\n```\n\nFill in the blanks below to solve the matrix equation $Ax = b$ for $x$\n\",\n      \"difficulty\": 2,\n      \"topics\": [\"linear algebra\", \"regression\", \"numpy\"],\n      \"starting_code\": \"from scipy.linalg import ___X\n\nx = ___X(A, ___X)\",\n      \"solution\": [\"solve\", \"solve\", \"b\"],\n      \"setup_code\": \"import numpy as np\n\nA = np.array([[1, 2], [3, 4]])\nb = np.array([10, 42])\n\",\n      \"test_code\": \"assert np.allclose(x, [22, -6])\"\n    }\n    ", "properties": {"question_text": {"description": "The main text of the question. Markdown formatted", "title": "Question Text", "type": "string"}, "difficulty": {"description": "An integer from 1 to 3 representing how difficult the question should be. 1 is easiest. 3 is hardest", "title": "Difficulty", "type": "integer"}, "topics": {"description": "A list of one or more topics that the question is testing", "items": {"type": "string"}, "title": "Topics", "type": "array"}, "starting_code": {"description": " The starting code for the student. Must contain at least one `___X` (three underscores and capital `X`), which represents a blank that will be filled in by the student.", "title": "Starting Code", "type": "string"}, "solution": {"description": "A list of strings representing the correct code to place in the blanks. Length must match number of `___X` that appear in `starting_code`.", "items": {"type": "string"}, "title": "Solution", "type": "array"}, "setup_code": {"description": "Any code that needs to execute prior to the student code to ensure any libraries are imported and any variables are set up", "title": "Setup Code", "type": "string"}, "test_code": {"description": "Code containing `assert` statements that verifies the correctness of the student's response", "title": "Test Code", "type": "string"}}, "required": ["question_text", "difficulty", "topics", "starting_code", "solution", "setup_code", "test_code"]}
```
Occasionaly the user may ask you to do something like produce a similar question, or try again and make it more difficult, or produce 5 more on that topic. You should help with the same.

You should follow the user request, but make sure you always output in a consistent way.

"""

#prompt = ChatPromptTemplate.from_messages([
#    SystemMessage(content=system_prompt), # The persistent system prompt 
#    MessagesPlaceholder(variable_name="chat_history"), # Where the memory will be stored.
#    HumanMessagePromptTemplate.from_template("{human_input}"), # Where the human input will injectd
#])
    
#memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
example_chat_1 = create_chain(system_prompt_1)

In [34]:
result= example_chat_1("For the topic pandas groupby, give me question of difficulty 2 which is single selection")
print(result["output"])

Here's a single selection question for you:

```markdown
**Question**

What is the purpose of the `groupby` method in pandas?

**Choices**
1. To construct groups of data based on the values of a single column.
2. To apply reduction functions to all columns in the DataFrame.
3. To filter out rows based on a specific condition.

```

**Correct Answer:** 1. To construct groups of data based on the values of a single column.

This question tests the understanding of the purpose of the `groupby` method in pandas.


In [35]:
result= example_chat_1("Give a difficulty 3 code question")
print(result["output"])

```json
{
  "question_text": "Write a code to calculate the mean and median of the 'value' column for each group in a DataFrame 'df' grouped by the 'category' column.",
  "difficulty": 3,
  "topics": ["pandas", "groupby", "data analysis"],
  "starting_code": "import pandas as pd\n\ndf = pd.DataFrame({...})\n\n# Your code here",
  "solution": "import pandas as pd\n\ndf = pd.DataFrame({...})\n\nmean_and_median = df.groupby('category')['value'].agg(['mean', 'median'])",
  "setup_code": "import pandas as pd\nimport numpy as np\n\ndf = pd.DataFrame({'category': ['A', 'A', 'B', 'B'], 'value': [1, 2, 3, 4]})",
  "test_code": "assert np.allclose(mean_and_median.loc['A'], [1.5, 1.5])\nassert np.allclose(mean_and_median.loc['B'], [3.5, 3.5])"
}
```


In [27]:
result= example_chat("Give a question on cloud computing")
print(result["output"])

I couldn't find specific content related to cloud computing in the course materials. However, I can create a question based on general knowledge of cloud computing. Here's a multiple selection question on the topic of cloud computing:

```markdown
**Question:**
Which of the following are advantages of cloud computing? Select all that apply.

A) Cost savings
B) Scalability
C) Data security
D) Limited accessibility
E) On-premises infrastructure

**Solution:**
A, B, C

**Difficulty:**
2
```

This question tests the understanding of the advantages of cloud computing and requires the selection of multiple correct options.


In [28]:
result= example_chat("Give me 5 questions on the same topic")
print(result["output"])

Sure! Here are 5 multiple selection questions on the topic of cloud computing:

Question 1:
```markdown
**Question:**
Which of the following are advantages of cloud computing? Select all that apply.

A) Cost savings
B) Scalability
C) Data security
D) Limited accessibility
E) On-premises infrastructure

**Solution:**
A, B, C

**Difficulty:**
2
```

Question 2:
```markdown
**Question:**
Which of the following are characteristics of a public cloud? Select all that apply.

A) Shared infrastructure
B) Limited scalability
C) Accessibility over the internet
D) Dedicated hardware
E) Higher security risks

**Solution:**
A, C

**Difficulty:**
2
```

Question 3:
```markdown
**Question:**
What are potential challenges of cloud migration? Select all that apply.

A) Data transfer costs
B) Compatibility issues
C) Vendor lock-in
D) Reduced scalability
E) Increased security

**Solution:**
A, B, C

**Difficulty:**
3
```

Question 4:
```markdown
**Question:**
Which of the following cloud deployment model

In [29]:
result= example_chat("Give me one question on pandas aggregate functions with difficulty 3")
print(result["output"])

Here's a difficulty 3 question on the topic of pandas aggregate functions:

```markdown
**Question:**
You have a DataFrame `employee_data` containing the following columns: 'EmployeeID', 'Department', 'Salary', and 'Years of Service'. Write a code snippet to calculate the maximum salary and the average years of service for each department using the agg method with custom aggregate functions.

**Starting Code:**
```python
# Your starting code here
```

**Solution:**
```python
import numpy as np
department_summary = employee_data.groupby('Department').agg({'Salary': np.max, 'Years of Service': np.mean})
```

**Setup Code:**
```python
import pandas as pd
import numpy as np
# Assume employee_data is already defined
employee_data = pd.DataFrame({
    'EmployeeID': [1, 2, 3, 4, 5, 6],
    'Department': ['HR', 'IT', 'IT', 'HR', 'Finance', 'Finance'],
    'Salary': [60000, 80000, 75000, 55000, 90000, 85000],
    'Years of Service': [5, 3, 7, 4, 6, 2]
})
```

**Test Code:**
```python
assert dep