In [1]:
import dotenv
dotenv.load_dotenv();

In [10]:
# NOTE(review): the recorded output of this call is False, so presumably no
# variables were loaded from this path (it looks like a directory rather than
# a .env file — confirm). The absolute, machine-specific path also means this
# cell will not behave the same on another machine.
dotenv.load_dotenv("/home/jupyteach-msda/jupyteach-ai")

False

In [21]:
from langchain.prompts import (
    PromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.callbacks import StdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler

from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List
import dotenv
dotenv.load_dotenv();

# import langchain
# from langchain.cache import SQLiteCache
# langchain.llm_cache = SQLiteCache(database_path=".langchain.db")

In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain.memory import ConversationBufferMemory

# System message for the chat example. It is passed verbatim as a SystemMessage,
# so braces are literal JSON here and must NOT be doubled as they would be in a
# format-style template.
#
# The output schema embedded below is written out by hand and mirrors the fields
# of MultipleChoiceQuestion (question_text, choices, correct_answer); keep the
# two in sync when either changes.
#
# Fixes relative to the previous text: the example schema had one closing brace
# too many (it was not valid JSON), and "Occasionally" was misspelled.
system_prompt = """You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"question_text": {"title": "Question Text", "description": "The text of a question. Markdown formatted. Can include Python snippets for reference", "type": "string"}, "choices": {"title": "Choices", "description": "Set of possible answers to this question. Exactly one should be correct", "type": "array", "items": {"type": "string"}}, "correct_answer": {"title": "Correct Answer", "description": "Index of the correct choice", "type": "integer"}}, "required": ["question_text", "choices", "correct_answer"]}
```

The question_text and choices can all be markdown strings, but must be placed inside the JSON object

Occasionally the user may ask you to do something like produce a similar question, or try again and make it more difficult, or produce 5 more on that topic. 

You should follow the user request, but make sure you always output in a consistent way.

"""

# Conversation memory. It uses the same "chat_history" key as the
# MessagesPlaceholder in the prompt below.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Chat prompt layout, in order: fixed system message, earlier turns, newest request.
prompt = ChatPromptTemplate.from_messages(
    [
        # Persistent system prompt.
        SystemMessage(content=system_prompt),
        # Slot where the remembered conversation is inserted.
        MessagesPlaceholder(variable_name="chat_history"),
        # Slot where the newest human input is injected.
        HumanMessagePromptTemplate.from_template("{human_input}"),
    ]
)

class MultipleChoiceQuestion(BaseModel):
    # Structured form of one generated multiple choice question.
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose; presumably pydantic would surface a docstring in the generated
    # schema and thereby change the parser's format instructions — confirm
    # before converting this into a docstring.
    question_text: str = Field(
        description="The text of a question. Markdown formatted. Can include Python snippets for reference"
    )
    choices: List[str] = Field(
        description="Set of possible answers to this question. Exactly one should be correct"
    )
    # In the recorded outputs this index appears to be zero-based.
    correct_answer: int = Field(description="Index of the correct choice")

    def __repr__(self):
        """Return the question text, a "- " bulleted list of the choices, and
        the answer index, each separated by a blank line."""
        bullets = "- " + "\n- ".join(self.choices)
        sections = (self.question_text, bullets, str(self.correct_answer))
        return "\n\n".join(sections)


# Template for the single-question (non-chat) flow. It has four placeholders:
# {category}, {difficulty} and {topic} are supplied per call, while
# {format_instructions} is filled in from the output parser when the
# PromptTemplate is built.
#
# Fix relative to the previous text: "pratice" -> "practice", which also matches
# the wording used in the system prompt of the chat example.
multiple_choice_prompt_template = """Produce a multiple choice practice question to help the user solidify their understanding of {category}.

The question should be rated at a difficulty level of {difficulty}/3, with 1 being the easiest and 3 being the hardest

The question_text and choices can all be markdown strings, but must be placed inside the JSON object

{format_instructions}

The topic is: {topic}
"""


# Parser that turns a raw JSON reply into a MultipleChoiceQuestion instance.
mcparser = PydanticOutputParser(pydantic_object=MultipleChoiceQuestion)

# Prompt for the single-question flow: the parser's format instructions are
# bound once here, leaving topic / difficulty / category to be given per call.
mcprompt = PromptTemplate(
    input_variables=["topic", "difficulty", "category"],
    partial_variables=dict(format_instructions=mcparser.get_format_instructions()),
    template=multiple_choice_prompt_template,
)

## Chat Example

In [3]:
# Chat model created with the library defaults.
chat = ChatOpenAI()

# Chain that wires the chat prompt, the model, and the conversation memory
# together. verbose=True echoes the fully formatted prompt on every call.
chat_llm_chain = LLMChain(
    prompt=prompt,
    llm=chat,
    memory=memory,
    verbose=True,
)

In [4]:
# First turn: the request uses the "topic:" / "difficulty:" layout described in
# the system prompt. The raw reply is parsed into a MultipleChoiceQuestion and
# the bare `q` on the last line displays it.
x = chat_llm_chain.predict(human_input="topic: using stack, unstack, set_index, and reset_index to produce tidy form data\ndifficulty: 3")
q = mcparser.parse(x)
q



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Which of the following methods can be used to transform a DataFrame into tidy form data?

- stack
- unstack
- set_index
- reset_index
- all of the above

4

In [5]:
# Follow-up turn: neither topic nor difficulty is restated, so the request
# relies on the conversation history kept by the chain's memory.
x2 = chat_llm_chain.predict(human_input="Great, please produce another question on the same topic")
q2 = mcparser.parse(x2)
q2



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Which of the following methods can be used to transform tidy form data into a DataFrame?

- stack
- unstack
- set_index
- reset_index
- none of the above

4

In [6]:
# Follow-up turn asking for variety; the topic is again implied by the history.
x3 = chat_llm_chain.predict(human_input="one more please. Add some variety")
q3 = mcparser.parse(x3)
q3



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Which of the following methods can be used to transform a MultiIndex DataFrame into a single-level index DataFrame?

- stack
- unstack
- set_index
- reset_index
- none of the above

3

In [7]:
# Follow-up turn that asks for a code snippet inside the question text.
x4 = chat_llm_chain.predict(human_input="Great, one more with a code snippet as part of the question prompt")
q4 = mcparser.parse(x4)
q4



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Consider the following DataFrame:

```python
data = {'Name': ['John', 'Alice', 'Bob', 'Emma'],
        'Subject': ['Math', 'English', 'Science', 'History'],
        'Score': [90, 85, 95, 80]}

df = pd.DataFrame(data)
```

Which of the following methods can be used to transform the DataFrame `df` into tidy form data?

- stack
- unstack
- set_index
- reset_index
- all of the above

4

In [8]:
# Follow-up turn: "like that" refers to the previous reply, so this depends on
# the conversation history.
x5 = chat_llm_chain.predict(human_input="That's great, one more like that please")
q5 = mcparser.parse(x5)
q5



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Consider the following DataFrame:

```python
data = {'Name': ['John', 'Alice', 'Bob', 'Emma'],
        'Subject': ['Math', 'English', 'Science', 'History'],
        'Score': [90, 85, 95, 80]}

df = pd.DataFrame(data)
```

Which of the following methods can be used to transform the tidy form data into the original DataFrame `df`?

- stack
- unstack
- set_index
- reset_index
- none of the above

4

In [9]:
# Follow-up turn that asks for the previous question to be reworded under a
# vocabulary constraint (avoid the word `unstack`).
x6 = chat_llm_chain.predict(human_input="Try that same question again, but don't use the word `unstack`. Instead talk about rotating row labels up to be column labels")
q6 = mcparser.parse(x6)
q6



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Consider the following DataFrame:

```python
data = {'Name': ['John', 'Alice', 'Bob', 'Emma'],
        'Subject': ['Math', 'English', 'Science', 'History'],
        'Score': [90, 85, 95, 80]}

df = pd.DataFrame(data)
```

Which of the following methods can be used to rotate the row labels up to be column labels in the DataFrame `df`?

- stack
- pivot
- set_index
- reset_index
- none of the above

1

In [10]:
# Switches to a new topic within the same conversation.
# NOTE(review): this message also sends a "category:" field, which the system
# prompt does not list among its inputs (it names only topic and difficulty).
x7 = chat_llm_chain.predict(human_input="topic: for loops\ncategory: python, control-flow\ndifficulty: 3")
q7 = mcparser.parse(x7)
q7



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Which of the following statements best describes a for loop in Python?

- A for loop is used to iterate over a sequence (such as a list, tuple, or string) or other iterable object
- A for loop is used to execute a block of code repeatedly based on a condition
- A for loop is used to define a function in Python
- A for loop is used to perform arithmetic operations in Python

0

In [11]:
# Follow-up turn on the for-loop topic; the topic is implied by the history.
x8 = chat_llm_chain.predict(human_input="That's great, one more like that please")
q8 = mcparser.parse(x8)
q8



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Which of the following is the correct syntax for a for loop in Python?

- for i in range(10):
- for i in range(1, 10, 2):
- for i in [1, 2, 3, 4]:
- for i in 'hello':

2

In [12]:
# Follow-up turn that steers the question toward reasoning about variables
# updated inside a loop.
x9 = chat_llm_chain.predict(human_input="Let's ask another one on the same topic, but require them to reason about how loop can be used to update variables")
q9 = mcparser.parse(x9)
q9



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a smart, helpful teaching assistant chatbot named Callisto.

You assist professors that teach courses about Python and pandas to economics students. 

You have 5+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce a multiple choice practice question to help students solidify their understanding of the given python and/or pandas topic

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array

Consider the following code snippet:

```python
numbers = [1, 2, 3, 4, 5]
result = 0

for num in numbers:
    result += num

print(result)
```

What is the value of `result` after executing this code?

- 0
- 1
- 5
- 15
- 25

3

## Single Question example

In [13]:
class QuestionGenerator:
    """Generate multiple choice practice questions with single (non-chat) LLM calls.

    Instances hold four attributes, all set in ``__init__``:

    - ``model``: the LLM used for generation.
    - ``mcparser``: a ``PydanticOutputParser`` for ``MultipleChoiceQuestion``.
    - ``mcprompt``: a ``PromptTemplate`` built from
      ``multiple_choice_prompt_template`` with the parser's format
      instructions pre-filled.
    - ``mcchain``: an ``LLMChain`` combining the three objects above.
    """

    def __init__(self, model=None):
        """Build the parser, prompt and chain.

        Args:
            model: LLM to use. When ``None``, an ``OpenAI`` model named
                "text-davinci-003" with temperature 0.8 is created.
        """
        # TODO: set up sqlite based history
        # TODO: make it a chat model so we can ask for follow up questions
        # TODO: prep for MultipleSelection and code type questions
        # TODO: possibly provide some examples jupyteach question database
        if model is None:
            model_name = "text-davinci-003"
            temperature = 0.8
            self.model = OpenAI(model_name=model_name, temperature=temperature)
        else:
            self.model = model

        self.mcparser = PydanticOutputParser(pydantic_object=MultipleChoiceQuestion)
        self.mcprompt = PromptTemplate(
            template=multiple_choice_prompt_template,
            input_variables=["topic", "difficulty", "category"],
            partial_variables={"format_instructions": self.mcparser.get_format_instructions()},
        )
        self.mcchain = LLMChain(
            llm=self.model,
            prompt=self.mcprompt,
            output_parser=self.mcparser,
        )

    def multiple_choice_writer(self, topic: str, category: str = "python programming", difficulty: int = 2):
        """Generate one multiple choice question.

        Args:
            topic: Subject of the question.
            category: Broader area the topic belongs to.
            difficulty: 1 (easiest) to 3 (hardest).

        Returns:
            Whatever ``self.mcchain.run`` returns for these inputs.

        Raises:
            AssertionError: If ``difficulty`` is not 1, 2 or 3.
        """
        assert 0 < difficulty < 4, "difficulty must be 1, 2 or 3"
        return self.mcchain.run(topic=topic, category=category, difficulty=difficulty)

    def many_multiple_choice(self, topic: str, category: str = "python programming", difficulty: int = 2, N: int = 10):
        """Generate ``N`` multiple choice questions for the same inputs.

        Args:
            topic: Subject of the questions.
            category: Broader area the topic belongs to.
            difficulty: 1 (easiest) to 3 (hardest).
            N: Number of questions to request. Values of 0 or less yield an
                empty list without calling the chain.

        Returns:
            A list with one parsed question per generated result.

        Raises:
            AssertionError: If ``difficulty`` is not 1, 2 or 3.
        """
        # Same range check as multiple_choice_writer, so both entry points agree.
        assert 0 < difficulty < 4, "difficulty must be 1, 2 or 3"
        if N <= 0:
            return []

        # Honour N (this was previously hard-coded to 10) and give every
        # request its own dict rather than N references to a single dict.
        inputs = [
            dict(topic=topic, category=category, difficulty=difficulty)
            for _ in range(N)
        ]
        res = self.mcchain.generate(inputs)
        # generate() results are parsed here, taking the first generation
        # produced for each input.
        return [self.mcparser.parse(x[0].text) for x in res.generations]

In [14]:
# No model is passed, so the generator creates its default OpenAI model
# ("text-davinci-003", temperature 0.8).
g = QuestionGenerator()

In [15]:
# The result is not assigned and the trailing semicolon suppresses the cell
# output, so the generated question is discarded.
g.multiple_choice_writer(topic="groupby operations", category="the python pandas library", difficulty=3);

In [17]:
# Positional arguments are (topic, category, difficulty). Note that this
# rebinds `q`, which the chat example above also used.
q = g.multiple_choice_writer("DataFrame string methods and data cleaning", "python/pandas", 1)

In [18]:
# Display the generated question (text, choices, answer index).
q

What is the best option for cleaning strings in a Pandas DataFrame?

- Use the `.strip()` method
- Use the `.replace()` method
- Use the `.lower()` method
- Use the `.upper()` method

0

In [19]:
# Hardest difficulty (3) on a plain-Python topic; rebinds `q` again.
q = g.multiple_choice_writer("for loops", "python control-flow", 3)

In [20]:
# Print only the question text, leaving out the choices and the answer index.
print(q.question_text)

What is the most common way to structure a for loop in Python?
