In [25]:
## Import All Necessary Libraries
import dotenv
from dotenv import load_dotenv
from typing import List, Dict, Any
from langchain.chains import LLMChain
from langchain.schema import LLMResult
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.pgvector import PGVector
from langchain.memory import ConversationBufferMemory
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.chains import ConversationalRetrievalChain
from langchain.agents.agent_toolkits import create_retriever_tool
from question_generator_model import (
    MultipleSelection, 
    SingleSelection, 
    Code, 
    FillInBlank
)
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)

In [2]:
## Create Output Parsers
ms_parser = PydanticOutputParser(pydantic_object = MultipleSelection)
code_parser = PydanticOutputParser(pydantic_object = Code)
ss_parser = PydanticOutputParser(pydantic_object = SingleSelection)
fib_parser = PydanticOutputParser(pydantic_object = FillInBlank)

In [4]:
## Load Vector Store
# load_dotenv()
#
# DB_CONNECTION = "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres"
# COLLECTION_NAME = "documents"

# def get_vectorstore():
#     embeddings = OpenAIEmbeddings()
#
#    db = PGVector(embedding_function = embeddings,
#        collection_name = COLLECTION_NAME,
#        connection_string = DB_CONNECTION,
#    )
#    return db

In [5]:
## Initialize Retriever
# db = get_vectorstore()
# retriever = db.as_retriever()

In [6]:
#class JupyteachQuestionChain(LLMChain):
#    def create_outputs(self, llm_result: LLMResult) -> List[Dict[str, Any]]:
#        out = super().create_outputs(llm_result)
#        return [
#            {**d, "original_text_response": g[0].text}
#            for (d, g) in zip(out, llm_result.generations)
#        ]
#def build_llm_for_pydantic_model(model_class):
#    parser = PydanticOutputParser(pydantic_object=model_class)
#    system = SystemMessagePromptTemplate.from_template(common_system_prompt)
#    human = HumanMessagePromptTemplate.from_template("{input}")
#   
#    prompt = ChatPromptTemplate(
#        messages=[system, MessagesPlaceholder(variable_name="history"), human],
#        partial_variables={"format_instructions": parser.get_format_instructions()},
#        # output_parser=parser,
#    )  
#    model = ChatOpenAI(temperature=0)
#    memory = ConversationBufferMemory(
#        memory_key="history", 
#        return_messages=True,
#        output_key="original_text_response",
#    )
#    tool = create_retriever_tool(
#        retriever,
#        "search_course_content",
#        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
#    )
#    tools = [tool]
#    return JupyteachQuestionChain(
#        memory=memory,
#        llm=model,
#        prompt=prompt,
#        output_parser=parser,
#        output_key="question",
#        return_final_only=False,
#        tools=tools
#    )

## Prompt 1

In [3]:
## Create System Prompt 1
system_prompt_1 = """You are a smart, helpful teaching assistant chatbot named AcademiaGPT.

You assist college professors that teach courses about about Python, data science, and machine learning to graduate students. 

You have 25+ years of experience writing pandas code to do a variety of data analysis tasks. 

Your responses typically include examples of datasets or code snippets.

For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce practice questions to help students solidify their understanding of the provided topic.

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

If the user asks you for another question and does not specify either a new topic or a new difficulty, you must use the previous topic or difficulty.

Your responses must always exactly match the specified format with no extra words or content.

{format_instructions}
"""

In [9]:
## Create Chain & LLM for Prompt 1 Pydantic Model

dotenv.load_dotenv("/home/jupyteach-msda/jupyteach-ai/.env")

class JupyteachQuestionChain(LLMChain):
    def create_outputs(self, llm_result: LLMResult) -> List[Dict[str, Any]]:
        out = super().create_outputs(llm_result)
        return [
            {**d, "original_text_response": g[0].text}
             for (d, g) in zip(out, llm_result.generations)
        ]
def build_llm_for_pydantic_model(model_class):
    parser = PydanticOutputParser(pydantic_object = model_class)
    system = SystemMessagePromptTemplate.from_template(system_prompt_1)
    human = HumanMessagePromptTemplate.from_template("{input}")
    
    prompt = ChatPromptTemplate(
        messages = [system, MessagesPlaceholder(variable_name = "history"), human],
        partial_variables = {"format_instructions": parser.get_format_instructions()},
        # output_parser=parser,
    )
    model = ChatOpenAI(temperature = 0)
    
    memory = ConversationBufferMemory(
        memory_key = "history", 
        return_messages = True,
        output_key = "original_text_response",
    )
    return JupyteachQuestionChain(
        memory = memory,
        llm = model,
        prompt = prompt,
        output_parser = parser,
        output_key = "question",
        return_final_only = False,
    )

In [10]:
ss_chain = build_llm_for_pydantic_model(SingleSelection)
q_ss = ss_chain.invoke(input = "topic: pandas dataframes\ndifficulty: 3")
q_ss["question"]

Given a DataFrame `df` with columns 'A', 'B', and 'C', how can you drop all rows where the value in column 'A' is greater than 10?

- [ ] `df = df[df['A'] <= 10]`
- [x] `df = df.drop(df[df['A'] > 10].index)`
- [ ] `df = df[df['A'] > 10].drop()`
- [ ] `df = df.drop(df[df['A'] > 10])`


In [11]:
ms_chain = build_llm_for_pydantic_model(MultipleSelection)
q_ms = ms_chain.invoke(input = "topic: scikit-learn\ndifficulty: 2")
q_ms["question"]

What is the purpose of the `fit` method in scikit-learn?

- [x] To train the model using the input data
- [ ] To evaluate the performance of the model
- [ ] To make predictions using the trained model


In [12]:
code_chain = build_llm_for_pydantic_model(Code)
q_code = code_chain.invoke(input = "topic: seaborn visualizations\ndifficulty: 2")
q_code["question"]

Create a scatter plot using Seaborn to visualize the relationship between two variables `x` and `y` from a given DataFrame `df`. Set the color of the markers to blue and add a title to the plot as 'Scatter Plot'.

```python
import seaborn as sns
import matplotlib.pyplot as plt

# Given DataFrame
# df = ...

# Scatter plot
sns.scatterplot(data=df, x=..., y=..., color=...)
plt.title(...)
plt.show()
```

**Solution**

```python
import seaborn as sns
import matplotlib.pyplot as plt

# Given DataFrame
# df = ...

# Scatter plot
sns.scatterplot(data=df, x='x', y='y', color='blue')
plt.title('Scatter Plot')
plt.show()
```

**Test Suite**

```python
import seaborn as sns
import matplotlib.pyplot as plt

import seaborn as sns
import matplotlib.pyplot as plt

# Given DataFrame
# df = ...

# Scatter plot
sns.scatterplot(data=df, x='x', y='y', color='blue')
plt.title('Scatter Plot')
plt.show()

assert 'scatterplot' in globals()
assert 'plt' in globals()
assert plt.gca().has_data()
```

In [13]:
fib_chain = build_llm_for_pydantic_model(FillInBlank)
q_fib = fib_chain.invoke(input = "topic: reshaping data in pandas\ndifficulty: 2")
q_fib["question"]

Suppose you have a DataFrame `df` with the following structure:

|   | Name   | Subject | Score |
|---|--------|---------|-------|
| 0 | Alice  | Math    | 90    |
| 1 | Alice  | Science | 85    |
| 2 | Bob    | Math    | 95    |
| 3 | Bob    | Science | 92    |

You want to reshape the data so that each unique Name becomes a column, and the corresponding Scores are the values in the new columns. Fill in the blanks below to achieve this:


```python
___X(df, index='Subject', columns='Name', values='Score')
```

**Solution**

[pivot_table]
```

**Rendered Solution**

```python
pivot_table(df, index='Subject', columns='Name', values='Score')
```

**Test Suite**

```python
import pandas as pd

data = {'Name': ['Alice', 'Alice', 'Bob', 'Bob'],
        'Subject': ['Math', 'Science', 'Math', 'Science'],
        'Score': [90, 85, 95, 92]}

df = pd.DataFrame(data)

pivot_table(df, index='Subject', columns='Name', values='Score')

expected_columns = ['Alice', 'Bob']
expected_index = ['Math', 'Science']

assert df_pivot.columns.tolist() == expected_columns
assert df_pivot.index.tolist() == expected_index
assert df_pivot.loc['Math', 'Alice'] == 90
assert df_pivot.loc['Science', 'Bob'] == 92
```

## Prompt 2

In [None]:
## Create System Prompt 2
system_prompt_2 = """For each message you will be given two inputs

topic: string
difficulty: integer

Your task is to produce questions that professors can utilize for testing and examination purposes.

The difficulty will be a number between 1 and 3, with 1 corresponding to a request for an easy question, and 3 for the most difficult question.

If the user asks you for another question and does not specify either a new topic or a new difficulty, you must use the previous topic or difficulty.

Your responses must always exactly match the specified format with no extra words or content.

{format_instructions}
"""

In [17]:
import dotenv
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores.pgvector import PGVector
from langchain.embeddings.openai import OpenAIEmbeddings

dotenv.load_dotenv("/home/jupyteach-msda/jupyteach-ai/.env")

# Load Vector Store
dotenv.load_dotenv()
DB_CONNECTION = "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres"
COLLECTION_NAME = "documents"

def get_vectorstore():
    embeddings = OpenAIEmbeddings()
    db = PGVector(embedding_function = embeddings,
                  collection_name = COLLECTION_NAME,
                  connection_string = DB_CONNECTION)
    return db

# Initialize Retriever
db = get_vectorstore()
retriever = db.as_retriever()

# Update create_chain Function
def create_chain(system_message_text):
    # Step 1: Create LLM
    llm = ChatOpenAI(temperature = 0)

    # Step 2: Create Retriever Tool
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )
    tools = [tool]

    # Step 3: Create System Message from the Text Passed in as an Argument
    system_message = SystemMessage(content = system_message_text)

    # Return the Chain
    return create_conversational_retrieval_agent(
        llm = llm, 
        tools = tools, 
        verbose = False, 
        system_message = system_message
    )

In [26]:
class JupyteachQuestionChain(ConversationalRetrievalChain):
    def create_outputs(self, llm_result: LLMResult) -> List[Dict[str, Any]]:
        # As ConversationalRetrievalChain might handle outputs differently,
        # ensure that this call to super() aligns with its implementation
        out = super().create_outputs(llm_result)

        # The following assumes that the structure of outputs from
        # ConversationalRetrievalChain is similar to that from LLMChain
        return [
            {**d, "original_text_response": g[0].text}
            for (d, g) in zip(out, llm_result.generations)
        ]
# Build LLM for Pydantic Model
def build_llm_for_pydantic_model(model_class):
    parser = PydanticOutputParser(pydantic_object=model_class)
    system = SystemMessagePromptTemplate.from_template(system_prompt_1)
    human = HumanMessagePromptTemplate.from_template("{input}")
    
    prompt = ChatPromptTemplate(
        messages=[system, MessagesPlaceholder(variable_name="history"), human],
        partial_variables={"format_instructions": parser.get_format_instructions()},
        # output_parser=parser,
    )
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )
    tools = [tool]
    model = ChatOpenAI(temperature=0)
    
    memory = ConversationBufferMemory(
        memory_key="history", 
        return_messages=True,
        output_key="original_text_response",
    )

    # Adjust this part to incorporate any additional features or configurations
    # specific to ConversationalRetrievalChain
    return JupyteachQuestionChain(
        memory=memory,
        #llm=model,
        combine_docs_chain=,
        question_generator=,
        retriever=tools,
        #prompt=prompt,
        #output_parser=parser,
        output_key="question",
        #return_final_only=False,
        # Include any other necessary configurations or parameters specific to ConversationalRetrievalChain
    )

In [27]:
ss_chain = build_llm_for_pydantic_model(SingleSelection)
q_ss = ss_chain.invoke(input = "topic: pandas dataframes\ndifficulty: 3")
q_ss["question"]

ValidationError: 7 validation errors for JupyteachQuestionChain
combine_docs_chain
  field required (type=value_error.missing)
question_generator
  field required (type=value_error.missing)
retriever
  field required (type=value_error.missing)
llm
  extra fields not permitted (type=value_error.extra)
output_parser
  extra fields not permitted (type=value_error.extra)
prompt
  extra fields not permitted (type=value_error.extra)
return_final_only
  extra fields not permitted (type=value_error.extra)

In [31]:
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [32]:
db = get_vectorstore()

In [33]:
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), db.as_retriever(), memory=memory)