In [1]:
import os

from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

load_dotenv()

# Collection name handed to PGVector when the vector store is built.
COLLECTION_NAME = "documents"

# Postgres connection string for the vector store.
# The value is taken from the PGVECTOR_CONNECTION_STRING environment variable
# when it is set (load_dotenv() has already run at this point, so a .env entry
# works too).  The literal below is only a backward-compatible fallback.
# NOTE(review): the fallback embeds a password in the notebook -- move it to
# .env and drop the default once every environment defines the variable.
DB_CONNECTION = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres",
)

def get_vectorstore():
    """Return a PGVector store wired to OpenAI embeddings.

    The store is configured from the module-level ``COLLECTION_NAME`` and
    ``DB_CONNECTION`` constants.
    """
    return PGVector(
        embedding_function=OpenAIEmbeddings(),
        collection_name=COLLECTION_NAME,
        connection_string=DB_CONNECTION,
    )

# Build the vector store once at module level and expose it as a retriever.
# create_chain() reads the `retriever` global when it builds its search tool.
db = get_vectorstore()
retriever = db.as_retriever()

In [2]:
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
from langchain.agents.agent_toolkits import create_retriever_tool

In [3]:
from langchain.schema.messages import SystemMessage

def create_chain(system_message_text, **kw):
    """Build a conversational retrieval agent driven by a custom system prompt.

    Parameters
    ----------
    system_message_text : str
        Text used as the content of the agent's system message.
    **kw
        Extra keyword arguments forwarded unchanged to
        ``create_conversational_retrieval_agent``.  ``verbose`` defaults to
        ``False`` when not supplied.  ``llm``, ``tools`` and ``system_message``
        are set by this function and must not be passed.

    Returns
    -------
    The object returned by ``create_conversational_retrieval_agent``.
    """
    # Kept as a function-local import, as in the original cell.
    from langchain.chat_models import ChatOpenAI

    # step 1: create llm
    llm = ChatOpenAI(temperature=0)

    # step 2: create retriever tool on top of the module-level `retriever`
    tool = create_retriever_tool(
        retriever,
        "search_course_content",
        "Searches and returns documents regarding the contents of the course and notes from the instructor.",
    )

    # step 3: create system message from the text passed in as an argument
    system_message = SystemMessage(content=system_message_text)

    # The evaluate_prompt_* helpers forward their **kw here; accept them and
    # hand them to the agent factory instead of failing with a TypeError.
    # `kw` is a fresh dict per call, so setdefault does not leak state.
    kw.setdefault("verbose", False)  # quiet by default to keep output clean

    return create_conversational_retrieval_agent(
        llm=llm,
        tools=[tool],
        system_message=system_message,
        **kw,
    )

In [4]:
# BLS API conversation: a greeting followed by three course questions.
query1 = "Hi, This is Yassin"
query2 = "Last week we learned about the methods of BLS API and exploration, Could give a summary? "
query3 = "How can BLS be explored?"
query4 = "What type of data can I access using the BLS api?"

# Ordered query set; used as the default for evaluate_prompt_bls.
bls_queries = [query1, query2, query3, query4]

#query5 = "What Python libraries do need to import to access the API?"
#query6 = "In an inverse problem, What modeling decisions Python can solve a statistical situation?"
#query7 = "We know that SQL is databases, data processing programming language. How SQL can communicate with databases?"
#query8 = "Can you retrieve relevant information from the database to answer this question: What are data storage formats with Pandas?"
#query9 = "Can you tell the differences of the storage formats betwwen JSON and SQL? Tell moe the advantages and disadvantages for each, and which one is more beneficial or useful."
#query10 = "When it comes to time-series, Can you define the concept of Aliases? How does Aliases in frequency arguments? Give me an example using Python that represents each alias applied as Dateoffset. Can this examople be used in MATLAB?"


# Statistics conversation (inverse/direct problem, numpy, seaborn).
# NOTE(review): the first and last entries are identical -- confirm the
# repetition is intentional (e.g. to probe conversation memory).
stats_queries = [
    "In an inverse problem, What modeling decisions Python can solve a statistical situation?",
    "what about the direct problem, what is that?",
    "How can I use numpy to solve the direct problem for a given statistical model?",
    "Can you give me a seaborn example to visualize statistical anlysis?",
    "In an inverse problem, What modeling decisions Python can solve a statistical situation?"
]

# SQL conversation: basics of SQL and how it is used together with pandas.
sql_queries = [
    "What does SQL stand for?",
    "How des SQL communicate with a database?",
    "Can I use pandas with SQL?",
    "Can you provide me an example using SQL and Python?",
    "What database SQL libraries that can manipulated with pandas?",
    "What are data storage formats with Pandas?"
]

# Time-series conversation about aliases in frequency arguments / Dateoffset.
aliases_queries = [
    "Can you define the concept of Aliases?",
    "How does Aliases in frequency arguments work?",
    "Can you provide an example using Python that represents each alias applied as Dateoffset?",
    "Can this example be used in MATLAb?"
]

# Short web conversation: web scraping, HTTP, CSS and JSON.
web_queries = [
    "What is Web-scraping?",
    "What does HTTP stands for?",
    "What elements Python can develop data through HTTP?",
    "Can we use CSS while working with Python?",
    "Can we scape websites using JSON?"
]

# Markov-chain conversation: definitions, transition matrices and stationary
# distributions.  Every entry must end with a comma: adjacent string literals
# are concatenated implicitly, which previously merged the 4th and 5th
# questions into a single query.
markov_queries = [
    "What is a Markov chain?",
    "How is the Markov property defined in the context of Markov chains?",
    "What is the state space of a Markov chain?",
    "Explain the transition probability matrix in a Markov chain.",
    "How are transition probabilities between states represented in a Markov chain?",
    "Can you explain the meaning of the entries in a transition probability matrix?",
    "What is the significance of the sum of transition probabilities for each state in a row?",
    "What is a stationary distribution in the context of Markov chains?",
    "Under what conditions does a Markov chain have a unique stationary distribution?",
]

# Longer web-scraping conversation: process, HTML/CSS, libraries and storage.
webscraping_queries = [
    "What is web scraping, and explain the basic steps involved in a typical web scraping process?",
    "How does the structure of HTML contribute to web scraping?",
    "What role does CSS play in web scraping, especially when selecting elements?",
    "What are some popular libraries for web scraping in Python?",
    "What are common methods for storing scraped data?",
    "Discuss best practices for organizing and managing large datasets obtained through web scraping."
]

# Data storage (CSV, JSON, databases, Parquet, serialization) followed by
# pandas reshaping questions (melt, pivot, pivot_table).
datastorage_reshaping_queries = [
    "Compare and contrast CSV and JSON as data storage formats.",
    "In what scenarios would you prefer to use CSV over JSON, and vice versa?",
    "Discuss any limitations or advantages of each format in terms of readability, size, and compatibility.",
    "How can you store pandas DataFrames in a relational database?",
    "Discuss the advantages and disadvantages of using databases (e.g., SQLite, MySQL) for storing structured data compared to file-based formats.",
    "What considerations should be taken into account when choosing between a database and a file format for data storage?",
    "What is the Parquet file format, and how does it differ from other file formats like CSV or JSON?",
    "Discuss the benefits of using Parquet for storing large datasets.",
    "How can you read and write Parquet files in pandas?",
    "What is data serialization, and how is it relevant to data storage?",
    "What is the purpose of the melt function in pandas?",
    "Provide an example of melting a DataFrame and explain when this operation is useful.",
    "How does melting help in converting wide-format data to long-format data?",
    "Explain the concept of pivoting in pandas.",
    "Provide examples of using the pivot and pivot_table functions to reshape data."
]   
# Every topic's query list, one list per topic, in definition order.
queries = [
    bls_queries,
    stats_queries,
    sql_queries,
    aliases_queries,
    web_queries,
    markov_queries,
    webscraping_queries,
    datastorage_reshaping_queries,
]

In [9]:
def report_on_message(msg):
    """Print a short report for one agent response.

    ``msg`` is indexed with the keys ``"intermediate_steps"`` and ``"output"``.
    The report states whether any intermediate steps were recorded, then shows
    the output text, followed by blank lines as a separator.
    """
    used_steps = len(msg["intermediate_steps"]) > 0
    print(f"any intermediate_steps?:  {used_steps}")
    print("output:\n", msg["output"])
    print("\n\n")


def chat_and_report(chat_conv, query):
    """Send ``query`` to ``chat_conv``, print a report, and return the reply.

    ``chat_conv`` is called with ``{"input": query}``; its result is passed to
    ``report_on_message`` and then returned unchanged.
    """
    response = chat_conv({"input": query})
    report_on_message(response)
    return response

def _evaluate_queries(prompt, queries, chain_kwargs):
    """Run ``queries`` in order through one fresh chain built from ``prompt``.

    Shared implementation behind every ``evaluate_prompt_*`` function.

    Parameters
    ----------
    prompt : str
        System prompt text passed to ``create_chain``.
    queries : iterable of str
        Questions sent one after another to the same chain.
    chain_kwargs : dict
        Keyword arguments forwarded to ``create_chain``.  Passed as a plain
        dict so no key can clash with this helper's own parameter names.

    Returns
    -------
    list
        One ``chat_and_report`` result per query, in input order.
    """
    chat_conv = create_chain(prompt, **chain_kwargs)
    out = []
    for number, query in enumerate(queries, start=1):
        print(f"********** Query {number}\n")
        print(f"input: {query}")
        out.append(chat_and_report(chat_conv, query))
    return out


def evaluate_prompt_bls(prompt, bls_queries=bls_queries, **kw):
    """Evaluate ``prompt`` on the BLS queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, bls_queries, kw)


def evaluate_prompt_stats(prompt, stats_queries=stats_queries, **kw):
    """Evaluate ``prompt`` on the statistics queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, stats_queries, kw)


def evaluate_prompt_sql(prompt, sql_queries=sql_queries, **kw):
    """Evaluate ``prompt`` on the SQL queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, sql_queries, kw)


def evaluate_prompt_aliases(prompt, aliases_queries=aliases_queries, **kw):
    """Evaluate ``prompt`` on the aliases queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, aliases_queries, kw)


def evaluate_prompt_web(prompt, web_queries=web_queries, **kw):
    """Evaluate ``prompt`` on the web queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, web_queries, kw)


def evaluate_prompt_markov(prompt, markov_queries=markov_queries, **kw):
    """Evaluate ``prompt`` on the Markov-chain queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, markov_queries, kw)


def evaluate_prompt_webscraping_queries(prompt, webscraping_queries=webscraping_queries, **kw):
    """Evaluate ``prompt`` on the web-scraping queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, webscraping_queries, kw)


def evaluate_prompt_datastorage_reshaping_queries(prompt, datastorage_reshaping_queries=datastorage_reshaping_queries, **kw):
    """Evaluate ``prompt`` on the data storage / reshaping queries; ``**kw`` goes to ``create_chain``."""
    return _evaluate_queries(prompt, datastorage_reshaping_queries, kw)

In [10]:
# System prompt under evaluation.  The backslash right after the opening
# quotes suppresses the leading newline, so the text starts at "You are".
# The wording is sent verbatim as the system message via create_chain().
final_prompt_1 = """\
You are a helpful, knowledgeable, and smart teaching assistant.

You specialize in helping students understand concepts their instructors teach by:

1. Decribe concepts and formulas as simple as you can in a real-world scenario.
2. Providing additional examples of the topics being discussed
3. Summarizing content from the instructor, which will be provided to you along with the student's question
4. Reply back with more guidance with concise responses and give a step by step explainations following
the key points that have been covered with more details.
5. Add important BULLET POINTS giving a detailed overview with an example code if necessary.
6. Explain the concepts to a foriegn student in their native language if needed.  
7. Create a sophisticated, humor joke about the concepts and formulas.


Feel free to use any tools available to look up relevant information, and show references only if necessary
"""
# Run every topic's query set against final_prompt_1.  Each call builds its
# own chain through create_chain(), prints a report per query, and returns
# the list of responses for that topic.
message_bls = evaluate_prompt_bls(final_prompt_1)
message_stats = evaluate_prompt_stats(final_prompt_1)
message_sql = evaluate_prompt_sql(final_prompt_1)
message_aliases = evaluate_prompt_aliases(final_prompt_1)
message_web = evaluate_prompt_web(final_prompt_1)
message_markov = evaluate_prompt_markov(final_prompt_1)
message_webscrapping =  evaluate_prompt_webscraping_queries(final_prompt_1)
message_datastorage_reshaping = evaluate_prompt_datastorage_reshaping_queries(final_prompt_1)

********** Query 1

input: Hi, This is Yassin
any intermediate_steps?:  False
output:
 Hello Yassin! How can I assist you today?



********** Query 2

input: Last week we learned about the methods of BLS API and exploration, Could give a summary? 
any intermediate_steps?:  True
output:
 Sure! Here is a summary of the methods of the BLS API and exploration:

- To access data from the BLS API, we need to identify the necessary query parameters that should be sent to specify our request.
- We may also have a payload that provides additional information about the specifics of our request.
- Headers can provide context about who we are and why we're making the request.
- Authentication is done using an API key to authenticate ourselves.

To explore the BLS API, we can follow these steps:

1. Go to the BLS API documentation page.
2. Find the section that describes the different endpoints and documentation for the API.
3. Identify the endpoint that allows us to access the desired data.
4. Pa