In [1]:
import os
from pathlib import Path
import dotenv
# Switch to the directory two levels up (presumably the project root, so that
# relative paths such as "conf/OAI_CONFIG_LIST.txt" resolve -- TODO confirm).
# The target is resolved only the first time this cell runs and is then
# remembered, so re-running the cell does not climb two more directories.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = Path("../../").resolve()
os.chdir(PROJECT_ROOT)
# Load environment variables from a .env file; the call's return value is
# the cell's displayed output.
dotenv.load_dotenv()

True

In [5]:
import math
import os
import json
from typing import Optional, Type

# Standard LangChain example
# from langchain.agents import create_spark_sql_agent
# from langchain.agents.agent_toolkits import SparkSQLToolkit
# from langchain.chat_models import ChatOpenAI

# Import things that are needed generically
from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool
from langchain.tools.file_management.read import ReadFileTool
from langchain.utilities.spark_sql import SparkSQL
# from pyspark.sql import SparkSession

import autogen

# Restrict the loaded model configurations to the models this notebook uses.
# Other candidates: "gpt-4", "gpt-3.5-turbo", "gpt-3.5-turbo-16k".
model_filter = {"model": ["gemini-pro"]}

config_list = autogen.config_list_from_json(
    "conf/OAI_CONFIG_LIST.txt",
    filter_dict=model_filter,
)

  from .autonotebook import tqdm as notebook_tqdm


## Making Function Calls


In [17]:
from langchain_community.document_loaders import PlaywrightURLLoader
from langchain_community.document_loaders import WebBaseLoader

from langchain_community.tools import BraveSearch

from typing import *

In [18]:
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter

from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate

# !pip install langchainhub -qqq
from langchain import hub

In [9]:
# Chat model shared by the map and the reduce step.
llm = ChatGoogleGenerativeAI(model="gemini-pro")

# --- Map step: prompt pulled from the LangChain hub.
map_prompt = hub.pull("rlm/map-prompt")
map_chain = LLMChain(prompt=map_prompt, llm=llm)

# --- Reduce step: start from the hub prompt, then overwrite the template of
# its last message so the model is asked for a comprehensive, detailed summary.
reduce_template = """
The following is set of summaries:
{doc_summaries}
Take these and distill it into a comprehensive and detailed summary.
Helpful Answer:
"""
reduce_prompt = hub.pull("rlm/reduce-prompt")
reduce_prompt.messages[-1].prompt.template = reduce_template
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Joins a list of documents into one string and hands it to `reduce_chain`
# through the "doc_summaries" prompt variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="doc_summaries",
    llm_chain=reduce_chain,
)

# Combines and iteratively reduces the mapped documents. The same stuff chain
# is used both as the final combine step and as the collapse step applied when
# the documents exceed the context for `StuffDocumentsChain`.
reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
    collapse_documents_chain=combine_documents_chain,
    token_max=4000,  # maximum number of tokens to group documents into
)

# Full pipeline: map `map_chain` over the documents, then combine the results.
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    document_variable_name="docs",  # variable in the map prompt that receives the documents
    reduce_documents_chain=reduce_documents_chain,
    return_intermediate_steps=False,  # do not return the per-document map outputs
)

# Token-based splitter used to chunk documents before they enter the pipeline.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=1000,
)

# Example usage:
# result = map_reduce_chain.invoke(
#   text_splitter.split_documents(docs)
# )

  warn_deprecated(


In [19]:
async def load_urls_playwright(urls: Optional[List[str]] = None):
    """Load the given web pages with ``PlaywrightURLLoader``.

    The loader is configured with ``remove_selectors=["header", "footer"]``.

    Args:
        urls: Page URLs to load. ``None`` (the default) is treated as an
            empty list. A fresh list is built on every call, so no list
            object is shared between calls or with the caller.

    Returns:
        Whatever ``loader.aload()`` yields -- presumably a list of LangChain
        ``Document`` objects (TODO confirm).
    """
    loader = PlaywrightURLLoader(
        urls=list(urls or []),
        remove_selectors=["header", "footer"],
    )
    return await loader.aload()

# docs = await load_urls_playwright()


async def advanced_url_extractor(query: str, search_params: Optional[dict] = None):
    """Run a Brave web search and load the pages it returns.

    Args:
        query: Search text passed to the Brave search tool.
        search_params: Keyword arguments forwarded to the tool as
            ``search_kwargs``. Defaults to ``{"count": 2}``; the default is
            created per call rather than shared as a mutable default argument.

    Returns:
        The result of ``load_urls_playwright`` for the links found.
    """
    if search_params is None:
        search_params = {"count": 2}

    search_tool = BraveSearch.from_api_key(
        api_key=os.getenv("BRAVE_API_KEY"),
        search_kwargs=search_params,
    )
    # The tool's output is parsed as JSON and iterated as a sequence of dicts.
    search_results = json.loads(search_tool.run(query))
    # Skip hits without a usable "link" so the loader never receives None.
    urls = [hit["link"] for hit in search_results if hit.get("link")]
    return await load_urls_playwright(urls)

# Search for a sample query and load the resulting pages (top-level `await`
# is supported inside IPython/Jupyter cells).
docs = await advanced_url_extractor("Genghis Khan")

In [24]:
from langchain_core.documents.base import Document

def generate_llm_config(tool):
    """Build a function-schema dict describing ``tool``.

    Args:
        tool: Object exposing ``name``, ``description`` and ``args``.

    Returns:
        A dict with the keys ``"name"`` (the tool name lower-cased, spaces
        replaced by underscores), ``"description"`` and ``"parameters"``.
        ``parameters["properties"]`` is ``tool.args`` when that is not
        ``None`` and an empty dict otherwise; ``parameters["required"]`` is
        always an empty list.
    """
    schema_name = tool.name.lower().replace(" ", "_")
    schema = {"name": schema_name, "description": tool.description}

    tool_args = tool.args
    schema["parameters"] = {
        "type": "object",
        "properties": {} if tool_args is None else tool_args,
        "required": [],
    }
    return schema

# Argument schema for the web search tool (used as `args_schema` by
# `AdvancedWebLookupTool`): it declares the single input the tool accepts.
class WebSearchInput(BaseModel):
    # The search query text; `Field()` is called without a default value.
    web_query: str = Field()

class AdvancedWebLookupTool(BaseTool):
    # Tool that searches the web for a query, loads the result pages and
    # returns a summary of them produced by `map_reduce_chain`.
    name = "web_search_engine"
    description = "Use this tool when you need access to current or additional data, news from the internet"
    args_schema: Type[BaseModel] = WebSearchInput
    # Documents backing the summary. A caller may pre-seed them; pre-seeded
    # documents are used as-is and never refetched.
    docs: Optional[List[Document]] = None
    # Query that produced `docs` when the tool fetched them itself
    # (stays None while `docs` is unset or was pre-seeded by the caller).
    docs_query: Optional[str] = None

    async def _run(self, web_query: str) -> str:
        """Summarise web pages relevant to ``web_query``.

        Documents are fetched with ``advanced_url_extractor`` and cached on
        the instance. The cache is refreshed whenever the tool is asked a
        different query than the one that filled it, so a later query is
        never answered from an earlier query's pages.

        NOTE: this is a coroutine, so the value returned by ``tool.run(...)``
        has to be awaited by the caller.

        Args:
            web_query: Search text.

        Returns:
            The ``"output_text"`` entry of the map-reduce chain's result.
        """
        fetched_for_other_query = (
            self.docs_query is not None and self.docs_query != web_query
        )
        if self.docs is None or fetched_for_other_query:
            self.docs = await advanced_url_extractor(web_query)
            self.docs_query = web_query

        chunks = text_splitter.split_documents(self.docs)
        return map_reduce_chain.invoke(chunks).get("output_text")


In [25]:
# Try the tool directly: `_run` is defined as a coroutine, so the value
# returned by `run` is awaited before the summary is printed.
tool=AdvancedWebLookupTool()
summary = await tool.run("Identity of Genghis Khan")
print(summary)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<loc

**Genghis Khan and the Mongol Empire**

**Early Life and Rise to Power**

Temüjin, later known as Genghis Khan, was born into a nomadic Mongol tribe in the late 12th century. His early life was marked by poverty and hardship, including the loss of his father to enemies and the abduction of his wife, Börte. Despite these challenges, Temüjin displayed remarkable leadership qualities, attracting followers and forming strategic alliances.

Through his charisma and military prowess, Temüjin gradually rose to prominence, uniting various Mongol tribes under his banner. He formed a powerful alliance with Toghril, Khan of the Kereit tribe, and gained the support of his childhood friend, Jamuka. Together, they launched successful campaigns against rival tribes, expanding their territory and consolidating their power.

**Unification of the Mongol Nation**

In 1206, Temüjin convened a kurultai, a traditional Mongol assembly, where he was proclaimed Genghis Khan, the "Universal Ruler." This marked 

In [52]:
# from langchain.chains.summarize import load_summarize_chain
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_openai import ChatOpenAI


# Tools available to the assistant. Only the web tool is advertised below;
# the file reader is instantiated but left out of the function list.
read_file_tool = ReadFileTool()
web_tool = AdvancedWebLookupTool()

# One function schema per advertised tool.
function_schemas = [generate_llm_config(web_tool)]
# function_schemas.append(generate_llm_config(read_file_tool))

# LLM settings handed to the AutoGen agents.
llm_config = dict(
    functions=function_schemas,
    config_list=config_list,  # Assuming you have this defined elsewhere
    timeout=120,
)

In [57]:
# Define a function to generate llm_config from a LangChain tool
def generate_llm_config(tool):
    """Build a function-schema dict describing ``tool``.

    Args:
        tool: Object exposing ``name``, ``description`` and ``args``.

    Returns:
        A dict with the keys ``"name"`` (the tool name lower-cased, spaces
        replaced by underscores), ``"description"`` and ``"parameters"``.
        ``parameters["properties"]`` is ``tool.args`` when that is not
        ``None`` and an empty dict otherwise; ``parameters["required"]`` is
        always an empty list.
    """
    schema_name = tool.name.lower().replace(" ", "_")
    schema = {"name": schema_name, "description": tool.description}

    tool_args = tool.args
    schema["parameters"] = {
        "type": "object",
        "properties": {} if tool_args is None else tool_args,
        "required": [],
    }
    return schema


# Proxy agent that executes function calls on behalf of the (absent) human.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    # End the chat when a non-empty message ends with "TERMINATE"
    # (trailing whitespace ignored).
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config={
        "work_dir": "coding",
        "use_docker": False,
    },  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
)

# Register the tool under exactly the name its function schema advertises.
# `generate_llm_config` normalises the name (lower-case, spaces -> "_"), so
# keying the map by the raw `web_tool.name` could leave the advertised
# function without a registered implementation.
web_function_name = generate_llm_config(web_tool)["name"]
user_proxy.register_function(
    function_map={
        web_function_name: web_tool._run,
        # read_file_tool.name: read_file_tool._run,
    }
)

# Assistant that decides when to call the registered function.
chatbot = autogen.AssistantAgent(
    name="chatbot",
    system_message="For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.",
    llm_config=llm_config,
)

# Start the conversation (the prompt has no placeholders, so it is a plain
# string rather than an f-string).
user_proxy.initiate_chat(
    chatbot,
    message="You are Genghis Khan, provide a graudation ceremony monologue to college students",
    llm_config=llm_config,
)

[33muser_proxy[0m (to chatbot):

A graudation ceremony monologue script that Genghis Khan would give to college students

--------------------------------------------------------------------------------


[33mchatbot[0m (to user_proxy):

**Genghis Khan:**

My valiant warriors of knowledge,

From the vast steppes of Central Asia, I, Genghis Khan, have come to this hallowed hall to impart wisdom upon you.

Like the thunderous hooves of my Mongol hordes, you have conquered the relentless pursuit of education. Today, you stand on the precipice of a new era, armed with the sharp weapons of intellect and the burning desire to shape the world.

Know this: the path you have chosen is not for the faint of heart. It is a path fraught with challenges and trials. But like the resilient Mongols, you must embrace adversity with unwavering determination.

Remember, true conquest lies not only in subjugating your enemies but also in conquering yourselves. The greatest battles you will fight are not on physical battlefields, but within the confines of your own minds.

Forge your spirits in the fires of knowledge. Seek out new ideas and perspectives, even if they challenge your own. Question assumption

RuntimeError: Google GenAI exception occurred while calling Gemini API: index: 0
finish_reason: SAFETY
safety_ratings {
  category: HARM_CATEGORY_SEXUALLY_EXPLICIT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HATE_SPEECH
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_HARASSMENT
  probability: NEGLIGIBLE
}
safety_ratings {
  category: HARM_CATEGORY_DANGEROUS_CONTENT
  probability: HIGH
}


# A PySpark Example

In [None]:
# Install PySpark into the running kernel's environment.
# NOTE(review): the version is unpinned -- consider pinning it for reproducible runs.
%pip install pyspark



In [None]:
# `SparkSession` is not imported anywhere else in this notebook (the import in
# the top cell is commented out), so bring it into scope here, after the
# `%pip install pyspark` cell; otherwise a fresh kernel raises NameError.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session and select the example database.
spark = SparkSession.builder.getOrCreate()
schema = "langchain_example"
spark.sql(f"CREATE DATABASE IF NOT EXISTS {schema}")
spark.sql(f"USE {schema}")
csv_file_path = "./sample_data/california_housing_train.csv"
table = "california_housing_train"

# Read the CSV (header row, inferred column types) and save it as a table,
# overwriting any existing one, then display its first rows.
# NOTE(review): the CSV path and the "file:/content/..." warehouse path look
# environment-specific -- confirm they exist where this notebook runs.
spark.read.csv(csv_file_path, header=True, inferSchema=True).write.option(
    "path", "file:/content/spark-warehouse/langchain_example.db/california_housing_train"
).mode("overwrite").saveAsTable(table)
spark.table(table).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    

In [None]:
# These names are only imported in commented-out lines at the top of the
# notebook, so import them here; otherwise a fresh kernel raises NameError.
from langchain.agents import create_spark_sql_agent
from langchain.agents.agent_toolkits import SparkSQLToolkit
from langchain.chat_models import ChatOpenAI

# Note, you can also connect to Spark via Spark connect. For example:
# db = SparkSQL.from_uri("sc://localhost:15002", schema=schema)
spark_sql = SparkSQL(schema=schema)
# This rebinds `llm`, which an earlier cell bound to the Gemini chat model.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
# Toolkit exposing the Spark SQL tools, and an agent that drives them.
toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
agent_executor = create_spark_sql_agent(llm=llm, toolkit=toolkit, verbose=True)

In [None]:
# Standard LangChain example: ask the Spark SQL agent to describe the table.
agent_executor.run("Describe the california_housing_train table")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: list_tables_sql_db
Action Input: ""[0m
Observation: [38;5;200m[1;3mcalifornia_housing_train[0m
Thought:[32;1m[1;3mI see that there is a table called "california_housing_train" in the database. I can use the "schema_sql_db" tool to get more information about this table.
Action: schema_sql_db
Action Input: "california_housing_train"[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.langchain_example.california_housing_train (
  longitude DOUBLE,
  latitude DOUBLE,
  housing_median_age DOUBLE,
  total_rooms DOUBLE,
  total_bedrooms DOUBLE,
  population DOUBLE,
  households DOUBLE,
  median_income DOUBLE,
  median_house_value DOUBLE)
;

/*
3 rows from california_housing_train table:
longitude	latitude	housing_median_age	total_rooms	total_bedrooms	population	households	median_income	median_house_value
-114.31	34.19	15.0	5612.0	1283.0	1015.0	472.0	1.4936	66900.0
-114.47	34.4	19.0	7650.0	1901.0	1129.0	463.0	1.82

'The "california_housing_train" table has the following columns: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, and median_house_value.'

In [None]:
# LangChain direct tool usage instead of toolkit example
# from langchain.tools.spark_sql.tool import (
#     InfoSparkSQLTool,
#     ListSparkSQLTool,
#     QueryCheckerTool,
#     QuerySparkSQLTool,
# )
# debug_toolkit = [
#   QuerySparkSQLTool(db=spark_sql),
#   InfoSparkSQLTool(db=spark_sql),
#   ListSparkSQLTool(db=spark_sql),
#   QueryCheckerTool(db=spark_sql, llm=llm),
# ]

In [None]:
# Now use AutoGen with the LangChain tool bridge
tools = []
function_map = {}

# Build one function schema per toolkit tool and map each advertised function
# name to the tool's `_run` implementation.
for tool in toolkit.get_tools():  # debug_toolkit if you want to use tools directly
    tool_schema = generate_llm_config(tool)
    print(tool_schema)
    tools.append(tool_schema)
    # Key by the schema name, not the raw tool name: `generate_llm_config`
    # normalises the name (lower-case, spaces -> "_"), and the registered key
    # has to match the name that is advertised in `llm_config["functions"]`.
    function_map[tool_schema["name"]] = tool._run

# Construct the llm_config
llm_config = {
    "functions": tools,
    "config_list": config_list,  # Assuming you have this defined elsewhere
    "timeout": 120,
}

# Proxy agent that executes function calls on behalf of the (absent) human.
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    # End the chat when a non-empty message ends with "TERMINATE"
    # (trailing whitespace ignored).
    is_termination_msg=lambda x: x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE"),
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config={
        "work_dir": "coding",
        "use_docker": False,
    },  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
)

print(function_map)

# Register the tools and start the conversation
user_proxy.register_function(function_map=function_map)

chatbot = autogen.AssistantAgent(
    name="chatbot",
    system_message="For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.",
    llm_config=llm_config,
)

user_proxy.initiate_chat(
    chatbot,
    message="Describe the table names california_housing_train",
    llm_config=llm_config,
)

{'name': 'query_sql_db', 'description': '\n    Input to this tool is a detailed and correct SQL query, output is a result from the Spark SQL.\n    If the query is not correct, an error message will be returned.\n    If an error is returned, rewrite the query, check the query, and try again.\n    ', 'parameters': {'type': 'object', 'properties': {'query': {'title': 'Query', 'type': 'string'}}, 'required': []}}
{'name': 'schema_sql_db', 'description': '\n    Input to this tool is a comma-separated list of tables, output is the schema and sample rows for those tables.\n    Be sure that the tables actually exist by calling list_tables_sql_db first!\n\n    Example Input: "table1, table2, table3"\n    ', 'parameters': {'type': 'object', 'properties': {'table_names': {'title': 'Table Names', 'type': 'string'}}, 'required': []}}
{'name': 'list_tables_sql_db', 'description': 'Input is an empty string, output is a comma separated list of tables in the Spark SQL.', 'parameters': {'type': 'object'