# Tabular Data Interrogation Investigation

In [None]:
%load_ext dotenv
%dotenv ../.env

In [None]:
import pandas as pd
import os

In [None]:
# NOTEBOOK CONFIGURATION - Update the below as required
# (plain Python variables used by the cells below, not OS environment variables)
# Local path the dataset is read from; also the download target when the file is missing.
file_path = "../data/titanic.csv"
# Source the dataset is fetched from when file_path does not exist.
DOWNLOAD_URL = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
# SQLite database file; interpolated into the "sqlite:///..." engine URL.
DB_LOC = "titanic.db"
# Name of the table the DataFrame is written to.
TABLE_NAME = "titanic"

In [None]:
if not os.path.exists(file_path):
    import subprocess

    # The dataset is not available locally, so try to fetch it from
    # DOWNLOAD_URL with curl. This is best-effort: failures are reported, not raised.
    try:
        # curl's "-o" does not create missing parent directories on its own.
        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # --fail: exit non-zero on HTTP errors instead of saving the server's
        #         error page as if it were the CSV.
        # --location: follow redirects to the final resource.
        subprocess.run(
            ["curl", "--fail", "--location", "-o", file_path, DOWNLOAD_URL],
            check=True,
        )
        print(f"Downloaded file {os.path.basename(file_path)} successfully")
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: curl exited non-zero; OSError: curl is not
        # installed or the target directory could not be created.
        print(f"Error downloading file {e}")
        # Remove any partial download so a re-run retries instead of
        # silently reading a truncated file.
        if os.path.exists(file_path):
            os.remove(file_path)



In [None]:
def ingest_file(file_path: str) -> pd.DataFrame:
    """Load a CSV or Excel file into a pandas DataFrame.

    The reader is chosen from the file extension, which is compared
    case-insensitively so names such as ``DATA.CSV`` are accepted.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file.

    Returns:
        The parsed file contents. Excel files are read with the default
        ``pd.read_excel`` settings.

    Raises:
        TypeError: If the extension is not one of the supported ones.
        Errors raised by the pandas readers propagate unchanged.
    """
    lowered = file_path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(file_path)
    if lowered.endswith((".xls", ".xlsx")):
        return pd.read_excel(file_path)  # TODO Modify to handle multiple sheets
    raise TypeError("Only csvs and excel files are accepted")

# Load the dataset at file_path into a DataFrame; a later cell writes it to the SQL table.
df = ingest_file(file_path)

In [None]:
from langchain_community.utilities import SQLDatabase
from sqlalchemy import create_engine, inspect


# Build a SQLAlchemy engine for the SQLite file named by DB_LOC.
engine = create_engine(f"sqlite:///{DB_LOC}")
inspector = inspect(engine)

# Write the DataFrame only when the table is not there yet, so re-running
# this cell against an existing database leaves the table untouched.
table_already_loaded = TABLE_NAME in inspector.get_table_names()
if not table_already_loaded:
    df.to_sql(name=TABLE_NAME, con=engine, index=False)

In [None]:
# Wrap the engine in LangChain's SQLDatabase helper and print a few
# details as a quick sanity check.
db = SQLDatabase(engine=engine)
print(db.dialect)
print(db.get_usable_table_names())
# Query the configured table instead of a hard-coded name so this check keeps
# working when TABLE_NAME is changed. The Age/Pclass columns still assume the
# Titanic dataset.
print(db.run(f'SELECT * FROM "{TABLE_NAME}" WHERE Age > 25 AND Pclass=1;'))

In [None]:
from redbox.models.settings import Settings
from langchain.chat_models import init_chat_model

# Instantiate the project settings object.
env = Settings()

# Create the Bedrock-hosted chat model once and expose it under both
# names (`llm` and `model`), as the original chained assignment did.
llm = init_chat_model(
    model="anthropic.claude-3-sonnet-20240229-v1:0",
    model_provider="bedrock",
)
model = llm

In [None]:
# Import both helpers from langchain_community: the older
# `langchain.agents.agent_toolkits` path for SQLDatabaseToolkit is deprecated.
from langchain_community.agent_toolkits import SQLDatabaseToolkit, create_sql_agent

# The toolkit supplies the SQL tools the agent uses against `db`.
toolkit = SQLDatabaseToolkit(db=db, llm=model)

# verbose=True makes the executor print its intermediate steps.
agent_executor = create_sql_agent(model, toolkit=toolkit, verbose=True)

In [None]:
# agent_executor.run( "How many women survived?")

In [None]:
# Example query
# The payload must be a dict keyed by "input"; `{"input", "..."}` (comma
# instead of colon) is a set literal and does not carry a named input.
agent_executor.invoke({"input": "What proportion of survivors were in class 1?"})

In [None]:
sample_query = "How many of the non-survivors were in their thirtees?"
# Pass a dict (the original built a set with a comma instead of a colon) and
# use invoke(), since calling the executor object directly is deprecated.
response = agent_executor.invoke({"input": sample_query})["output"]
print(response)