# 1. 🔧 Setup: Libraries, API Key, and Imports

In [1]:
# Core Libraries
import pandas as pd
import sqlite3
import io
import warnings

In [2]:
# Widgets for file upload
from IPython.display import display
import ipywidgets as widgets

In [3]:
# Gemini API Setup
from google import genai
from google.genai import types
%pip install google.api_core
from google.api_core import retry
from google.api_core.exceptions import GoogleAPIError
from google.generativeai.types import GenerationConfig


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
# 🔧 Setup: Load API Key + Gemini Configuration
import os
from dotenv import load_dotenv
%pip install google.generativeai
import google.generativeai as genai

# Load variables from a local .env file into the process environment
load_dotenv()

# Read the key from the environment so it is never hard-coded in the notebook
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Report a missing key here rather than as an opaque failure on the first API call
    warnings.warn("GOOGLE_API_KEY is not set; add it to your .env file before calling Gemini.")

# Configure Gemini
genai.configure(api_key=GOOGLE_API_KEY)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from google.api_core import retry

# Retry handler for transient API errors
_RETRIABLE_STATUS_CODES = {429, 503}


def is_retriable(e):
    """Return True for API errors worth retrying (HTTP 429 or 503)."""
    # `genai` is google.generativeai at this point, which reports failures as
    # google.api_core exceptions and has no `errors.APIError`; match on the
    # api_core base class and read the status code defensively.
    return isinstance(e, GoogleAPIError) and getattr(e, "code", None) in _RETRIABLE_STATUS_CODES


# Wrap generate_content only once so re-running this cell does not stack retry layers.
if not getattr(genai.GenerativeModel.generate_content, "_retry_wrapped", False):
    genai.GenerativeModel.generate_content = retry.Retry(
        predicate=is_retriable
    )(genai.GenerativeModel.generate_content)
    genai.GenerativeModel.generate_content._retry_wrapped = True


# 2. 📂 Load a Dataset (Kaggle OR Upload your own CSV)

## Loading Kaggle data

In [None]:
# Default dataset: Amazon sales data (Kaggle)
data = pd.read_csv('amazon_sales_data 2025.csv')
# Preview only the first rows rather than rendering the whole frame
data.head()

## Loading input data

> 👉 **Tip**: You can upload your own `.csv` file using the widget below. If you skip this step, the notebook will use a default Amazon Sales dataset.


In [6]:
# Upload CSV widget: created once so the instance the user sees is the same
# one whose `.value` is read by handle_upload() in the next cell.
upload = widgets.FileUpload(accept='.csv', multiple=False)

# Ask user if they want to upload a CSV
use_upload = input("📤 Do you want to upload your own CSV file? (y/n): ").strip().lower()

if use_upload == 'y':
    # Show the upload widget (only when it is actually needed)
    display(upload)
    print("📤 Please upload your file using the widget above, then run the next cell.")
else:
    print("📂 Skipping upload. We'll use the default dataset instead.")


FileUpload(value=(), accept='.csv', description='Upload')

FileUpload(value=(), accept='.csv', description='Upload')

📤 Please upload your file using the widget above, then run the next cell.


In [7]:
def handle_upload():
    """Read the first file from the notebook-level `upload` widget into a DataFrame.

    Column names are stripped of surrounding whitespace and inner spaces are
    replaced with underscores. Both `FileUpload.value` layouts are accepted:
    a tuple of file mappings (ipywidgets 8) and a dict keyed by filename whose
    values are file mappings (ipywidgets 7).

    Returns:
        The parsed DataFrame, or None when no `upload` widget exists or
        nothing has been uploaded.
    """
    widget = globals().get('upload')
    if widget is None or not widget.value:
        return None  # No file uploaded

    uploaded = widget.value
    # Normalise both layouts to an iterable of mappings that carry 'content'.
    file_infos = uploaded.values() if isinstance(uploaded, dict) else uploaded
    first_file = next(iter(file_infos))  # the widget is single-file; use the first entry
    df = pd.read_csv(io.BytesIO(first_file['content']))
    df.columns = [col.strip().replace(" ", "_") for col in df.columns]
    return df

# Prefer the uploaded file; fall back to the bundled CSV when there is none.
df = handle_upload()

if df is None:
    print("⚠️ No user file uploaded — using default Kaggle dataset instead.")
    # Same header clean-up as handle_upload: trim, then spaces -> underscores
    df = pd.read_csv('amazon_sales_data 2025.csv').rename(
        columns=lambda name: name.strip().replace(" ", "_")
    )

df.head()


  df = pd.read_csv(io.BytesIO(content))


Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Years_in_current_job,Home_Ownership,Annual_Income,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
0,6cf51492-02a2-423e-b93d-676f05b9ad53,7c202b37-2add-44e8-9aea-d5b119aea935,Charged Off,12232,Short Term,7280.0,< 1 year,Rent,46643.0,Debt Consolidation,777.39,18.0,10.0,12,0,6762,7946,0.0,0.0
1,552e7ade-4292-4354-9ff9-c48031697d72,e7217b0a-07ac-47dd-b379-577b5a35b7c6,Charged Off,25014,Long Term,7330.0,10+ years,Home Mortgage,81099.0,Debt Consolidation,892.09,26.7,,14,0,35706,77961,0.0,0.0
2,9b5e32b3-8d76-4801-afc8-d729d5a2e6b9,0a62fc41-16c8-40b5-92ff-9e4b763ce714,Charged Off,16117,Short Term,7240.0,9 years,Home Mortgage,60438.0,Home Improvements,1244.02,16.7,32.0,11,1,11275,14815,1.0,0.0
3,5419b7c7-ac11-4be2-a8a7-b131fb6d6dbe,30f36c59-5182-4482-8bbb-5b736849ae43,Charged Off,11716,Short Term,7400.0,3 years,Rent,34171.0,Debt Consolidation,990.94,10.0,,21,0,7009,43533,0.0,0.0
4,1450910f-9495-4fc9-afaf-9bdf4b9821df,70c26012-bba5-42c0-8dcb-75295ada31bb,Charged Off,9789,Long Term,6860.0,10+ years,Home Mortgage,47003.0,Home Improvements,503.71,16.7,25.0,13,1,16913,19553,1.0,0.0


# 3. 🧠 Ask AI: What Should We Explore?

## Prompt for the Kaggle dataset

In [None]:
# Generation settings: cap the reply at 200 output tokens
short_config = GenerationConfig(max_output_tokens=200)

# Gemini model that will suggest analysis questions
model = genai.GenerativeModel("gemini-2.0-flash")

# First five rows of the default dataset, rendered as a markdown table for the prompt
sample = data.head(5).to_markdown()
prompt = f"""
Here is a few rows of our dataset:

{sample}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Send the prompt and print the model's text reply
response = model.generate_content(prompt, generation_config=short_config)
print(response.text)


## Prompt for the input data

In [10]:
# Generation settings: cap the reply at 200 output tokens
short_config = GenerationConfig(max_output_tokens=200)

# Gemini model that will suggest analysis questions
model = genai.GenerativeModel("gemini-2.0-flash")

# First five rows of the loaded frame, rendered as a markdown table for the prompt
sample_1 = df.head(5).to_markdown()
prompt_1 = f"""
Here is a few rows of our dataset:

{sample_1}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Send the prompt and print the model's text reply
response = model.generate_content(prompt_1, generation_config=short_config)
print(response.text)


Okay, based on the sample data, here's a breakdown of potentially useful questions to explore in further data analysis, categorized for clarity:

**I. Loan Performance & Risk Factors:**

*   **Overall Loan Success:**
    *   What is the overall percentage of loans that are "Charged Off" vs. those with other statuses (assuming there are other statuses besides 'Charged Off')? This gives a baseline default rate.
    *   How does the loan status distribution vary across different years or time periods (if you have a loan origination date)? Are default rates increasing or decreasing?

*   **Credit Score Impact:**
    *   Is there a clear correlation between credit score and loan status? Are lower credit scores strongly associated with "Charged Off" loans?
    *   What is the distribution of credit scores for "Charged Off" vs. other loan statuses? Are there distinct clusters or patterns?
    *   What is the average credit score for each


# 4. 🧾 Generate & Run SQL Queries from Natural Language

In [12]:
# Schema description helper shared by the Kaggle dataset and the user-uploaded CSV:
def describe_table(conn, table_name: str):
    """Return the column names and declared types of a table.

    Args:
        conn: Open sqlite3 connection.
        table_name: Table to inspect. It is quoted before use, so names
            containing spaces or punctuation are handled.

    Returns:
        A list of (column_name, declared_type) tuples, in the order reported
        by `PRAGMA table_info`.
    """
    # Quote the identifier (doubling any embedded quotes) rather than
    # interpolating the raw name into the statement.
    quoted = '"' + table_name.replace('"', '""') + '"'
    cursor = conn.cursor()
    cursor.execute(f"PRAGMA table_info({quoted});")
    # Index 1 is the column name and index 2 its declared type.
    return [(col[1], col[2]) for col in cursor.fetchall()]

In [11]:
# Query helper shared by the Kaggle dataset and the user-uploaded CSV:
def execute_query(conn, sql: str) -> list[tuple]:
    """Run a SQL statement on `conn` and return all result rows.

    Args:
        conn: Open sqlite3 connection.
        sql: SQL text to execute; it is echoed to stdout before running.

    Returns:
        Every row from `fetchall()`, each as a tuple (empty list if no rows).
    """
    print(f' - DB CALL: execute_query({sql})')
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the statement fails.
        cursor.close()

## Running SQL queries on the Kaggle dataset

In [None]:
# Connection to the SQLite file that holds the default dataset
kaggle_conn = sqlite3.connect("sample.db")
# SQL-friendly headers: trim whitespace, then spaces -> underscores
data.columns = list(map(lambda name: name.strip().replace(" ", "_"), data.columns))
# Store the frame as table "data", replacing any earlier copy
data.to_sql(name="data", con=kaggle_conn, if_exists="replace", index=False)

In [None]:
# List (column name, declared type) pairs for the "data" table
describe_table(kaggle_conn, "data")

In [None]:
# Example query: every row of "data" whose Category equals 'Footwear'
execute_query(kaggle_conn, "select * from data where Category == 'Footwear'")

## Running SQL queries on the input data

In [13]:
# Connection to the SQLite file that holds the uploaded (or fallback) dataset
user_conn = sqlite3.connect("sample_1.db")
# SQL-friendly headers: trim whitespace, then spaces -> underscores
df.columns = list(map(lambda name: name.strip().replace(" ", "_"), df.columns))
# Store the frame as table "df", replacing any earlier copy
df.to_sql(name="df", con=user_conn, if_exists="replace", index=False)

111107

In [14]:
# List (column name, declared type) pairs for the "df" table
describe_table(user_conn, "df")

[('Loan_ID', 'TEXT'),
 ('Customer_ID', 'TEXT'),
 ('Loan_Status', 'TEXT'),
 ('Current_Loan_Amount', 'INTEGER'),
 ('Term', 'TEXT'),
 ('Credit_Score', 'REAL'),
 ('Years_in_current_job', 'TEXT'),
 ('Home_Ownership', 'TEXT'),
 ('Annual_Income', 'REAL'),
 ('Purpose', 'TEXT'),
 ('Monthly_Debt', 'TEXT'),
 ('Years_of_Credit_History', 'REAL'),
 ('Months_since_last_delinquent', 'REAL'),
 ('Number_of_Open_Accounts', 'INTEGER'),
 ('Number_of_Credit_Problems', 'INTEGER'),
 ('Current_Credit_Balance', 'INTEGER'),
 ('Maximum_Open_Credit', 'TEXT'),
 ('Bankruptcies', 'REAL'),
 ('Tax_Liens', 'REAL')]

In [15]:
# Status lines naming the SQLite table used for each dataset
print(
    "📂 Default dataset loaded into SQLite as 'data'",
    "📂 User-uploaded dataset loaded into SQLite as 'df'",
    sep="\n",
)

📂 Default dataset loaded into SQLite as 'data'
📂 User-uploaded dataset loaded into SQLite as 'df'


# 5. Agents

In [16]:
from tabulate import tabulate
import google.generativeai as genai

# GOOGLE_API_KEY must already be defined by the setup section above
genai.configure(api_key=GOOGLE_API_KEY)

# Shared generation settings: limit each reply to 200 output tokens
short_config = genai.types.GenerationConfig(max_output_tokens=200)

# Shared model handle used by agent_loop
model = genai.GenerativeModel("gemini-2.0-flash")

In [17]:
# Agent loop shared by the Kaggle dataset and the user-uploaded CSV:
def agent_loop(df, conn, table_name="data", max_rows=20):
    """Interactive loop: turn each typed question into SQL, run it, and summarize the rows.

    Relies on the notebook-level `model`, `short_config`, and `execute_query`.

    Args:
        df: Object whose `.columns` are listed in the schema sent to the model.
        conn: Database connection handed to `execute_query`.
        table_name: Table name shown to the model in the schema.
        max_rows: Maximum number of rows printed and embedded in the summary
            prompt; pass None to disable the cap.

    Returns:
        None. The loop ends when the user types 'exit'.
    """
    print("🔍 Ask a question about the dataset (or type 'exit'):")

    # Dynamically build schema from df
    schema = f"Table: {table_name}\nColumns:\n"
    for col in df.columns:
        schema += f"- {col}\n"

    # Start interaction loop
    while True:
        user_input = input("\n🧍 You: ")
        # Tolerate surrounding whitespace so " exit " also ends the session
        if user_input.strip().lower() == "exit":
            break

        # Prompt for SQL
        prompt = f"""
        You are a helpful assistant that answers data questions by generating SQL queries.
        Here is the table schema:
        {schema}
        Question: {user_input}
        Only respond with a valid SQL query.
        """

        try:
            # Generation sits inside the try so an API failure ends this turn, not the loop.
            response = model.generate_content(prompt, generation_config=short_config)
            sql = response.text.strip().replace("```sql", "").replace("```", "").strip()
            print(f"\n🧾 Cleaned SQL:\n{sql}")

            # NOTE(review): the model-written SQL is executed as-is; point `conn`
            # at a disposable database if write statements are a concern.
            result = execute_query(conn, sql)

            # Cap what is printed and sent back to the model; a None cap keeps every row.
            shown = result[:max_rows]
            hidden = len(result) - len(shown)

            print("\n📊 Query Results:")
            for row in shown:
                print(row)
            if hidden:
                print(f"… ({hidden} more rows not shown)")

            # Explanation
            result_text = f"{shown}"
            if hidden:
                # Tell the model it is only seeing part of the result set
                result_text += f"\n            (Showing the first {len(shown)} of {len(result)} rows.)"
            summary_prompt = f"""
            Here is the result of the SQL query:
            {result_text}
            Explain this result in plain English for a data analyst.
            """
            summary = model.generate_content(summary_prompt, generation_config=short_config)
            print(f"\n🗣️ Summary:\n{summary.text}")
        except Exception as e:
            print(f"❌ Error: {e}")


## 5. 🤖 Agent Mode: Chat with Your Data (Kaggle data and input data)

In [None]:
# Start the interactive agent on the default dataset (table name defaults to "data")
agent_loop(data, kaggle_conn)

In [18]:
# Start the interactive agent on the user-uploaded dataset (type 'exit' to stop).
agent_loop(df, user_conn, table_name="df")

🔍 Ask a question about the dataset (or type 'exit'):

🧾 Cleaned SQL:
SELECT * FROM df WHERE Loan_Status = 'Charged Off'
 - DB CALL: execute_query(SELECT * FROM df WHERE Loan_Status = 'Charged Off')

📊 Query Results:
('6cf51492-02a2-423e-b93d-676f05b9ad53', '7c202b37-2add-44e8-9aea-d5b119aea935', 'Charged Off', 12232, 'Short Term', 7280.0, '< 1 year', 'Rent', 46643.0, 'Debt Consolidation', '777.39', 18.0, 10.0, 12, 0, 6762, '7946', 0.0, 0.0)
('552e7ade-4292-4354-9ff9-c48031697d72', 'e7217b0a-07ac-47dd-b379-577b5a35b7c6', 'Charged Off', 25014, 'Long Term', 7330.0, '10+ years', 'Home Mortgage', 81099.0, 'Debt Consolidation', '892.09', 26.7, None, 14, 0, 35706, '77961', 0.0, 0.0)
('9b5e32b3-8d76-4801-afc8-d729d5a2e6b9', '0a62fc41-16c8-40b5-92ff-9e4b763ce714', 'Charged Off', 16117, 'Short Term', 7240.0, '9 years', 'Home Mortgage', 60438.0, 'Home Improvements', '1244.02', 16.7, 32.0, 11, 1, 11275, '14815', 1.0, 0.0)
('5419b7c7-ac11-4be2-a8a7-b131fb6d6dbe', '30f36c59-5182-4482-8bbb-5b736849ae

KeyboardInterrupt: 

In [None]:
# Uncomment the line below to run a query against the user-uploaded dataset.
# execute_query(user_conn, "SELECT COUNT(*) FROM df")