# 1. 🔧 Setup: Libraries, API Key, and Imports

In [1]:
# Core Libraries
import pandas as pd
import sqlite3
import io
import warnings

In [2]:
# Widgets for file upload
from IPython.display import display
import ipywidgets as widgets

In [3]:
# Gemini API Setup
from google import genai
from google.genai import types
%pip install google.api_core
from google.api_core import retry
from google.api_core.exceptions import GoogleAPIError
from google.generativeai.types import GenerationConfig


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
# 🔧 Setup: Load API Key + Gemini Configuration
import os
from dotenv import load_dotenv
%pip install google.generativeai
import google.generativeai as genai

# Load variables from a local .env file (if present) into the process environment
load_dotenv()

# Read the key from the environment so it is never hard-coded in the notebook
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast with an actionable message: os.getenv returns None when the
# variable is missing, and that None would otherwise be handed to
# genai.configure unchanged.
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Configure Gemini
genai.configure(api_key=GOOGLE_API_KEY)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from google.api_core import exceptions as api_exceptions
from google.api_core import retry

# Retry handler for transient API errors.
# `genai` in this notebook is `google.generativeai`, which has no
# `genai.errors` module (that belongs to the separate `google-genai` SDK), so
# the predicate matches on the google.api_core exception classes instead.
is_retriable = retry.if_exception_type(
    api_exceptions.TooManyRequests,     # HTTP 429 (rate limit / quota)
    api_exceptions.ServiceUnavailable,  # HTTP 503 (temporary outage)
)

# Patch generate_content only once: re-running this cell must not stack a new
# retry layer on top of an already wrapped method.
if not getattr(genai.GenerativeModel.generate_content, "_retry_wrapped", False):
    _generate_with_retry = retry.Retry(predicate=is_retriable)(
        genai.GenerativeModel.generate_content
    )
    _generate_with_retry._retry_wrapped = True
    genai.GenerativeModel.generate_content = _generate_with_retry


# 2. 📂 Load a Dataset (Kaggle OR Upload your own CSV)

## Loading the Kaggle data

In [6]:
# Default dataset: Amazon sales CSV, read from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='amazon_sales_data 2025.csv')
# Leaving the frame as the cell's final expression renders it below.
data

Unnamed: 0,Order ID,Date,Product,Category,Price,Quantity,Total Sales,Customer Name,Customer Location,Payment Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending
...,...,...,...,...,...,...,...,...,...,...,...
245,ORD0246,17-03-25,T-Shirt,Clothing,20,2,40,Daniel Harris,Miami,Debit Card,Cancelled
246,ORD0247,30-03-25,Jeans,Clothing,40,1,40,Sophia Miller,Dallas,Debit Card,Cancelled
247,ORD0248,05-03-25,T-Shirt,Clothing,20,2,40,Chris White,Denver,Debit Card,Cancelled
248,ORD0249,08-03-25,Smartwatch,Electronics,150,3,450,Emily Johnson,New York,Debit Card,Cancelled


## Loading input data

> 👉 **Tip**: You can upload your own `.csv` file using the widget below. If you skip this step, the notebook will use a default Amazon Sales dataset.


In [7]:
# Upload CSV widget.
# Reuse the widget when this cell has already been run: creating a fresh one
# would discard any file the user has just selected. Workflow: run the cell,
# pick a file in the widget, then run the cell again to load it.
if not isinstance(globals().get("upload"), widgets.FileUpload):
    upload = widgets.FileUpload(accept='.csv', multiple=False)
display(upload)


def _normalize_columns(frame):
    """Strip column names and replace spaces with underscores (in place); return ``frame``."""
    frame.columns = [col.strip().replace(" ", "_") for col in frame.columns]
    return frame


# Handle upload
def handle_upload():
    """Read the CSV held by the ``upload`` widget into a DataFrame.

    Returns:
        The parsed DataFrame with normalized column names, or ``None`` when no
        file has been uploaded.
    """
    uploaded = upload.value
    if not uploaded:
        return None  # No file uploaded

    # ipywidgets 7 exposes a {filename: file_info} dict, ipywidgets 8 a tuple
    # of file_info records; in both cases the raw bytes sit under "content".
    if isinstance(uploaded, dict):
        records = list(uploaded.values())
    else:
        records = list(uploaded)
    content = records[0]["content"]
    return _normalize_columns(pd.read_csv(io.BytesIO(content)))


# Try user upload
df = handle_upload()

# Fallback if nothing is uploaded
if df is None:
    print("⚠️ No user file uploaded — using default Kaggle dataset instead.")
    df = _normalize_columns(pd.read_csv('amazon_sales_data 2025.csv'))

df.head()

FileUpload(value=(), accept='.csv', description='Upload')

⚠️ No user file uploaded — using default Kaggle dataset instead.


Unnamed: 0,Order_ID,Date,Product,Category,Price,Quantity,Total_Sales,Customer_Name,Customer_Location,Payment_Method,Status
0,ORD0001,14-03-25,Running Shoes,Footwear,60,3,180,Emma Clark,New York,Debit Card,Cancelled
1,ORD0002,20-03-25,Headphones,Electronics,100,4,400,Emily Johnson,San Francisco,Debit Card,Pending
2,ORD0003,15-02-25,Running Shoes,Footwear,60,2,120,John Doe,Denver,Amazon Pay,Cancelled
3,ORD0004,19-02-25,Running Shoes,Footwear,60,3,180,Olivia Wilson,Dallas,Credit Card,Pending
4,ORD0005,10-03-25,Smartwatch,Electronics,150,3,450,Emma Clark,New York,Debit Card,Pending


# 3. 🧠 Ask AI: What Should We Explore?

## Prompt for the Kaggle dataset

In [8]:
# Ask Gemini for analysis ideas based on a small preview of the Kaggle data.

# Generation settings: cap the reply at 200 output tokens
short_config = GenerationConfig(max_output_tokens=200)

# First five rows rendered as a Markdown table (pandas needs `tabulate` for this)
sample = data.head(n=5).to_markdown()
prompt = f"""
Here is a few rows of our dataset:

{sample}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Model handle
model = genai.GenerativeModel("gemini-2.0-flash")

# Send the prompt and show the plain-text answer
response = model.generate_content(prompt, generation_config=short_config)
print(response.text)


Okay, based on the provided dataset columns, here are some useful questions we can ask during further data analysis. These questions are grouped by category for clarity:

**Sales Performance & Trends:**

*   **Overall Sales:**
    *   What is the total revenue generated?
    *   What is the average order value?
    *   What is the distribution of order values? (Are there many small orders and few large ones, or is it more balanced?)
*   **Time-Based Analysis:**
    *   What are the monthly/weekly/daily sales trends?  Is there seasonality?
    *   How do sales fluctuate over time? Are there any significant peaks or dips?
    *   Are there any specific dates or periods with unusually high or low sales?  Why?
*   **Product Performance:**
    *   Which products generate the most revenue?
    *   Which products are most frequently ordered?
    *   What is


In [9]:
%pip install tabulate

# Create the model
model = genai.GenerativeModel("gemini-2.0-flash")

# Use the model for the prompt:
sample = data.head(5).to_markdown()
prompt = f"""
Here is a few rows of our dataset:

{sample}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Config
short_config = GenerationConfig(max_output_tokens=200)

# Generate
response = model.generate_content(prompt, generation_config=short_config)
print(response.text)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Okay, based on the provided dataset columns, here are some useful questions to ask during further data analysis, categorized for clarity:

**I. Sales Performance & Trends:**

*   **Overall Sales:**
    *   What is the total revenue generated?
    *   What is the average order value?
    *   What are the minimum and maximum order values?
    *   How are sales distributed across different regions?
*   **Temporal Analysis:**
    *   What are the monthly, quarterly, or yearly sales trends? (Requires analyzing the 'Date' column)
    *   Are there any seasonal patterns in sales?
    *   Has the number of orders changed over the period shown in the data?
*   **Product Performance:

## Prompt for the input data

In [10]:
# Prompt built from the input dataset (`df`): the uploaded CSV, or the default
# data when nothing was uploaded.
sample_1 = df.head(5).to_markdown()

prompt_1 = f"""
Here is a few rows of our dataset:

{sample_1}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Config: cap the reply at 200 output tokens
short_config = GenerationConfig(max_output_tokens=200)

# Generate. Send prompt_1 (not the Kaggle `prompt` from the earlier cell) so
# the suggestions are based on `df`.
response = model.generate_content(prompt_1, generation_config=short_config)
print(response.text)


Okay, based on the sample data, here are some useful questions to ask during further data analysis, categorized for clarity:

**Sales Performance & Trends:**

*   **Overall Sales Performance:**
    *   What is the total revenue generated across the entire dataset?
    *   What is the average order value?
    *   What is the distribution of order values? (Are there many small orders or a few large ones?)
    *   How many orders were placed in total?

*   **Temporal Trends:**
    *   What are the sales trends over time (e.g., daily, weekly, monthly)? Is there seasonality?
    *   Which months or periods have the highest/lowest sales?
    *   Has sales performance improved or declined over time?

*   **Product Performance:**
    *   Which products generate the most revenue?
    *   Which products have the highest quantity sold?
    *   What is the average


# 4. 🧾 Generate & Run SQL Queries from Natural Language

In [11]:
# Description function for both the Kaggle and the user-supplied CSV:
def describe_table(conn, table_name: str) -> list[tuple[str, str]]:
    """List the columns of a SQLite table.

    Args:
        conn: Open ``sqlite3`` connection.
        table_name: Name of the table to inspect.

    Returns:
        One ``(column_name, declared_type)`` tuple per column, in table order.
        The list is empty when the table does not exist.
    """
    # Quote the identifier (doubling embedded quotes) so names containing
    # spaces or punctuation work and cannot break out of the PRAGMA statement.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor = conn.cursor()
    try:
        cursor.execute(f"PRAGMA table_info({quoted_name});")
        # table_info rows are (cid, name, type, notnull, dflt_value, pk)
        return [(col[1], col[2]) for col in cursor.fetchall()]
    finally:
        cursor.close()

In [12]:
# Query function for both the Kaggle and the user-supplied CSV:
def execute_query(conn, sql: str) -> list[tuple]:
    """Run a SQL statement and return every result row.

    Args:
        conn: Open ``sqlite3`` connection.
        sql: SQL text to execute.

    Returns:
        The rows from ``fetchall()``: a list of tuples whose items keep their
        SQLite types. The list is empty when nothing matches.

    Raises:
        sqlite3.Error: If SQLite rejects the statement.
    """
    # Echo the statement so every database call is visible in the cell output
    print(f' - DB CALL: execute_query({sql})')
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the statement fails
        cursor.close()

## Running SQL queries on the Kaggle dataset

In [13]:
# Store the Kaggle frame in the on-disk SQLite file "sample.db" as table "data".
kaggle_conn = sqlite3.connect("sample.db")
# SQL-friendly column names: trim whitespace, turn spaces into underscores
data.columns = data.columns.map(lambda name: name.strip().replace(" ", "_"))
# Replace any table left over from an earlier run; the DataFrame index is not written
data.to_sql(name="data", con=kaggle_conn, if_exists="replace", index=False)

250

In [14]:
# Show the (column, type) pairs SQLite recorded for the Kaggle table
describe_table(conn=kaggle_conn, table_name="data")

[('Order_ID', 'TEXT'),
 ('Date', 'TEXT'),
 ('Product', 'TEXT'),
 ('Category', 'TEXT'),
 ('Price', 'INTEGER'),
 ('Quantity', 'INTEGER'),
 ('Total_Sales', 'INTEGER'),
 ('Customer_Name', 'TEXT'),
 ('Customer_Location', 'TEXT'),
 ('Payment_Method', 'TEXT'),
 ('Status', 'TEXT')]

In [15]:
# Sanity check: fetch every Footwear order from the Kaggle table
execute_query(conn=kaggle_conn, sql="select * from data where Category == 'Footwear'")

 - DB CALL: execute_query(select * from data where Category == 'Footwear')


[('ORD0001',
  '14-03-25',
  'Running Shoes',
  'Footwear',
  60,
  3,
  180,
  'Emma Clark',
  'New York',
  'Debit Card',
  'Cancelled'),
 ('ORD0003',
  '15-02-25',
  'Running Shoes',
  'Footwear',
  60,
  2,
  120,
  'John Doe',
  'Denver',
  'Amazon Pay',
  'Cancelled'),
 ('ORD0004',
  '19-02-25',
  'Running Shoes',
  'Footwear',
  60,
  3,
  180,
  'Olivia Wilson',
  'Dallas',
  'Credit Card',
  'Pending'),
 ('ORD0019',
  '22-03-25',
  'Running Shoes',
  'Footwear',
  60,
  3,
  180,
  'Olivia Wilson',
  'Houston',
  'Credit Card',
  'Completed'),
 ('ORD0046',
  '06-03-25',
  'Running Shoes',
  'Footwear',
  60,
  2,
  120,
  'David Lee',
  'Houston',
  'Debit Card',
  'Cancelled'),
 ('ORD0053',
  '24-03-25',
  'Running Shoes',
  'Footwear',
  60,
  4,
  240,
  'Emily Johnson',
  'Los Angeles',
  'PayPal',
  'Completed'),
 ('ORD0079',
  '09-03-25',
  'Running Shoes',
  'Footwear',
  60,
  2,
  120,
  'Emily Johnson',
  'Denver',
  'Gift Card',
  'Cancelled'),
 ('ORD0080',
  '23-02

## Running SQL Queries from the input data

In [16]:
# Store the input frame in the on-disk SQLite file "sample_1.db" as table "df".
user_conn = sqlite3.connect("sample_1.db")
# SQL-friendly column names: trim whitespace, turn spaces into underscores
df.columns = df.columns.map(lambda name: name.strip().replace(" ", "_"))
# Replace any table left over from an earlier run; the DataFrame index is not written
df.to_sql(name="df", con=user_conn, if_exists="replace", index=False)

250

In [17]:
# Show the (column, type) pairs SQLite recorded for the input-data table
describe_table(conn=user_conn, table_name="df")

[('Order_ID', 'TEXT'),
 ('Date', 'TEXT'),
 ('Product', 'TEXT'),
 ('Category', 'TEXT'),
 ('Price', 'INTEGER'),
 ('Quantity', 'INTEGER'),
 ('Total_Sales', 'INTEGER'),
 ('Customer_Name', 'TEXT'),
 ('Customer_Location', 'TEXT'),
 ('Payment_Method', 'TEXT'),
 ('Status', 'TEXT')]

In [18]:
# Status recap of the two SQLite tables created above.
# Note: `df` holds the default data when no file was uploaded.
print(
    "📂 Default dataset loaded into SQLite as 'data'",
    "📂 User-uploaded dataset loaded into SQLite as 'df'",
    sep="\n",
)

📂 Default dataset loaded into SQLite as 'data'
📂 User-uploaded dataset loaded into SQLite as 'df'


# 5. Agents

In [19]:
from tabulate import tabulate
import google.generativeai as genai

# The API key must be configured before a model is built
genai.configure(api_key=GOOGLE_API_KEY)

# Global config: cap every reply at 200 output tokens
short_config = genai.types.GenerationConfig(max_output_tokens=200)

# Model instance shared by the agent loop
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

In [20]:
# Agent function for both the Kaggle and the user-supplied CSV:
def agent_loop(df, conn, table_name="data"):
    """Interactive loop: question -> generated SQL -> query results -> summary.

    Reads questions with ``input()`` until the user types ``exit``. Each
    question is turned into SQL by the notebook-level ``model`` (using
    ``short_config``), run through ``execute_query`` and then summarized by
    the model.

    Args:
        df: DataFrame whose column names are shown to the model as the schema.
        conn: Open ``sqlite3`` connection that holds the table.
        table_name: Name of the SQLite table to query.
    """
    import re  # local import keeps this cell self-contained

    print("🔍 Ask a question about the dataset (or type 'exit'):")

    # Dynamically build schema from df
    schema = f"Table: {table_name}\nColumns:\n" + "".join(
        f"- {col}\n" for col in df.columns
    )

    # Start interaction loop
    while True:
        user_input = input("\n🧍 You: ").strip()
        if not user_input:
            continue  # nothing typed: ask again instead of calling the API
        if user_input.lower() == "exit":
            break

        # Prompt for SQL
        prompt = f"""
        You are a helpful assistant that answers data questions by generating SQL queries.
        Here is the table schema:
        {schema}
        Question: {user_input}
        Only respond with a valid SQL query.
        """

        # Model calls are inside the try block so that an API error or a
        # reply without text is reported and the loop continues.
        try:
            response = model.generate_content(prompt, generation_config=short_config)
            # Drop Markdown code fences (``` or ```sql, any letter case)
            sql = re.sub(r"```(?:sql)?", "", response.text, flags=re.IGNORECASE).strip()
            print(f"\n🧾 Cleaned SQL:\n{sql}")

            # NOTE(security): the model-written SQL is executed as-is, so it
            # can modify or drop tables. Use a disposable database.
            result = execute_query(conn, sql)
            print("\n📊 Query Results:")
            for row in result:
                print(row)

            # Explanation: include the question and the SQL so the model can
            # say what the numbers mean instead of guessing.
            summary_prompt = f"""
            The user asked: {user_input}
            This SQL query was run to answer the question:
            {sql}
            Here is the result of the SQL query:
            {result}
            Explain this result in plain English for a data analyst.
            """
            summary = model.generate_content(summary_prompt, generation_config=short_config)
            print(f"\n🗣️ Summary:\n{summary.text}")
        except Exception as e:
            print(f"❌ Error: {e}")


## 5. 🤖 Agent Mode: Chat with Your Data (Kaggle Data and Input Data)

In [21]:
# Chat with the Kaggle dataset; the table name is spelled out for clarity
agent_loop(df=data, conn=kaggle_conn, table_name="data")

🔍 Ask a question about the dataset (or type 'exit'):

🧾 Cleaned SQL:
SELECT sum(Total_Sales) FROM data;
 - DB CALL: execute_query(SELECT sum(Total_Sales) FROM data;)

📊 Query Results:
(243845,)

🗣️ Summary:
Okay, as a data analyst, here's how to interpret the SQL query result `[(243845,)]` in plain English:

**"The SQL query you ran returned a single result set.  This result set contains one row and one column. The value in that single cell is 243845."**

**Further Breakdown & Possible Interpretations (depending on the query):**

*   **Single Result:** The query likely aggregated some data down to a single number.  For example, it could be:
    *   **A count:** The total number of records that met certain criteria (e.g., "The total number of customers in California is 243845").
    *   **A sum:** The total value of something across your dataset (e.g., "The total revenue generated last quarter was $243,845").
    *   **An average:**


In [22]:
"""Uncomment the line below to execute the agent_loop on the user-uploaded dataset."""
# table_name="df" matches the table that the input data was written to in SQLite.
# agent_loop(df, user_conn, table_name="df")

'Uncomment the line below to execute the agent_loop on the user-uploaded dataset.'

In [23]:
"""Uncomment the line below to execute a query on the user-uploaded dataset."""
# execute_query needs a connection and a SQL string, for example:
# execute_query(user_conn, "SELECT * FROM df LIMIT 5")

'Uncomment the line below to execute a query on the user-uploaded dataset.'