# 🤖 GenAI for Data Analysis: Ask Questions, Get SQL, Understand Results

Welcome! This project demonstrates how to use **Google Gemini AI** to enhance the data analysis process with:
- AI-generated **questions and SQL queries**
- Natural language **summaries of query results**
- Support for **custom CSV uploads** or a built-in **Kaggle dataset**

🎯 Goal: Show how GenAI can act like a smart analyst assistant for any dataset.


# 1. 🔧 Setup: Libraries, API Key, and Imports

In [None]:
# Gemini + API Setup
from google import genai
from google.genai import types
from kaggle_secrets import UserSecretsClient
import warnings

In [None]:
# For API retry handling
from google.api_core import retry

In [None]:
# Displaying Data
import pandas as pd

In [None]:
# SQLite DB
import sqlite3

In [None]:
# Libraries for the uploaded CSV:
from IPython.display import display
import ipywidgets as widgets
import io

In [None]:
# Retry Gemini API calls that fail with HTTP 429 or 503.
def is_retriable(e):
    """Return True when `e` is a `genai.errors.APIError` whose code is 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# The retry decorator marks the wrapped function with `__wrapped__`
# (functools.wraps). Checking for it keeps a re-run of this cell from stacking
# another retry layer on top of the previous one.
if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable)(genai.models.Models.generate_content)

In [None]:
# Fetch the Gemini API key from the Kaggle secrets store, then build the client.
_secrets = UserSecretsClient()
GOOGLE_API_KEY = _secrets.get_secret("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

# 2. 📂 Load a Dataset (Kaggle OR Upload your own CSV)

## Loading Kaggle data

In [None]:
# Default dataset, read from the Kaggle input directory.
data = pd.read_csv(
    '/kaggle/input/amazon-sales-2025/amazon_sales_data 2025.csv'
)
data  # last expression of the cell, so the notebook renders the frame

## Loading input data

> 👉 **Tip**: You can upload your own `.csv` file using the widget below. If you skip this step, the notebook will use a default Amazon Sales dataset.


In [None]:
# Upload CSV widget
# Create the widget only once. This cell reads `upload.value` right after
# displaying the widget, so a file can only be picked up by uploading it and
# then re-running the cell; re-creating the widget on every run would replace
# it with a fresh, empty one and the upload would never be seen.
if not isinstance(globals().get("upload"), widgets.FileUpload):
    upload = widgets.FileUpload(accept='.csv', multiple=False)
display(upload)

# Handle upload
def handle_upload(uploader=None):
    """Return the uploaded CSV as a DataFrame, or None if nothing was uploaded.

    Args:
        uploader: Object whose ``value`` attribute holds the uploaded files.
            Defaults to the notebook-level ``upload`` widget.

    Returns:
        A DataFrame read from the first uploaded file, with column names
        stripped and spaces replaced by underscores; ``None`` when ``value``
        is empty.
    """
    if uploader is None:
        uploader = upload
    uploaded = uploader.value
    if not uploaded:
        return None  # No file uploaded

    # ipywidgets 7 exposes a dict {filename: {'content': ...}}, while
    # ipywidgets 8 exposes a tuple of per-file dicts. Use the first file in
    # either layout.
    if isinstance(uploaded, dict):
        first_file = next(iter(uploaded.values()))
    else:
        first_file = uploaded[0]

    df = pd.read_csv(io.BytesIO(first_file['content']))
    df.columns = [col.strip().replace(" ", "_") for col in df.columns]
    return df

# Prefer the user's file; when there is none, load the default Kaggle CSV.
df = handle_upload()

if df is None:
    print("⚠️ No user file uploaded — using default Kaggle dataset instead.")
    df = pd.read_csv('/kaggle/input/amazon-sales-2025/amazon_sales_data 2025.csv').rename(
        columns=lambda name: name.strip().replace(" ", "_")
    )


# 3. 🧠 Ask AI: What Should We Explore?

## Prompt for the Kaggle dataset

In [None]:
# Prompt:
sample = data.head(5).to_markdown()  # Only show a small sample in the prompt
prompt = f"""
Here are a few rows of our dataset:

{sample}

Based on this dataset, what are some useful questions we should ask during further data analysis?
"""

# Generation config reused by the later cells: caps each reply at 200 output tokens.
short_config = types.GenerateContentConfig(max_output_tokens=200)

response = client.models.generate_content(
    model='gemini-2.0-flash',
    config=short_config,
    contents=prompt
)
print(response.text)

## Prompt for the input data

In [None]:
# Same exploratory prompt, built from the first five rows of `df`.
sample_1 = df.head(5).to_markdown()

prompt_1 = (
    "\nHere is a few rows of our dataset:\n\n"
    f"{sample_1}\n\n"
    "Based on this dataset, what are some useful questions we should ask during further data analysis?\n"
)

response = client.models.generate_content(
    contents=prompt_1,
    config=short_config,
    model='gemini-2.0-flash',
)

print(response.text)


# 4. 🧾 Generate & Run SQL Queries from Natural Language

In [None]:
# Description function for both the Kaggle and the user-uploaded CSV:
def describe_table(table_name: str, conn=None) -> list[tuple[str, str]]:
    """List the columns of a SQLite table as ``(name, declared_type)`` pairs.

    Args:
        table_name: Table to inspect.
        conn: Open ``sqlite3`` connection. Defaults to the notebook-level
            ``db_conn``.

    Returns:
        One ``(column_name, declared_type)`` tuple per column; an empty list
        when the table does not exist.
    """
    if conn is None:
        conn = db_conn
    # Bind the table name through the table-valued form of PRAGMA table_info
    # (SQLite 3.16+) instead of interpolating it into the SQL text, so names
    # containing spaces or quotes work and cannot alter the statement.
    cursor = conn.execute(
        "SELECT name, type FROM pragma_table_info(?)", (table_name,)
    )
    return [(name, col_type) for name, col_type in cursor.fetchall()]

In [None]:
# Query function for both the Kaggle and the user-uploaded CSV:
def execute_query(sql: str, conn=None) -> list[tuple]:
    """Execute an SQL statement, returning the results.

    Args:
        sql: Statement to run. It is executed exactly as given.
        conn: Open ``sqlite3`` connection. Defaults to the notebook-level
            ``db_conn``.

    Returns:
        Every row produced by the statement (tuples with the default row
        factory); an empty list when there are none.
    """
    # NOTE(review): agent_loop passes model-generated SQL here, and nothing
    # restricts it to read-only statements.
    print(f' - DB CALL: execute_query({sql})')

    if conn is None:
        conn = db_conn
    return conn.execute(sql).fetchall()

## Running SQL queries on the Kaggle dataset

In [None]:
# Normalise the column names, then copy the frame into a SQLite table named "data".
data.columns = [name.strip().replace(" ", "_") for name in data.columns]

db_conn = sqlite3.connect("sample.db")
data.to_sql(name="data", con=db_conn, index=False, if_exists="replace")


In [None]:
# Show the (column name, declared type) pairs of the "data" table.
describe_table("data")

In [None]:
# Example query: every row of "data" whose Category is 'Footwear'.
execute_query("select * from data where Category == 'Footwear'")

## Running SQL Queries from the input data

In [None]:
# 🛢️ Save uploaded data to SQLite
# Store "df" in the same database file as the "data" table. execute_query()
# and describe_table() read the notebook-level db_conn, so rebinding it to a
# separate file would make every later query against "data" (for example from
# agent_loop(data)) fail with "no such table: data".
db_conn = sqlite3.connect("sample.db")
df.to_sql("df", db_conn, if_exists="replace", index=False)

In [None]:
# Show the (column name, declared type) pairs of the "df" table.
describe_table("df")

In [None]:
"""Uncomment the line below to execute a query on the user-uploaded dataset."""
# execute_query("SELECT * FROM df LIMIT 5")

# 5. Agents

In [None]:
# Agent function for both the Kaggle and the user-uploaded CSV:
def _build_schema(df, table_name):
    """Render the table name and the column names of `df` as prompt text."""
    columns = "".join(f"- {col}\n" for col in df.columns)
    return f"Table: {table_name}\nColumns:\n{columns}"


def _extract_sql(text):
    """Return the SQL statement from a model reply, without Markdown fences."""
    import re  # local import keeps this helper self-contained

    if not text:
        return ""
    # Prefer the body of a complete fenced block, which also drops any prose
    # the model wrote around it.
    fenced = re.search(r"```[^\n`]*\n(.*?)```", text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    # No complete fenced block (e.g. the reply stopped before the closing
    # fence): just remove the fence markers.
    return text.replace("```sql", "").replace("```", "").strip()


def agent_loop(df, table_name="data", model='gemini-2.0-flash'):
    """Interactively answer questions about a table by generating and running SQL.

    Each turn reads a question from standard input, asks the model for a SQL
    query, runs it through ``execute_query``, prints the rows, and then asks
    the model for a plain-English summary. Typing ``exit`` ends the loop.

    Args:
        df: DataFrame whose column names are listed in the schema shown to
            the model.
        table_name: Name of the SQL table the model is told to query.
        model: Model name passed to ``client.models.generate_content``.

    Returns:
        None. All output is printed.
    """
    print("🔍 Ask a question about the dataset (or type 'exit'):")

    # Dynamically build schema from df
    schema = _build_schema(df, table_name)

    # Start interaction loop
    while True:
        user_input = input("\n🧍 You: ").strip()
        if user_input.lower() == "exit":
            break
        if not user_input:
            continue  # nothing was asked; skip the API call

        # Prompt for SQL
        prompt = f"""
        You are a helpful assistant that answers data questions by generating SQL queries.
        Here is the table schema:
        {schema}
        Question: {user_input}
        Only respond with a valid SQL query.
        """

        # One failed turn (API error, bad SQL, empty reply) must not end the
        # whole session, so the entire turn sits inside the error boundary.
        try:
            response = client.models.generate_content(
                model=model,
                config=short_config,
                contents=prompt
            )
            sql = _extract_sql(response.text)
            if not sql:
                print("❌ Error: the model returned no SQL.")
                continue
            print(f"\n🧾 Cleaned SQL:\n{sql}")

            result = execute_query(sql)
            print("\n📊 Query Results:")
            for row in result:
                print(row)

            # Explanation
            summary_prompt = f"""
            Here is the result of the SQL query:
            {result}
            Explain this result in plain English for a data analyst.
            """
            summary = client.models.generate_content(
                model=model,
                config=short_config,
                contents=summary_prompt
            )
            print(f"\n🗣️ Summary:\n{summary.text}")
        except Exception as e:
            print(f"❌ Error: {e}")


## 5. 🤖 Agent Mode: Chat with Your Data (Kaggle data and uploaded data)

In [None]:
# Start the interactive question loop on the default dataset (table "data"); type 'exit' to stop.
agent_loop(data) 

In [None]:
"""Uncomment the line below to execute the agent_loop on the user-uploaded dataset."""
# agent_loop(df, table_name="df")  