In [1]:
import sqlite3
import pandas as pd
import os

# Location of the SQLite database, relative to this notebook
db_path = "../data/processed/sales.db"

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, so fail fast here rather than silently querying a blank database.
if not os.path.exists(db_path):
    raise FileNotFoundError("sales.db not found in ../data/processed/. Please check the file path.")

# Open the connection that the query cells below reuse
conn = sqlite3.connect(db_path)
print("✅ Connected successfully to:", db_path)

# List every table registered in the database schema
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)
print("Tables in the database:")
display(tables)


✅ Connected successfully to: ../data/processed/sales.db
Tables in the database:


Unnamed: 0,name
0,sales


In [2]:
# Fetch the first five rows of `sales` to inspect its columns and value formats
df_preview = pd.read_sql_query(
    sql="SELECT * FROM sales LIMIT 5;",
    con=conn,
)
print("Preview of data in 'sales' table:")
display(df_preview)


Preview of data in 'sales' table:


Unnamed: 0,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Style,SKU,Category,Size,...,B2B,fulfilled-by,Amount_raw,Total_Amount,Unit_Price,Year,Month,Day,Month_Year,Order_MonthStart
0,171-9198151-1101146,2022-04-30 00:00:00,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,JNE3781,JNE3781-KR-XXXL,Kurta,3XL,...,0,Easy Ship,406.0,406.0,406.0,2022,4,30,2022-04,2022-04-01 00:00:00
1,404-0687676-7273146,2022-04-30 00:00:00,Shipped,Amazon,Amazon.in,Expedited,JNE3371,JNE3371-KR-XL,Kurta,XL,...,1,Unknown,329.0,329.0,329.0,2022,4,30,2022-04,2022-04-01 00:00:00
2,407-1069790-7240320,2022-04-30 00:00:00,Shipped,Amazon,Amazon.in,Expedited,JNE3671,JNE3671-TU-XXXL,Top,3XL,...,0,Unknown,574.0,574.0,574.0,2022,4,30,2022-04,2022-04-01 00:00:00
3,404-1490984-4578765,2022-04-30 00:00:00,Shipped,Amazon,Amazon.in,Expedited,SET264,SET264-KR-NP-XL,Set,XL,...,0,Unknown,824.0,824.0,824.0,2022,4,30,2022-04,2022-04-01 00:00:00
4,408-5748499-6859555,2022-04-30 00:00:00,Shipped,Amazon,Amazon.in,Expedited,J0095,J0095-SET-L,Set,L,...,0,Unknown,653.0,653.0,653.0,2022,4,30,2022-04,2022-04-01 00:00:00


In [3]:
# Revenue summed per category, highest first, capped at ten rows
query1 = """
SELECT Category, SUM(Total_Amount) AS total_revenue
FROM sales
GROUP BY Category
ORDER BY total_revenue DESC
LIMIT 10;
"""
top_categories = pd.read_sql_query(sql=query1, con=conn)
print("Top 10 Selling Categories by Revenue:")
display(top_categories)

# Export the result (without the index column) for Power BI / reporting
top_categories.to_csv(
    "../data/processed/sql_top_categories.csv",
    index=False,
)


Top 10 Selling Categories by Revenue:


Unnamed: 0,Category,total_revenue
0,Set,37660322.0
1,Kurta,20451608.0
2,Western Dress,10629096.0
3,Top,5203733.0
4,Ethnic Dress,760711.0
5,Blouse,434751.0
6,Bottom,140226.0
7,Saree,118509.0
8,Dupatta,915.0


In [4]:
# Revenue summed per calendar month (YYYY-MM), in chronological order.
#
# The grouping and ordering use the strftime expression itself rather than the
# `Month` alias: the `sales` table already has its own `Month` column (see the
# preview above), so a bare `GROUP BY Month` is ambiguous. SQLite appears to
# resolve it to the table column (month number only), which would merge the
# same month of different years as soon as the data spans more than one year.
# NOTE(review): confirm the resolution rule against the SQLite docs; grouping
# by the expression is unambiguous either way.
query2 = """
SELECT strftime('%Y-%m', Date) AS Month, SUM(Total_Amount) AS Revenue
FROM sales
GROUP BY strftime('%Y-%m', Date)
ORDER BY strftime('%Y-%m', Date);
"""
monthly_revenue = pd.read_sql(query2, conn)
print("Monthly Revenue Trend:")
display(monthly_revenue)

# Save for Power BI visualization (index column omitted)
monthly_revenue.to_csv("../data/processed/sql_monthly_revenue.csv", index=False)


Monthly Revenue Trend:


Unnamed: 0,Month,Revenue
0,2022-03,98261.0
1,2022-04,27581851.0
2,2022-05,25118009.0
3,2022-06,22601750.0


In [5]:
# Mean of Total_Amount across the rows of `sales`, rounded to two decimals.
# NOTE(review): AVG is taken per row; if one order can occupy several rows this
# is a per-row average rather than a per-order one — confirm against the data.
query3 = """
SELECT ROUND(AVG(Total_Amount), 2) AS avg_order_value
FROM sales;
"""
avg_order_value = pd.read_sql_query(sql=query3, con=conn)
print("Average Order Value:")
display(avg_order_value)


Average Order Value:


Unnamed: 0,avg_order_value
0,649.77


In [6]:
# Revenue summed per shipping state, largest first.
# The column name contains a hyphen, so it is quoted with square brackets.
query4 = """
SELECT [ship-state] AS State, SUM(Total_Amount) AS TotalRevenue
FROM sales
GROUP BY [ship-state]
ORDER BY TotalRevenue DESC;
"""
state_revenue = pd.read_sql_query(sql=query4, con=conn)
print("Total Revenue by State:")
display(state_revenue)

# Write the result next to the other processed data, without the index column
state_revenue.to_csv(
    "../data/processed/sql_state_revenue.csv",
    index=False,
)


Total Revenue by State:


Unnamed: 0,State,TotalRevenue
0,Maharashtra,12863572.0
1,Karnataka,10153100.0
2,Telangana,6642955.0
3,Uttar Pradesh,6493860.0
4,Tamil Nadu,6241913.0
5,Delhi,4185475.0
6,Kerala,3618505.0
7,West Bengal,3357170.0
8,Andhra Pradesh,3049151.0
9,Haryana,2791446.0


In [7]:
# Row count per Fulfilment value.
# NOTE(review): COUNT(*) counts rows of `sales`; if an order can span several
# rows this is not a count of distinct orders — confirm against the data.
query5 = """
SELECT Fulfilment, COUNT(*) AS total_orders
FROM sales
GROUP BY Fulfilment;
"""
fulfilment_orders = pd.read_sql_query(sql=query5, con=conn)
print("Number of Orders per Fulfilment Type:")
display(fulfilment_orders)


Number of Orders per Fulfilment Type:


Unnamed: 0,Fulfilment,total_orders
0,Amazon,83853
1,Merchant,32416


In [8]:
# Qty summed per category, largest first
query6 = """
SELECT Category, SUM(Qty) AS total_quantity
FROM sales
GROUP BY Category
ORDER BY total_quantity DESC;
"""
category_quantity = pd.read_sql_query(sql=query6, con=conn)
print("Total Quantity Sold per Category:")
display(category_quantity)


Total Quantity Sold per Category:


Unnamed: 0,Category,total_quantity
0,Set,45287
1,Kurta,45044
2,Western Dress,13943
3,Top,9903
4,Ethnic Dress,1053
5,Blouse,863
6,Bottom,398
7,Saree,152
8,Dupatta,3


In [9]:
# Revenue summed per Date value; sorting descending and keeping one row yields
# the single highest-revenue day (ties are not broken explicitly).
query7 = """
SELECT Date, SUM(Total_Amount) AS Daily_Revenue
FROM sales
GROUP BY Date
ORDER BY Daily_Revenue DESC
LIMIT 1;
"""
highest_day = pd.read_sql_query(sql=query7, con=conn)
print("Day with Highest Total Revenue:")
display(highest_day)


Day with Highest Total Revenue:


Unnamed: 0,Date,Daily_Revenue
0,2022-05-04 00:00:00,1165893.0


In [10]:
# Each category's revenue as a percentage of the grand total.
# The scalar subquery supplies the overall SUM(Total_Amount); the 100.0 literal
# keeps the arithmetic in floating point before rounding to two decimals.
query8 = """
SELECT Category,
       ROUND(100.0 * SUM(Total_Amount) / (SELECT SUM(Total_Amount) FROM sales), 2) AS Revenue_Percentage
FROM sales
GROUP BY Category
ORDER BY Revenue_Percentage DESC;
"""
category_share = pd.read_sql_query(sql=query8, con=conn)
print("Revenue Contribution by Category (%):")
display(category_share)

# Export the shares without the index column
category_share.to_csv(
    "../data/processed/sql_category_share.csv",
    index=False,
)


Revenue Contribution by Category (%):


Unnamed: 0,Category,Revenue_Percentage
0,Set,49.95
1,Kurta,27.12
2,Western Dress,14.1
3,Top,6.9
4,Ethnic Dress,1.01
5,Blouse,0.58
6,Bottom,0.19
7,Saree,0.16
8,Dupatta,0.0


In [11]:
# Release the SQLite connection opened in the first cell now that all queries have run
conn.close()
print("🔒 Database connection closed successfully.")


🔒 Database connection closed successfully.
