# Brazilian E-Commerce Data Analysis

This notebook performs exploratory data analysis (EDA) on the Olist E-Commerce dataset using the data warehouse built in BigQuery.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from dotenv import load_dotenv

# Read settings from the .env file at the relative path ../.env
load_dotenv('../.env')

# BigQuery client bound to the project named by GCP_PROJECT_ID
project_id = os.getenv('GCP_PROJECT_ID')
client = bigquery.Client(project=project_id)

# Plot defaults: ggplot style, "husl" palette, 12x6 figures
plt.style.use('ggplot')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Monthly Sales Trend

In [None]:
# Dataset name comes from the environment; indexing os.environ raises a
# KeyError immediately if BQ_DATASET_WAREHOUSE is not set.
warehouse = os.environ['BQ_DATASET_WAREHOUSE']

# Monthly sales total and distinct order count for delivered orders.
# An f-string is required here: a `" + ... + "` sequence inside a
# triple-quoted string is literal text, not concatenation, so the dataset
# name would never be substituted into the table reference.
query = f"""
SELECT
    FORMAT_DATE('%Y-%m', order_purchase_date) as month,
    SUM(total_order_value) as total_sales,
    COUNT(DISTINCT order_id) as total_orders
FROM `{warehouse}.fact_orders`
WHERE order_status = 'delivered'
GROUP BY 1
ORDER BY 1
"""

# Run the query and load the result into a DataFrame
df_sales = client.query(query).to_dataframe()

# Line chart of total sales per month; x labels rotated so they do not overlap
plt.figure(figsize=(14, 6))
sns.lineplot(data=df_sales, x='month', y='total_sales', marker='o')
plt.title('Monthly Sales Trend')
plt.xticks(rotation=45)
plt.ylabel('Total Sales (BRL)')
plt.tight_layout()
plt.show()

## 2. Top Selling Product Categories

In [None]:
# Dataset name comes from the environment; indexing os.environ raises a
# KeyError immediately if BQ_DATASET_WAREHOUSE is not set.
warehouse = os.environ['BQ_DATASET_WAREHOUSE']

# Ten product categories with the highest summed revenue.
# An f-string is required here: a `" + ... + "` sequence inside a
# triple-quoted string is literal text, not concatenation, so the dataset
# name would never be substituted into the table reference.
query = f"""
SELECT
    product_category_name,
    SUM(total_revenue) as revenue,
    SUM(total_orders) as orders
FROM `{warehouse}.dim_product`
GROUP BY 1
ORDER BY revenue DESC
LIMIT 10
"""

# Run the query and load the result into a DataFrame
df_products = client.query(query).to_dataframe()

# Horizontal bar chart: one bar per category, bar length = revenue
plt.figure(figsize=(12, 6))
sns.barplot(data=df_products, y='product_category_name', x='revenue')
plt.title('Top 10 Product Categories by Revenue')
plt.xlabel('Revenue (BRL)')
plt.ylabel('Category')
plt.show()

## 3. Customer Segmentation (RFM Analysis)

In [None]:
# Dataset name comes from the environment; indexing os.environ raises a
# KeyError immediately if BQ_DATASET_WAREHOUSE is not set.
warehouse = os.environ['BQ_DATASET_WAREHOUSE']

# Customer count and average order count per customer segment.
# An f-string is required here: a `" + ... + "` sequence inside a
# triple-quoted string is literal text, not concatenation, so the dataset
# name would never be substituted into the table reference.
query = f"""
SELECT
    customer_segment,
    COUNT(customer_id) as customer_count,
    AVG(total_orders) as avg_orders
FROM `{warehouse}.dim_customer`
GROUP BY 1
ORDER BY customer_count DESC
"""

# Run the query and load the result into a DataFrame
df_segment = client.query(query).to_dataframe()

# Pie chart of each segment's share of customers, labelled with percentages
plt.figure(figsize=(10, 6))
plt.pie(df_segment['customer_count'], labels=df_segment['customer_segment'], autopct='%1.1f%%')
plt.title('Customer Segmentation Distribution')
plt.show()

## 4. Delivery Performance

In [None]:
# Dataset name comes from the environment; indexing os.environ raises a
# KeyError immediately if BQ_DATASET_WAREHOUSE is not set.
warehouse = os.environ['BQ_DATASET_WAREHOUSE']

# Average delivery days per customer state for delivered orders, slowest first.
# An f-string is required here: a `" + ... + "` sequence inside a
# triple-quoted string is literal text, not concatenation, so the dataset
# name would never be substituted into either table reference.
query = f"""
SELECT
    customer_state,
    AVG(delivery_days) as avg_delivery_days
FROM `{warehouse}.fact_orders`
JOIN `{warehouse}.dim_customer` USING(customer_id)
WHERE order_status = 'delivered'
GROUP BY 1
ORDER BY avg_delivery_days DESC
"""

# Run the query and load the result into a DataFrame
df_delivery = client.query(query).to_dataframe()

# Bar chart: one bar per state, in the order returned by the query
plt.figure(figsize=(14, 6))
sns.barplot(data=df_delivery, x='customer_state', y='avg_delivery_days')
plt.title('Average Delivery Days by State')
plt.ylabel('Days')
plt.show()