In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket filter like this also hides library deprecation
# warnings (e.g. from pandas) — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

# Plot defaults: seaborn "whitegrid" style and 12x6 figures.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [23]:
# Read the raw sales export; the `date` column is parsed as datetime on load.
df = pd.read_csv('ecommerce_sales_raw.csv', parse_dates=['date'])

# Structural summary: (rows, columns) followed by the dtype of each column.
print(
    f"Dataset shape: {df.shape}",
    f"\nData types:\n{df.dtypes}",
    sep="\n",
)

Dataset shape: (43786, 11)

Data types:
transaction_id            object
date              datetime64[ns]
customer_id               object
product_name              object
category                  object
quantity                   int64
unit_price               float64
total_amount             float64
region                    object
payment_method            object
shipping_cost            float64
dtype: object


In [24]:
# Data-quality overview of the raw frame: nulls per column, number of fully
# duplicated rows, and the span covered by the `date` column.
null_counts = df.isna().sum()
duplicate_rows = df.duplicated().sum()
first_date, last_date = df['date'].min(), df['date'].max()

print("Missing values:")
print(null_counts)
print(f"\nDuplicates: {duplicate_rows} rows")
print(f"\nDate range: {first_date} to {last_date}")

Missing values:
transaction_id      0
date                0
customer_id         0
product_name        0
category            0
quantity            0
unit_price          0
total_amount        0
region            438
payment_method    437
shipping_cost       0
dtype: int64

Duplicates: 50 rows

Date range: 2022-01-01 00:00:00 to 2023-12-31 00:00:00


In [25]:
# Build the cleaned frame from the raw one without mutating `df`, so this cell
# can be re-run safely: drop exact duplicate rows, then take an independent copy.
df_clean = df.drop_duplicates().copy()

# Impute missing categorical values with each column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and, with Copy-on-Write (the default in pandas 3.0),
# it no longer updates df_clean at all.
df_clean['region'] = df_clean['region'].fillna(df_clean['region'].mode()[0])
df_clean['payment_method'] = df_clean['payment_method'].fillna(df_clean['payment_method'].mode()[0])

# Calendar features derived from the transaction date.
df_clean['year'] = df_clean['date'].dt.year
df_clean['month'] = df_clean['date'].dt.month
df_clean['month_name'] = df_clean['date'].dt.month_name()
df_clean['quarter'] = df_clean['date'].dt.quarter
df_clean['day_of_week'] = df_clean['date'].dt.day_name()
# dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are Saturday and Sunday.
df_clean['is_weekend'] = df_clean['date'].dt.dayofweek.isin([5, 6]).astype(int)

# NOTE(review): shipping_cost is added on top of total_amount here, i.e. shipping
# is counted as revenue — confirm this matches the intended definition.
df_clean['total_revenue'] = df_clean['total_amount'] + df_clean['shipping_cost']

print(f"Cleaned dataset: {df_clean.shape}")
print(f"Missing values: {df_clean.isnull().sum().sum()}")

Cleaned dataset: (43736, 18)
Missing values: 0


In [26]:
# Headline KPIs computed over the cleaned frame.
order_totals = df_clean['total_revenue']
total_revenue = order_totals.sum()
avg_order_value = order_totals.mean()
total_transactions = len(df_clean)  # number of rows in df_clean
unique_customers = df_clean['customer_id'].nunique()

# Pre-format every value as display text, then tabulate as (Metric, Value) rows.
metric_rows = [
    ('Total Revenue', f"${total_revenue:,.2f}"),
    ('Total Transactions', f"{total_transactions:,}"),
    ('Average Order Value', f"${avg_order_value:.2f}"),
    ('Unique Customers', f"{unique_customers:,}"),
    ('Transactions per Customer', f"{total_transactions/unique_customers:.2f}"),
]
metrics_df = pd.DataFrame(metric_rows, columns=['Metric', 'Value'])
print(metrics_df.to_string(index=False))

                   Metric         Value
            Total Revenue $3,590,832.24
       Total Transactions        43,736
      Average Order Value        $82.10
         Unique Customers         4,001
Transactions per Customer         10.93


In [27]:
# Per-category revenue summary: total, transaction count and mean per transaction,
# rounded to 2 decimals and ordered from the highest to the lowest total.
category_revenue = (
    df_clean
    .groupby('category')['total_revenue']
    .agg(Total_Revenue='sum', Transactions='count', Avg_Revenue='mean')
    .round(2)
    .sort_values('Total_Revenue', ascending=False)
)

# Each category's percentage of the summed (already rounded) totals.
revenue_sum = category_revenue['Total_Revenue'].sum()
category_revenue['Revenue_Share_%'] = (category_revenue['Total_Revenue'] / revenue_sum * 100).round(2)

print("Revenue by Category:")
print(category_revenue)

Revenue by Category:
             Total_Revenue  Transactions  Avg_Revenue  Revenue_Share_%
category                                                              
Home            1170516.02         11002       106.39            32.60
Electronics     1141369.16         14736        77.45            31.79
Sports           890580.04         10877        81.88            24.80
Office           388367.02          7121        54.54            10.82
