# Milestone 3: Feature Engineering

This notebook creates additional features from the cleaned dataset
to support exploratory analysis and KPI generation.

The goal is to transform transactional data into an insight-ready format
without altering the original business values.


## Load the clean dataset

In [1]:
import pandas as pd

In [2]:
import pandas as pd

df = pd.read_csv("outputs/globex_retail_clean.csv", parse_dates=["Order_Date"])
df.head()


Unnamed: 0,Customer_ID,Order_ID,Order_Date,Product_Category,Product_Sub_Category,Quantity,Price,Discount,Customer_Location,Revenue,Revenue_calc
0,CUST_013738,ORD_00102406,2023-01-01,Home & Garden,Gardening Tools,1,419.19,0.0,TN,419.19,419.19
1,CUST_011726,ORD_00102902,2023-01-01,Electronics,Laptops,1,222.37,0.09,TN,202.3567,202.3567
2,CUST_010891,ORD_00103864,2023-01-01,Electronics,Laptops,6,1107.65,0.0,IN,6645.9,6645.9
3,CUST_011452,ORD_00103560,2023-01-01,Electronics,Gaming Consoles,5,288.84,0.0,MA,1444.2,1444.2
4,CUST_010886,ORD_00100632,2023-01-02,Electronics,Headphones,1,191.27,0.0,AZ,191.27,191.27


## Time-based features
these allow: monthly trends, weekday patterns, or seasonal analysis

In [3]:
df["order_year"] = df["Order_Date"].dt.year
df["order_month"] = df["Order_Date"].dt.month
df["order_month_name"] = df["Order_Date"].dt.month_name()
df["order_day_of_week"] = df["Order_Date"].dt.day_name()
df.head()

Unnamed: 0,Customer_ID,Order_ID,Order_Date,Product_Category,Product_Sub_Category,Quantity,Price,Discount,Customer_Location,Revenue,Revenue_calc,order_year,order_month,order_month_name,order_day_of_week
0,CUST_013738,ORD_00102406,2023-01-01,Home & Garden,Gardening Tools,1,419.19,0.0,TN,419.19,419.19,2023,1,January,Sunday
1,CUST_011726,ORD_00102902,2023-01-01,Electronics,Laptops,1,222.37,0.09,TN,202.3567,202.3567,2023,1,January,Sunday
2,CUST_010891,ORD_00103864,2023-01-01,Electronics,Laptops,6,1107.65,0.0,IN,6645.9,6645.9,2023,1,January,Sunday
3,CUST_011452,ORD_00103560,2023-01-01,Electronics,Gaming Consoles,5,288.84,0.0,MA,1444.2,1444.2,2023,1,January,Sunday
4,CUST_010886,ORD_00100632,2023-01-02,Electronics,Headphones,1,191.27,0.0,AZ,191.27,191.27,2023,1,January,Monday


## Order-level metrics (clarity, not recalculation)

In [5]:
df['gross_sales'] = df['Quantity'] * df['Price']
df['discount_amount'] = df['gross_sales'] * df['Discount']
df['net_sales'] = df['Revenue'] #Revenue is referenced, not overwritten.
df.head()

Unnamed: 0,Customer_ID,Order_ID,Order_Date,Product_Category,Product_Sub_Category,Quantity,Price,Discount,Customer_Location,Revenue,Revenue_calc,order_year,order_month,order_month_name,order_day_of_week,gross_sales,discount_amount,net_sales
0,CUST_013738,ORD_00102406,2023-01-01,Home & Garden,Gardening Tools,1,419.19,0.0,TN,419.19,419.19,2023,1,January,Sunday,419.19,0.0,419.19
1,CUST_011726,ORD_00102902,2023-01-01,Electronics,Laptops,1,222.37,0.09,TN,202.3567,202.3567,2023,1,January,Sunday,222.37,20.0133,202.3567
2,CUST_010891,ORD_00103864,2023-01-01,Electronics,Laptops,6,1107.65,0.0,IN,6645.9,6645.9,2023,1,January,Sunday,6645.9,0.0,6645.9
3,CUST_011452,ORD_00103560,2023-01-01,Electronics,Gaming Consoles,5,288.84,0.0,MA,1444.2,1444.2,2023,1,January,Sunday,1444.2,0.0,1444.2
4,CUST_010886,ORD_00100632,2023-01-02,Electronics,Headphones,1,191.27,0.0,AZ,191.27,191.27,2023,1,January,Monday,191.27,0.0,191.27


## Customer-level Aggregation 
creating customer value features

In [6]:
customer_metrics = (
    df.groupby("Customer_ID")
    .agg(
        total_orders=pd.NamedAgg(column="Order_ID", aggfunc="nunique"),
        total_quantity=pd.NamedAgg(column="Quantity", aggfunc="sum"),
        total_revenue=pd.NamedAgg(column="net_sales", aggfunc="sum"),
        avg_order_value=pd.NamedAgg(column="net_sales", aggfunc="mean"),
    )
    .reset_index()
)
customer_metrics.head()

Unnamed: 0,Customer_ID,total_orders,total_quantity,total_revenue,avg_order_value
0,CUST_010001,2,7,2248.477,1124.2385
1,CUST_010002,1,2,981.44,981.44
2,CUST_010003,2,4,933.087,466.5435
3,CUST_010006,3,8,1358.8316,452.943867
4,CUST_010007,2,5,413.376,206.688


## High-value customer flag
Defining high value customers by total revenue

In [14]:
revenue_threushold = customer_metrics["total_revenue"].quantile(0.80)
customer_metrics['customer_segment'] = customer_metrics['total_revenue'].apply(
    lambda x: 'High_Value' if x >= revenue_threushold else 'Standard'
)
#customer_metrics.head()
customer_metrics['customer_segment'].value_counts()

customer_segment
Standard      2562
High_Value     641
Name: count, dtype: int64

## Join customer features back to transactions
every transactions to determines if it belongs to a high-value customer

In [15]:
df = df.merge(
    customer_metrics[["Customer_ID", "customer_segment"]],
    on="Customer_ID",
    how="left"
)
df.head()

Unnamed: 0,Customer_ID,Order_ID,Order_Date,Product_Category,Product_Sub_Category,Quantity,Price,Discount,Customer_Location,Revenue,Revenue_calc,order_year,order_month,order_month_name,order_day_of_week,gross_sales,discount_amount,net_sales,customer_segment_x,customer_segment_y
0,CUST_013738,ORD_00102406,2023-01-01,Home & Garden,Gardening Tools,1,419.19,0.0,TN,419.19,419.19,2023,1,January,Sunday,419.19,0.0,419.19,Standard,Standard
1,CUST_011726,ORD_00102902,2023-01-01,Electronics,Laptops,1,222.37,0.09,TN,202.3567,202.3567,2023,1,January,Sunday,222.37,20.0133,202.3567,Standard,Standard
2,CUST_010891,ORD_00103864,2023-01-01,Electronics,Laptops,6,1107.65,0.0,IN,6645.9,6645.9,2023,1,January,Sunday,6645.9,0.0,6645.9,High-Value,High_Value
3,CUST_011452,ORD_00103560,2023-01-01,Electronics,Gaming Consoles,5,288.84,0.0,MA,1444.2,1444.2,2023,1,January,Sunday,1444.2,0.0,1444.2,Standard,Standard
4,CUST_010886,ORD_00100632,2023-01-02,Electronics,Headphones,1,191.27,0.0,AZ,191.27,191.27,2023,1,January,Monday,191.27,0.0,191.27,High-Value,High_Value


## Save Feature-engineered dataset

In [16]:
df.to_csv("outputs/globex_retail_features.csv", index=False)
customer_metrics.to_csv("outputs/globex_retail_customer_metrics.csv", index=False)