# Import necessary packages

In [1]:
import pandas as pd

# Load the dataset

In [2]:
# Read the cleaned sales CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/cleaned/sales_data_cleaned.csv")

In [3]:
# Convert OrderDate to datetime values so the .dt accessor can be used on it.
df['OrderDate'] = df['OrderDate'].pipe(pd.to_datetime)

# Time-Based Features

In [4]:
# Break OrderDate into its calendar components, one column per component.
order_date_parts = df['OrderDate'].dt
df['Year'] = order_date_parts.year
df['Month'] = order_date_parts.month
df['Day'] = order_date_parts.day  # day of the month
df['Weekday'] = order_date_parts.day_name()  # weekday as a name string rather than a number

# Revenue Quality Features

## Revenue per item

In [5]:
# Average revenue per unit sold on each row.
# Rows with Quantity == 0 are masked to NaN before dividing: a plain division would
# produce +/-inf there, which distorts describe() statistics and anything computed
# from this column later. Missing quantities stay NaN as before.
df['RevenuePerItem'] = df['Revenue'] / df['Quantity'].where(df['Quantity'] != 0)

## Discount impact

In [6]:
# Discount value per row: (UnitPrice * Quantity) scaled by Discount.
# NOTE(review): assumes Discount is stored as a fraction (e.g. 0.15), not a percentage — confirm.
df['DiscountAmount'] = df['UnitPrice'].mul(df['Quantity']).mul(df['Discount'])

# Customer-Level Features

## Orders per customer

In [None]:
# Number of distinct orders placed by each customer, broadcast back onto every row
# of that customer.
# nunique() is used instead of count() so the figure stays correct even if a single
# order spans several rows (e.g. one row per line item); when OrderID is already
# unique per row both give the same result.
customer_orders = df.groupby('CustomerID')['OrderID'].nunique()
df['OrdersPerCustomer'] = df['CustomerID'].map(customer_orders)

## Customer Lifetime Value

In [8]:
# Lifetime value = total Revenue over all rows belonging to a customer,
# then mapped back so each row carries its customer's total.
customer_ltv = df['Revenue'].groupby(df['CustomerID']).sum()
df['CustomerLifetimeValue'] = df['CustomerID'].map(customer_ltv)

# High / Low Revenue Flag

In [9]:
# Boolean flag: True when a row's Revenue is strictly above the dataset median.
# Rows exactly at the median, and rows with missing Revenue, are flagged False.
threshold = df['Revenue'].median()
df['HighRevenueOrder'] = df['Revenue'].gt(threshold)

# Validate New Features

In [10]:
# Summary statistics for the revenue-related columns (sanity check on ranges and counts).
df.loc[:, ['Revenue', 'RevenuePerItem', 'DiscountAmount']].describe()

Unnamed: 0,Revenue,RevenuePerItem,DiscountAmount
count,1000.0,1000.0,1000.0
mean,112527.41049,22108.9445,19680.884049
std,90902.374225,12502.727484,22173.34651
min,865.5,190.963333,0.0
25%,36313.37,11491.64625,4271.2726
50%,89377.245,22167.019111,11661.5592
75%,173184.9925,31843.9175,26366.95675
max,438092.4,78755.64,121480.749


In [11]:
# Preview the first five rows of the customer-level features.
df.loc[:, ['OrdersPerCustomer', 'CustomerLifetimeValue']].head(5)

Unnamed: 0,OrdersPerCustomer,CustomerLifetimeValue
0,6,479184.77
1,6,423637.77
2,7,845917.45
3,7,1014337.69
4,3,203380.2


# Save Feature-Engineered Data

In [12]:
# Write the feature-engineered frame to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="../data/cleaned/sales_data_features.csv", index=False)