# Data Cleaning & Validation
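This notebook demonstrates cleaning, standardizing, and validating the synthetic orders dataset: it parses the date columns, removes duplicate orders, normalizes the text columns, derives delivery-timing columns, and saves the cleaned result to `../data/cleaned/orders_cleaned.csv`.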

In [1]:
from pathlib import Path

import pandas as pd

# Read the raw orders CSV into `df`, then preview the first five rows
df = pd.read_csv(filepath_or_buffer='../data/raw/orders_raw.csv')
df.head(n=5)

Unnamed: 0,order_id,order_date,ship_date,delivery_date,region,warehouse,product_category,unit_price,quantity,order_status,delay_reason,carrier
0,ORD-ef431f0e,2024-01-29,2024-01-30,2024-02-07,North,WH-C,Apparel,37.23,1,Delivered,,Carrier B
1,ORD-84a55547,2024-07-27,2024-07-28,2024-08-01,North,WH-A,Accessories,2.54,2,Delivered,,Carrier D
2,ORD-23bf0753,2024-10-14,2024-10-16,2024-10-18,North,WH-B,Apparel,109.47,1,Delivered,,Carrier A
3,ORD-ee11e9ee,2024-12-08,2024-12-10,2024-12-17,Central,WH-D,Apparel,68.75,1,Delivered,,Carrier D
4,ORD-18d1fd7a,2024-12-18,2024-12-19,2024-12-28,North,WH-B,Office,46.49,1,Delivered,,Carrier C


In [2]:
# Convert the date columns to datetimes; values that cannot be parsed become NaT
for date_col in ('order_date', 'ship_date', 'delivery_date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Keep a single row per order_id (the first occurrence wins)
df = df.drop_duplicates(subset=['order_id'])

# Normalize the casing of the text columns and label missing values explicitly
df = df.assign(
    region=df['region'].str.title().fillna('Unknown'),
    warehouse=df['warehouse'].str.upper().fillna('UNKNOWN'),
    product_category=df['product_category'].str.title().fillna('Unknown'),
    carrier=df['carrier'].fillna('Unknown'),
)

# Derive elapsed-day columns between the order, ship and delivery dates.
# is_late flags delivered orders that took more than 7 days from order to delivery;
# rows with a missing duration compare as False.
df = df.assign(
    order_to_ship_days=lambda d: (d['ship_date'] - d['order_date']).dt.days,
    ship_to_delivery_days=lambda d: (d['delivery_date'] - d['ship_date']).dt.days,
    order_to_delivery_days=lambda d: (d['delivery_date'] - d['order_date']).dt.days,
    is_late=lambda d: d['order_to_delivery_days'].gt(7) & d['order_status'].eq('Delivered'),
)

df.head()

Unnamed: 0,order_id,order_date,ship_date,delivery_date,region,warehouse,product_category,unit_price,quantity,order_status,delay_reason,carrier,order_to_ship_days,ship_to_delivery_days,order_to_delivery_days,is_late
0,ORD-ef431f0e,2024-01-29,2024-01-30,2024-02-07,North,WH-C,Apparel,37.23,1,Delivered,,Carrier B,1,8.0,9.0,True
1,ORD-84a55547,2024-07-27,2024-07-28,2024-08-01,North,WH-A,Accessories,2.54,2,Delivered,,Carrier D,1,4.0,5.0,False
2,ORD-23bf0753,2024-10-14,2024-10-16,2024-10-18,North,WH-B,Apparel,109.47,1,Delivered,,Carrier A,2,2.0,4.0,False
3,ORD-ee11e9ee,2024-12-08,2024-12-10,2024-12-17,Central,WH-D,Apparel,68.75,1,Delivered,,Carrier D,2,7.0,9.0,True
4,ORD-18d1fd7a,2024-12-18,2024-12-19,2024-12-28,North,WH-B,Office,46.49,1,Delivered,,Carrier C,1,9.0,10.0,True


In [3]:
# Save cleaned dataset
# Define the output path once so the write and the confirmation message cannot drift apart.
cleaned_path = Path('../data/cleaned/orders_cleaned.csv')

# to_csv does not create missing parent directories, so make sure the target
# folder exists; this keeps the cell working on a fresh checkout / clean re-run.
cleaned_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the written file.
df.to_csv(cleaned_path, index=False)

# as_posix() keeps the message in forward-slash form on every platform.
print(f'Saved cleaned dataset to {cleaned_path.as_posix()}')

Saved cleaned dataset to ../data/cleaned/orders_cleaned.csv
