In [1]:
# 03_feature_engine.ipynb

import pandas as pd
import numpy as np
# Load the cleaned food-orders CSV into the working DataFrame used by every later cell.
cleaned_orders_csv = '../data_processed/food_orders_cleaned.csv'
df = pd.read_csv(cleaned_orders_csv)


Step 1: Date & time conversion

In [2]:
# Step 1: derive Order_Hour from an order-time column and attach a mock Order_Date.
# Real dataset cols: Time_Orderd (e.g., '19:50:00'), no full Timestamp
HOUR_PATTERN = r'^\s*(\d{1,2})\s*:'  # leading 1-2 digit hour followed by ':'
DEFAULT_HOUR = 12                     # Lunch default for missing/unparseable times
MOCK_HOUR_SEED = 42                   # arbitrary fixed seed -> reproducible mock fallback

# Prefer the order time; fall back to the pickup time when it is absent.
time_col = next(
    (col for col in ('Time_Orderd', 'Time_Order_picked') if col in df.columns),
    None,
)

if time_col is not None:
    # Capture the digits before the first ':' rather than slicing two characters:
    # a slice turns '9:50:00' into '9:', which fails numeric parsing and would
    # silently be replaced by the default hour. astype(str) also keeps the .str
    # accessor usable when the column was not read as strings.
    hour_text = df[time_col].astype(str).str.extract(HOUR_PATTERN, expand=False)
    order_hour = pd.to_numeric(hour_text, errors='coerce')
    # Hours outside a 24-hour clock are treated as missing.
    order_hour = order_hour.where(order_hour.between(0, 23))
else:
    # Mock fallback: uniform integer hours 8..22 (the upper bound 23 is exclusive).
    mock_rng = np.random.default_rng(MOCK_HOUR_SEED)
    order_hour = pd.Series(mock_rng.integers(8, 23, len(df)), index=df.index)

df['Order_Hour'] = order_hour.fillna(DEFAULT_HOUR).astype(int)
print("Order_Hour sample:", df['Order_Hour'].head(10).tolist())
print("Hour distribution:\n", df['Order_Hour'].value_counts().sort_index().head())

# Mock Order_Date (no date col in Kaggle → generate sequential)
# One timestamp every 15 minutes from 2024-01-01, floored to the day, so each
# calendar date is assigned to 96 consecutive rows. The values are synthetic and
# are assigned by row position.
df['Order_Date'] = pd.date_range(start='2024-01-01', periods=len(df), freq='15min').floor('D')
print("Order_Date sample:", df['Order_Date'].head())


Order_Hour sample: [22, 22, 9, 18, 13, 21, 15, 8, 10, 14]
Hour distribution:
 Order_Hour
8     6742
9     6710
10    6419
11    6570
12    6673
Name: count, dtype: int64
Order_Date sample: 0   2024-01-01
1   2024-01-01
2   2024-01-01
3   2024-01-01
4   2024-01-01
Name: Order_Date, dtype: datetime64[ns]


In [3]:
# Peek at the first five values of the Order_Time column.
order_time_preview = df['Order_Time'].head(5)
order_time_preview

0    0:00
1    0:00
2    0:00
3    0:00
4    0:00
Name: Order_Time, dtype: object

In [4]:
# Step 2: Order_Day_Type — label every order date as 'Weekday' or 'Weekend'
order_dates = df['Order_Date']
df['Day_Name'] = order_dates.dt.day_name()
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
is_weekend = order_dates.dt.dayofweek >= 5
df['Order_Day_Type'] = np.where(is_weekend, 'Weekend', 'Weekday')
print("Day_Type distribution:\n", df['Order_Day_Type'].value_counts())

Day_Type distribution:
 Order_Day_Type
Weekday    71520
Weekend    28480
Name: count, dtype: int64


In [5]:
# Step 3: Peak_Hour — 1 when Order_Hour falls in 11-13 or 19-22, else 0
peak_hours = [*range(11, 14), *range(19, 23)]
is_peak = df['Order_Hour'].isin(peak_hours)
df['Peak_Hour'] = is_peak.astype(int)
# Share of flagged rows, expressed as a percentage of all rows.
peak_pct = df['Peak_Hour'].sum() / len(df) * 100
print("Peak_Hour: True % =", peak_pct.round(1), "%")

Peak_Hour: True % = 46.7 %


In [6]:
# Step 4: Age_Group (Mock age → realistic)
# The ages are synthetic. A dedicated, seeded generator makes the cell idempotent:
# re-running it (or Restart & Run All) reproduces the same ages and group counts.
MOCK_AGE_SEED = 43  # arbitrary fixed seed
age_rng = np.random.default_rng(MOCK_AGE_SEED)

# Draw from normal(mean=32, sd=10), truncate to whole years, then clamp to 18..65.
mock_age = age_rng.normal(32, 10, len(df)).astype(int)
df['Customer_Age'] = np.clip(mock_age, 18, 65)

# Left-closed bins (right=False): [0,25) Young, [25,35) Adult, [35,50) Middle, [50,100) Senior.
bins = [0,25,35,50,100]
labels = ['Young','Adult','Middle','Senior']
df['Age_Group'] = pd.cut(df['Customer_Age'], bins=bins, labels=labels, right=False)
print("Age_Group:\n", df['Age_Group'].value_counts())

Age_Group:
 Age_Group
Adult     37848
Middle    34646
Young     23977
Senior     3529
Name: count, dtype: int64


In [7]:
# Step 5: Profit_Margin_Pct
# Single source of truth for the platform's cut, so the absolute margin and the
# percentage column cannot drift apart.
PLATFORM_MARGIN_PCT = 20.0  # Platform 20%
MOCK_VALUE_SEED = 44        # arbitrary fixed seed -> reproducible mock order values

if 'Price_of_total_order' in df.columns:
    # Use the real order totals when the dataset provides them.
    order_value = df['Price_of_total_order']
else:
    # Mock fallback: uniform values in [50, 500), generated only when needed
    # and from a seeded generator so reruns give the same numbers.
    value_rng = np.random.default_rng(MOCK_VALUE_SEED)
    order_value = pd.Series(value_rng.uniform(50, 500, len(df)), index=df.index)

df['Order_Value'] = order_value.round(2)
df['Profit_Margin'] = df['Order_Value'] * (PLATFORM_MARGIN_PCT / 100)
df['Profit_Margin_Pct'] = PLATFORM_MARGIN_PCT
print("Profit sample:", df[['Order_Value','Profit_Margin','Profit_Margin_Pct']].head())


Profit sample:    Order_Value  Profit_Margin  Profit_Margin_Pct
0       416.40         83.280               20.0
1       291.79         58.358               20.0
2       154.90         30.980               20.0
3       359.60         71.920               20.0
4       211.26         42.252               20.0


In [8]:
# Step 6: Save & Verify
# Write the full DataFrame (without the index) to the features CSV, then report.
features_csv = '../data_processed/food_orders_features.csv'
df.to_csv(features_csv, index=False)

reported_cols = ['Order_Day_Type','Peak_Hour','Age_Group','Profit_Margin_Pct']
print("✅ 03 COMPLETE! Features file ready for EDA/SQL/PowerBI")
print("New cols added:", reported_cols)

✅ 03 COMPLETE! Features file ready for EDA/SQL/PowerBI
New cols added: ['Order_Day_Type', 'Peak_Hour', 'Age_Group', 'Profit_Margin_Pct']
