## FEATURE ENGINEERING

In [35]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"E:\food data set\archive (1)\food_wastage_data.csv")

print("="*70)
print("FEATURE ENGINEERING - Creating New Features")
print("="*70)

# 1. Wastage per Guest
if "Number of Guests" in df.columns:
    df['Wastage_per_guest'] = df["Wastage Food Amount"] / (df["Number of Guests"] + 1)
    print("\nCreated: Wastage_per_guest")

# 2. Wastage percentage 
if "Quantity of Food" in df.columns:
    df['Wastage_Percentage'] = (df["Wastage Food Amount"] / df['Quantity of Food']) * 100
    print("\nCreated: Wastage_percentage")

# 3. Food per guest 
if "Number of Guests" in df.columns and "Quantity of Food" in df.columns:
    df["Food per guest"] = df["Quantity of Food"] / (df["Number of Guests"] + 1)
    print("\nCreated : Food_per_guest")

# 4. Wastage Category (Low, Medium, High)
wastage_col = "Quantity of Food"
df["Wastage category"] = pd.cut(df[wastage_col], 
                                bins=[0, 25, 40, 100],
                                labels=['Low', 'Medium', 'High'])
print("\nCreated: Wastage_Category")

# 5. Guest Category
if 'Number of Guests' in df.columns:
    df['Event_Size'] = pd.cut(df['Number of Guests'],
                              bins=[0, 10, 30, 100],
                              labels=['Small', 'Medium', 'Large'])
print("\nCreated: Event_Size")

print(df.columns.tolist())


print("\n" + "="*70)
print(f"\nTotal columns now: {df.shape[1]}")
print(df[['Wastage Food Amount', 'Wastage_per_guest', 'Wastage_Percentage','Wastage category', 'Event_Size']].head())

FEATURE ENGINEERING - Creating New Features

Created: Wastage_per_guest

Created: Wastage_percentage

Created : Food_per_guest

Created: Wastage_Category

Created: Event_Size
['Type of Food', 'Number of Guests', 'Event Type', 'Quantity of Food', 'Storage Conditions', 'Purchase History', 'Seasonality', 'Preparation Method', 'Geographical Location', 'Pricing', 'Wastage Food Amount', 'Wastage_per_guest', 'Wastage_Percentage', 'Food per guest', 'Wastage category', 'Event_Size']


Total columns now: 16
   Wastage Food Amount  Wastage_per_guest  Wastage_Percentage  \
0                   25           0.080386            5.555556   
1                   40           0.099751            8.000000   
2                   27           0.089109            7.277628   
3                   32           0.065041            6.438632   
4                   25           0.083056            6.250000   

  Wastage category Event_Size  
0              NaN        NaN  
1              NaN        NaN  
2         

## Feature Selection


In [42]:
# Calculate correlation with target for ALL features (including new ones)
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
target_col = 'Wastage Food Amount'

# Remove target from features list

features = [col for col in numerical_cols if col != target_col]

# Calculations correlations 
correlations = {}
for features in features:
    corr = df[features].corr(df[target_col])
    correlations[features] = abs(corr) 

# Sort by importance
feature_importance = sorted(correlations.items(), key=lambda x: x[1], reverse=True)

print("\n" + "="*70)
print("FEATURE IMPORTANCE RANKING")
print("="*70)
print("\nFeatures ranked by correlation with target:\n")
for i, (feature, importance) in enumerate(feature_importance, 1):
    stars = "⭐" * int(importance * 10)
    print(f"{i:2d}. {feature:30s} | Correlation: {importance:.3f} {stars}")


# Select top features
top_features = [feat for feat, corr in feature_importance if corr > 0.3]
print(f"\n Top Features Selected: {top_features}")


FEATURE IMPORTANCE RANKING

Features ranked by correlation with target:

 1. Wastage_Percentage             | Correlation: 0.908 ⭐⭐⭐⭐⭐⭐⭐⭐⭐
 2. Wastage_per_guest              | Correlation: 0.810 ⭐⭐⭐⭐⭐⭐⭐⭐
 3. Number of Guests               | Correlation: 0.650 ⭐⭐⭐⭐⭐⭐
 4. Quantity of Food               | Correlation: 0.639 ⭐⭐⭐⭐⭐⭐
 5. Food per guest                 | Correlation: 0.240 ⭐⭐

 Top Features Selected: ['Wastage_Percentage', 'Wastage_per_guest', 'Number of Guests', 'Quantity of Food']
