## 1. Setup & Load Data from Previous Notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide display and plotting defaults.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Load the sales data from the raw CSV. The frame keeps the name `df_clean`
# because the following cells refer to it under that name.
try:
    df_clean = pd.read_csv('../data/raw/sales_data.csv')

    # Use the first column whose name mentions "date" as the timestamp source.
    # (Column labels are looked up by name; an Index cannot be indexed with a
    # string, so the label is used directly on the DataFrame.)
    date_candidates = [col for col in df_clean.columns if 'date' in str(col).lower()]
    if not date_candidates:
        raise ValueError("no column with 'date' in its name was found")
    df_clean['date'] = pd.to_datetime(df_clean[date_candidates[0]])
    print("✓ Data loaded successfully")
except Exception as load_error:
    # Best-effort cell: keep the notebook running and guide the user, but
    # report what actually went wrong instead of hiding it.
    print("⚠ Could not load data automatically. Please ensure the data is in ../data/raw/")
    print("   You can rerun the previous notebook (01_data_exploration.ipynb) first.")
    print(f"   Reason: {type(load_error).__name__}: {load_error}")

## 2. Create Time Series Features

In [None]:
# Ensure we have daily aggregated data
# The sales column is the first one whose name mentions "sales" or "amount".
sales_col = [col for col in df_clean.columns if 'sales' in col.lower() or 'amount' in col.lower()][0]
date_col = 'date'

# Create daily time series.
# Group on the calendar day rather than the raw timestamp: if the source
# timestamps carry a time-of-day, grouping on them directly would produce one
# row per distinct timestamp, and the daily reindex below would then replace
# every row that does not fall exactly on the generated grid with 0.
day_key = pd.to_datetime(df_clean[date_col]).dt.normalize()
daily_sales = df_clean.groupby(day_key)[sales_col].sum().reset_index()
daily_sales.columns = ['date', 'sales']
daily_sales = daily_sales.sort_values('date').reset_index(drop=True)

# Create complete date range (fill missing dates).
# Days without any record are given sales = 0.
date_range = pd.date_range(start=daily_sales['date'].min(), end=daily_sales['date'].max(), freq='D')
daily_sales = daily_sales.set_index('date').reindex(date_range, fill_value=0).reset_index()
daily_sales.columns = ['date', 'sales']

print(f"✓ Created daily time series: {len(daily_sales)} days")
print(f"Date range: {daily_sales['date'].min().date()} to {daily_sales['date'].max().date()}")

## 3. Time-Based Features

In [None]:
# Calendar components read off the date column through its .dt accessor.
date_accessor = daily_sales['date'].dt
calendar_components = {
    'year': date_accessor.year,
    'month': date_accessor.month,
    'day': date_accessor.day,
    'day_of_week': date_accessor.dayofweek,  # 0=Monday, 6=Sunday
    'day_name': date_accessor.day_name(),
    'quarter': date_accessor.quarter,
    'week_of_year': date_accessor.isocalendar().week,
    'day_of_year': date_accessor.dayofyear,
}
for component_name, component_values in calendar_components.items():
    daily_sales[component_name] = component_values

# 0/1 indicator columns: weekend (Saturday/Sunday) and period boundaries.
daily_sales['is_weekend'] = daily_sales['day_of_week'].ge(5).astype(int)
for boundary in (
    'month_start',
    'month_end',
    'quarter_start',
    'quarter_end',
    'year_start',
    'year_end',
):
    daily_sales[f'is_{boundary}'] = getattr(date_accessor, f'is_{boundary}').astype(int)

print("✓ Created time-based features")
# The two non-feature columns are 'date' and 'sales'.
feature_count = len(daily_sales.columns) - 2
print(f"\nFeatures created: {feature_count} features")
print(daily_sales.head())

## 4. Rolling Window Features

In [None]:
# Trailing-window statistics of daily sales. min_periods=1 lets the first rows
# use however many observations are available so far.
# NOTE(review): each window ends on the current row, so these values include
# the same day's sales - confirm this is intended before using them as
# predictors of that day's sales.
sales_series = daily_sales['sales']

# Rolling averages
for span in (7, 14, 30, 90, 365):
    daily_sales[f'rolling_mean_{span}'] = sales_series.rolling(window=span, min_periods=1).mean()

# Rolling standard deviations (volatility)
for span in (7, 30, 90):
    daily_sales[f'rolling_std_{span}'] = sales_series.rolling(window=span, min_periods=1).std()

# Rolling min and max over the last week, and the spread between them
weekly_window = sales_series.rolling(window=7, min_periods=1)
daily_sales['rolling_min_7'] = weekly_window.min()
daily_sales['rolling_max_7'] = weekly_window.max()
daily_sales['rolling_range_7'] = daily_sales['rolling_max_7'].sub(daily_sales['rolling_min_7'])

print("✓ Created rolling window features")
print(f"Rolling features: 7-day, 14-day, 30-day, 90-day, 365-day windows")

## 5. Lag Features

In [None]:
# Lag features: sales 1, 7, 14 and 30 days earlier, plus the year-over-year
# value 365 days earlier. The first `lag` rows of each column are NaN.
for lag in [1, 7, 14, 30, 365]:
    daily_sales[f'lag_{lag}'] = daily_sales['sales'].shift(lag)

# Differences (absolute change versus 1, 7 and 30 days earlier)
for periods in [1, 7, 30]:
    daily_sales[f'diff_{periods}'] = daily_sales['sales'].diff(periods)

# Percentage change versus 1 and 7 days earlier.
# Dates missing from the source were filled with sales = 0, so the
# denominator can be zero and pct_change then yields +/-inf. Infinite values
# are not treated as missing by fillna(), so turn them into NaN here and let
# the imputation step handle them like any other gap.
for periods in [1, 7]:
    daily_sales[f'pct_change_{periods}'] = (
        daily_sales['sales'].pct_change(periods).replace([np.inf, -np.inf], np.nan)
    )

print("✓ Created lag features")
print(f"Lags: 1, 7, 14, 30, 365 days")

## 6. Seasonal Decomposition Features

In [None]:
# For each calendar bucket (month, day of week, quarter): store the bucket's
# mean sales on every row, and the ratio of the row's sales to that mean.
seasonal_groupings = (
    ('month', 'monthly'),
    ('day_of_week', 'dow'),
    ('quarter', 'quarter'),
)
seasonal_averages = {}
for bucket_col, prefix in seasonal_groupings:
    bucket_means = daily_sales.groupby(bucket_col)['sales'].mean().to_dict()
    seasonal_averages[prefix] = bucket_means
    daily_sales[f'{prefix}_avg'] = daily_sales[bucket_col].map(bucket_means)
    daily_sales[f'{prefix}_seasonality'] = daily_sales['sales'].div(daily_sales[f'{prefix}_avg'])

# Keep the bucket -> mean lookup tables available under their usual names.
monthly_avg = seasonal_averages['monthly']
dow_avg = seasonal_averages['dow']
quarter_avg = seasonal_averages['quarter']

print("✓ Created seasonal decomposition features")
print("Features: monthly_seasonality, dow_seasonality, quarter_seasonality")

## 7. Holiday & Special Event Features

In [None]:
# Define holidays (customize based on your region/business)
# Format: (month, day) tuple.
# For holidays whose date moves from year to year (labor_day, black_friday,
# cyber_monday) the tuple is only a nominal date; the actual flag is computed
# from the weekday rules in `floating_holiday_rules` below.
holidays = {
    'new_year': (1, 1),
    'valentine_day': (2, 14),
    'march_madness': (3, 15),  # Approximate
    'independence_day': (7, 4),
    'labor_day': (9, 1),  # Nominal; first Monday of September
    'black_friday': (11, 27),  # Nominal; day after the 4th Thursday of November
    'cyber_monday': (11, 30),  # Nominal; Monday after the 4th Thursday of November
    'christmas': (12, 25),
    'boxing_day': (12, 26),
}

cal_month = daily_sales['month']
cal_day = daily_sales['day']
weekday = daily_sales['date'].dt.dayofweek  # 0=Monday, 6=Sunday

# Weekday-based rules, evaluated per row so every year gets its own date.
# The 4th Thursday of November always falls on Nov 22-28.
floating_holiday_rules = {
    # First Monday of September: the Monday within the first 7 days.
    'labor_day': (cal_month == 9) & (weekday == 0) & (cal_day <= 7),
    # Friday right after the 4th Thursday: the Friday within Nov 23-29.
    'black_friday': (cal_month == 11) & (weekday == 4) & cal_day.between(23, 29),
    # Monday four days after the 4th Thursday: the Monday within Nov 26 - Dec 2.
    'cyber_monday': (weekday == 0) & (
        ((cal_month == 11) & (cal_day >= 26)) | ((cal_month == 12) & (cal_day <= 2))
    ),
}

# Create holiday flags (1 on the holiday, 0 otherwise)
for holiday_name, (month, day) in holidays.items():
    if holiday_name in floating_holiday_rules:
        on_holiday = floating_holiday_rules[holiday_name]
    else:
        on_holiday = (cal_month == month) & (cal_day == day)
    daily_sales[f'is_{holiday_name}'] = on_holiday.astype(int)

# Days before/after major holidays
major_holidays = ['christmas', 'black_friday']
holiday_window_days = 3  # days flagged on each side of the holiday
for holiday in major_holidays:
    holiday_mask = daily_sales[f'is_{holiday}'] == 1
    window_mask = holiday_mask.copy()
    for offset in range(1, holiday_window_days + 1):
        # fill_value=False keeps the shifted series boolean; without it the
        # rows shifted in at the edges are NaN and the series becomes object.
        window_mask |= holiday_mask.shift(offset, fill_value=False)
        window_mask |= holiday_mask.shift(-offset, fill_value=False)
    daily_sales[f'{holiday}_window'] = window_mask.astype(int)

print("✓ Created holiday features")
print(f"Holidays tracked: {', '.join(holidays.keys())}")

## 8. Trend Features

In [None]:
# Calculate trend using linear regression on rolling windows
from scipy.stats import linregress

def calculate_trend(series, window=30):
    """Return the slope of a straight-line fit over a trailing window, per row.

    For every position the fit uses the current value and up to ``window - 1``
    values before it, regressed against 0, 1, 2, ... as the x axis. Positions
    with fewer than two points available get a slope of 0.

    Args:
        series: pandas Series of numeric observations.
        window: maximum number of observations in each fit.

    Returns:
        pandas Series of slopes carrying the same index as ``series``.
    """
    observations = series.values
    slopes = []
    for end in range(1, len(observations) + 1):
        # Slice of at most `window` values ending at the current row.
        start = max(0, end - window)
        segment = observations[start:end]

        if len(segment) > 1:
            fit = linregress(np.arange(len(segment)), segment)
            slopes.append(fit.slope)
        else:
            # A single point has no defined slope.
            slopes.append(0)

    return pd.Series(slopes, index=series.index)

# Calculate trend features: slope of a trailing linear fit over 7/30/90 days
daily_sales['trend_7'] = calculate_trend(daily_sales['sales'], window=7)
daily_sales['trend_30'] = calculate_trend(daily_sales['sales'], window=30)
daily_sales['trend_90'] = calculate_trend(daily_sales['sales'], window=90)

# Length of the current run of consecutive non-decreasing days.
# A day counts as non-decreasing when diff_1 >= 0 (the first row, whose
# diff_1 is NaN, does not). Every day that breaks the run starts a new group,
# and the counter is accumulated within each group so it resets to 0 there
# instead of carrying the total from all earlier runs.
non_decreasing = daily_sales['diff_1'] >= 0
run_id = (~non_decreasing).cumsum()
daily_sales['days_increasing'] = non_decreasing.astype(int).groupby(run_id).cumsum()

print("✓ Created trend features")

## 9. Feature Summary & Visualization

In [None]:
# Display feature summary
print("\n=== FEATURE ENGINEERING SUMMARY ===")
# The two non-feature columns are 'date' and 'sales'.
print(f"\nTotal features created: {len(daily_sales.columns) - 2}")
print(f"Dataset shape: {daily_sales.shape}")

# Count features per category from the actual column names so the summary
# stays correct when features are added or removed.
time_component_cols = {
    'year', 'month', 'day', 'day_of_week', 'day_name',
    'quarter', 'week_of_year', 'day_of_year',
}
calendar_flag_cols = {
    'is_weekend', 'is_month_start', 'is_month_end', 'is_quarter_start',
    'is_quarter_end', 'is_year_start', 'is_year_end',
}
feature_cols = [col for col in daily_sales.columns if col not in ('date', 'sales')]

category_counts = {
    'Time components': sum(col in time_component_cols for col in feature_cols),
    'Calendar flags': sum(col in calendar_flag_cols for col in feature_cols),
    'Rolling windows': sum(col.startswith('rolling_') for col in feature_cols),
    'Lag features': sum(
        col.startswith(('lag_', 'diff_', 'pct_change_')) for col in feature_cols
    ),
    'Seasonal features': sum(
        col.endswith(('_avg', '_seasonality')) for col in feature_cols
    ),
    # Holiday columns are the is_<holiday> flags (excluding the calendar
    # flags above) and the <holiday>_window columns.
    'Holiday features': sum(
        (col.startswith('is_') and col not in calendar_flag_cols) or col.endswith('_window')
        for col in feature_cols
    ),
    'Trend features': sum(
        col.startswith('trend_') or col == 'days_increasing' for col in feature_cols
    ),
}

print(f"\nFeature Categories:")
for category, count in category_counts.items():
    print(f"  - {category}: {count} features")

print(f"\nFirst few rows with features:")
print(daily_sales.head(10))

In [None]:
# Four-panel overview of the engineered features, saved as a PNG.
# Key features of interest (kept as a reference list).
key_features = [
    'sales',
    'rolling_mean_7',
    'rolling_mean_30',
    'lag_7',
    'lag_30',
    'trend_30',
    'is_weekend'
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_rolling, ax_trend), (ax_weekend, ax_lags) = axes
dates = daily_sales['date']

# Top-left: daily sales with the 7- and 30-day moving averages on top
ax_rolling.plot(dates, daily_sales['sales'], label='Daily Sales', alpha=0.5, linewidth=1)
for ma_column, ma_label in (('rolling_mean_7', '7-day MA'), ('rolling_mean_30', '30-day MA')):
    ax_rolling.plot(dates, daily_sales[ma_column], label=ma_label, linewidth=2)
ax_rolling.set_title('Rolling Averages', fontsize=12, fontweight='bold')
ax_rolling.set_ylabel('Sales')
ax_rolling.legend()
ax_rolling.grid(True, alpha=0.3)

# Top-right: 30-day trend slope, shaded green above zero and red below
trend_30 = daily_sales['trend_30']
ax_trend.plot(dates, trend_30, linewidth=2, color='green')
ax_trend.axhline(y=0, color='red', linestyle='--', alpha=0.5)
for region, shade, region_label in (
    (trend_30 >= 0, 'green', 'Uptrend'),
    (trend_30 < 0, 'red', 'Downtrend'),
):
    ax_trend.fill_between(dates, 0, trend_30,
                          where=region, alpha=0.3, color=shade, label=region_label)
ax_trend.set_title('30-Day Trend', fontsize=12, fontweight='bold')
ax_trend.set_ylabel('Trend Coefficient')
ax_trend.legend()
ax_trend.grid(True, alpha=0.3)

# Bottom-left: mean sales on weekdays versus weekends
weekend_flag = daily_sales['is_weekend']
weekend_sales = daily_sales.loc[weekend_flag == 1, 'sales'].mean()
weekday_sales = daily_sales.loc[weekend_flag == 0, 'sales'].mean()
mean_by_day_type = [weekday_sales, weekend_sales]
ax_weekend.bar(['Weekday', 'Weekend'], mean_by_day_type, color=['steelblue', 'coral'], edgecolor='black')
ax_weekend.set_title('Average Sales: Weekday vs Weekend', fontsize=12, fontweight='bold')
ax_weekend.set_ylabel('Average Sales')
for i, v in enumerate(mean_by_day_type):
    # Value label just above each bar
    ax_weekend.text(i, v + 1, f'${v:.0f}', ha='center', va='bottom')

# Bottom-right: correlation of sales with each of its lagged copies
lag_correlations = [
    daily_sales['sales'].corr(daily_sales[f'lag_{lag_days}'])
    for lag_days in (1, 7, 14, 30)
]
ax_lags.bar(['1-day', '7-day', '14-day', '30-day'], lag_correlations, color='lightgreen', edgecolor='black')
ax_lags.set_title('Lag Feature Correlations with Sales', fontsize=12, fontweight='bold')
ax_lags.set_ylabel('Correlation')
ax_lags.set_ylim(0, 1)
for i, v in enumerate(lag_correlations):
    ax_lags.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('../outputs/03_feature_engineering.png', dpi=300, bbox_inches='tight')
plt.show()
print("✓ Saved: 03_feature_engineering.png")

## 10. Handle Missing Values & Save Engineered Data

In [None]:
# Handle missing values created by lag and rolling features.
# Infinite values (e.g. a percentage change from a zero-sales day) are not
# counted by isnull() nor replaced by fillna(), so convert them to NaN first
# and impute them together with the real gaps.
float_cols = daily_sales.select_dtypes(include=[np.floating]).columns
daily_sales_finite = daily_sales.copy()
daily_sales_finite[float_cols] = daily_sales_finite[float_cols].replace([np.inf, -np.inf], np.nan)

print(f"Missing values before imputation: {daily_sales_finite.isnull().sum().sum()}")

# Backward fill first (shift/diff leave their NaNs at the start of each
# column), then forward fill anything still missing, then fall back to 0 for
# columns with no valid value at all.
# .bfill()/.ffill() replace fillna(method=...), which is deprecated in pandas.
daily_sales_filled = daily_sales_finite.bfill().ffill().fillna(0)

print(f"Missing values after imputation: {daily_sales_filled.isnull().sum().sum()}")
print("\n✓ All missing values handled")

In [None]:
# Save the engineered features
from pathlib import Path

output_path = Path('../data/processed/engineered_features.csv')
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
daily_sales_filled.to_csv(output_path, index=False)

print(f"✓ Saved engineered features to: data/processed/engineered_features.csv")
print(f"\nDataset ready for modeling:")
print(f"  - Rows: {len(daily_sales_filled)}")
print(f"  - Columns: {len(daily_sales_filled.columns)}")
print(f"  - Date range: {daily_sales_filled['date'].min()} to {daily_sales_filled['date'].max()}")
print(f"\n✓ Feature engineering complete! Ready for model training.")