# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

class SyntheticGasDataGenerator:
    """
    Generate synthetic gas usage data by scaling Kenny's baseline patterns.

    Summer variations scale the baseline by occupancy, winter variations scale
    it by square footage. Dryer timing and noise are drawn from NumPy's global
    random state, so seed it (``np.random.seed``) before generating if
    reproducible output is needed.
    """

    # Hours of the day (0-23) treated as containing cooking usage.
    COOKING_HOURS = (7, 11, 18, 19, 20)
    # Hours reduced in the winter data when the home has no stove.
    # NOTE(review): narrower than COOKING_HOURS (no 11 or 20); kept exactly as
    # the original winter logic had it - confirm whether that is intentional.
    WINTER_COOKING_PEAK_HOURS = (7, 18, 19)
    # Share of usage kept at those winter hours once cooking is removed.
    WINTER_NO_STOVE_FACTOR = 0.8
    # Hours that receive water-heating (shower) usage.
    SHOWER_HOURS = (7, 8, 19, 20, 21)
    THERMS_PER_SHOWER = 0.10
    # Evening hours in which a dryer load may start.
    DRYER_HOURS = (19, 20, 21)
    THERMS_PER_DRYER_LOAD = 0.30
    # Chance that a due dryer load starts in any one eligible hour.
    DRYER_START_PROBABILITY = 0.3
    # Non-linear cooking multipliers by household size (3 people = baseline).
    OCCUPANCY_COOKING_FACTORS = {1: 0.6, 2: 0.8, 3: 1.0, 4: 1.2, 5: 1.3}

    def __init__(self, kenny_baseline_path):
        """
        Load Kenny's baseline data (2000 sqft, 3 people, furnace+stove)

        Reads ``kenny_aug_2023.csv`` and ``kenny_jan_2024.csv`` from
        ``kenny_baseline_path``. Both files need string ``date`` and ``time``
        columns; the generator methods additionally read ``usage_therms``
        and, for winter, ``temp``.
        """
        self.kenny_summer = pd.read_csv(
            os.path.join(kenny_baseline_path, 'kenny_aug_2023.csv')
        )
        self.kenny_winter = pd.read_csv(
            os.path.join(kenny_baseline_path, 'kenny_jan_2024.csv')
        )

        # Combine date and time into datetime, and cache the hour of day
        for df in (self.kenny_summer, self.kenny_winter):
            df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
            df['hour'] = df['datetime'].dt.hour

        # Baseline parameters
        self.baseline_sqft = 2000
        self.baseline_occupancy = 3
        self.baseline_heating_rate = 0.12  # therms/hour at 50°F for 2000 sqft

    def calculate_heating_multiplier(self, temp_f):
        """
        Calculate heating multiplier based on temperature

        Returns a step value: 0.0 at 70°F and above, then 0.3, 0.6 and 0.9 for
        each 10°F band below that, and 1.2 for anything under 40°F.
        """
        if temp_f >= 70:
            return 0.0
        elif temp_f >= 60:
            return 0.3
        elif temp_f >= 50:
            return 0.6
        elif temp_f >= 40:
            return 0.9
        else:
            return 1.2

    def scale_heating(self, baseline_usage, target_sqft, temp_f):
        """
        Scale heating usage based on square footage and temperature
        Only scales the heating portion, not cooking

        Below 70°F the usage is split into an estimated heating part (capped
        at ``baseline_heating_rate * multiplier``) and a remainder treated as
        cooking; only the heating part is multiplied by
        ``target_sqft / baseline_sqft``. At 70°F and above the usage is
        returned unchanged.
        """
        sqft_ratio = target_sqft / self.baseline_sqft

        if temp_f < 70:
            # Use temperature to estimate the heating portion of the baseline
            baseline_heating_multiplier = self.calculate_heating_multiplier(temp_f)
            baseline_heating = self.baseline_heating_rate * baseline_heating_multiplier

            # Separate heating from total usage (rough estimation): anything
            # above the estimated heating load is attributed to cooking.
            cooking_component = max(0, baseline_usage - baseline_heating)
            heating_component = baseline_usage - cooking_component

            # Scale heating only
            scaled_heating = heating_component * sqft_ratio
            return scaled_heating + cooking_component
        else:
            # No heating, return as-is (summer or warm day)
            return baseline_usage

    def scale_cooking(self, baseline_usage, target_occupancy, hour):
        """
        Scale cooking usage based on occupancy

        Usage is only scaled when ``hour`` is one of ``COOKING_HOURS`` and the
        baseline usage is non-zero; otherwise it is returned unchanged.
        Occupancies outside the factor table are clamped to its nearest entry
        so that, for example, six people never scale lower than five. Values
        that still miss the table (such as 2.5) keep the baseline factor 1.0.
        """
        if hour not in self.COOKING_HOURS or baseline_usage == 0:
            return baseline_usage

        factors = self.OCCUPANCY_COOKING_FACTORS
        factor = factors.get(target_occupancy)
        if factor is None:
            try:
                nearest = min(max(target_occupancy, min(factors)), max(factors))
            except TypeError:
                # Not comparable to a number: fall back to the baseline factor
                factor = 1.0
            else:
                factor = factors.get(nearest, 1.0)
        return baseline_usage * factor

    def scale_water_heating(self, target_occupancy, hour):
        """
        Add water heating usage based on occupancy

        Each person is assumed to shower once per day, and those showers are
        spread evenly over ``SHOWER_HOURS``. Returns the therms to add for
        ``hour`` (0.0 outside the shower hours).
        """
        if hour not in self.SHOWER_HOURS:
            return 0.0

        showers_per_hour = target_occupancy / len(self.SHOWER_HOURS)
        return showers_per_hour * self.THERMS_PER_SHOWER

    def add_dryer_usage(self, df, target_occupancy):
        """
        Add gas dryer usage events
        Frequency based on occupancy: (occupancy/2) loads per week

        Rows are visited in order. Once a load is due, each evening hour in
        ``DRYER_HOURS`` has a ``DRYER_START_PROBABILITY`` chance of receiving
        it; after a load the next one becomes due ``7 / loads_per_week`` days
        later. ``df`` is modified in place and also returned.
        """
        if df.empty or target_occupancy <= 0:
            # Nothing to schedule (and avoids dividing by zero loads per week)
            return df

        loads_per_week = target_occupancy / 2
        days_between_loads = 7 / loads_per_week

        # Count whole days from the earliest date rather than using the day
        # of the month, so the schedule survives a month boundary.
        dates = df['datetime'].dt.normalize()
        day_offsets = (dates - dates.min()).dt.days

        usage_col = df.columns.get_loc('usage_therms')
        next_dryer_day = 0

        for pos, (day, hour) in enumerate(zip(day_offsets, df['hour'])):
            if day < next_dryer_day or hour not in self.DRYER_HOURS:
                continue
            if np.random.random() < self.DRYER_START_PROBABILITY:
                df.iat[pos, usage_col] += self.THERMS_PER_DRYER_LOAD
                # Keep iterating: later loads are scheduled from this one
                next_dryer_day = day + days_between_loads

        return df

    def _finalize_and_save(self, df, output_path):
        """Apply noise, recompute avg_usage from the final values, and save."""
        # Multiplicative Gaussian noise (mean 1.0, std 0.10), floored at zero
        noise = np.random.normal(1.0, 0.10, len(df))
        df['usage_therms'] = (df['usage_therms'] * noise).clip(lower=0).round(4)

        # avg_usage is derived after the noise so it matches the saved usage
        dates = df['datetime'].dt.date
        daily_totals = df.groupby(dates)['usage_therms'].sum()
        df['avg_usage'] = dates.map(daily_totals) / 24

        # to_csv does not create missing directories, so do it here
        if isinstance(output_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        df.to_csv(output_path, index=False)
        print(f"Generated: {output_path}")
        return df

    def generate_summer_variation(self, target_occupancy, appliances, output_path):
        """
        Generate summer data with occupancy scaling
        Fixed at 2000 sqft (Kenny's baseline)

        ``appliances`` is a '+'-joined string such as 'furnace+stove'; each
        appliance is detected with a membership test on it. The result is
        written to ``output_path`` as CSV and returned as a DataFrame. The
        loaded baseline is not modified.
        """
        df = self.kenny_summer.copy()

        if 'stove' in appliances:
            # Scale cooking based on occupancy
            df['usage_therms'] = [
                self.scale_cooking(usage, target_occupancy, hour)
                for usage, hour in zip(df['usage_therms'], df['hour'])
            ]
        else:
            # Remove cooking if no stove
            df.loc[df['hour'].isin(list(self.COOKING_HOURS)), 'usage_therms'] = 0.0

        # Add water heating if present
        if 'water_heater' in appliances:
            df['usage_therms'] += df['hour'].apply(
                lambda h: self.scale_water_heating(target_occupancy, h)
            )

        # Add dryer if present
        if 'dryer' in appliances:
            df = self.add_dryer_usage(df, target_occupancy)

        # Update metadata
        df['occupancy'] = target_occupancy
        df['appliances'] = appliances

        return self._finalize_and_save(df, output_path)

    def generate_winter_variation(self, target_sqft, appliances, output_path):
        """
        Generate winter data with square footage scaling
        Fixed at 3 people (Kenny's baseline)

        ``appliances`` is a '+'-joined string such as 'furnace+stove'. The
        result is written to ``output_path`` as CSV and returned as a
        DataFrame. The loaded baseline is not modified.
        """
        df = self.kenny_winter.copy()

        # Scale heating based on square footage
        df['usage_therms'] = [
            self.scale_heating(usage, target_sqft, temp)
            for usage, temp in zip(df['usage_therms'], df['temp'])
        ]

        if 'stove' not in appliances:
            # Reduce usage at cooking hours (remove cooking component)
            peak_rows = df['hour'].isin(list(self.WINTER_COOKING_PEAK_HOURS))
            df.loc[peak_rows, 'usage_therms'] *= self.WINTER_NO_STOVE_FACTOR

        # NOTE(review): no water heating is added in winter on the assumption
        # that Kenny's winter data already includes it, whereas the summer
        # variation adds it explicitly - confirm against the baseline data.

        if 'dryer' in appliances:
            df = self.add_dryer_usage(df, self.baseline_occupancy)

        # Update metadata
        df['home_sqft'] = target_sqft
        df['appliances'] = appliances

        return self._finalize_and_save(df, output_path)

    def generate_all_variations(self, output_base_path):
        """
        Generate all common variations from Kenny's baseline

        Writes one CSV per (occupancy, appliance combo) for summer and one per
        (square footage, appliance combo) for winter into ``output_base_path``,
        creating that directory if needed.
        """
        os.makedirs(output_base_path, exist_ok=True)

        appliance_combos = [
            'furnace+stove',
            'furnace+water_heater+stove',
            'furnace+dryer',
            'furnace+water_heater+stove+dryer'
        ]

        print("Generating Summer Variations (Occupancy Scaling)...")
        print("=" * 60)

        # Summer variations - different occupancies
        for occupancy in [1, 2, 3, 4, 5]:
            for appliances in appliance_combos:
                output_file = f"{output_base_path}/summer_{occupancy}people_{appliances.replace('+', '_')}.csv"
                self.generate_summer_variation(occupancy, appliances, output_file)

        print("\nGenerating Winter Variations (Square Footage Scaling)...")
        print("=" * 60)

        # Winter variations - different square footages (1000 to 3000 by 200)
        for sqft in range(1000, 3001, 200):
            for appliances in appliance_combos:
                output_file = f"{output_base_path}/winter_{sqft}sqft_{appliances.replace('+', '_')}.csv"
                self.generate_winter_variation(sqft, appliances, output_file)

        print("\n" + "=" * 60)
        print("Generation Complete!")
        print(f"All variations saved to: {output_base_path}/")

# Example usage
if __name__ == "__main__":
    # Initialize with Kenny's baseline data path
    generator = SyntheticGasDataGenerator('kenny_baseline_data')

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before any sample file is written.
    os.makedirs('synthetic_output', exist_ok=True)

    # Generate specific variations
    print("Generating sample variations...\n")

    # Summer: 2 people, furnace+stove
    generator.generate_summer_variation(
        target_occupancy=2,
        appliances='furnace+stove',
        output_path='synthetic_output/summer_2people_furnace_stove.csv'
    )

    # Winter: 1800 sqft, furnace+stove
    generator.generate_winter_variation(
        target_sqft=1800,
        appliances='furnace+stove',
        output_path='synthetic_output/winter_1800sqft_furnace_stove.csv'
    )

    # To generate ALL variations, uncomment:
    # generator.generate_all_variations('synthetic_output')