## ⚠️ Important Note

This is a starter notebook provided by Zindi (the competition site). The original, along with the data used for this challenge, can be found [here](https://zindi.africa/competitions/ibm-skillsbuild-hydropower-climate-optimisation-challenge/data).

In [None]:
import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import gc
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw readings and the sample submission from the local ./data folder.
all_data_df = pd.read_csv("./data/Data.csv")
# The sample submission carries the 'ID' column that later cells split and align against.
ss = pd.read_csv("./data/SampleSubmission.csv")

In [None]:
# Print the column dtypes and non-null counts of the raw data.
all_data_df.info()

In [None]:
# Preview the first rows of the raw data.
all_data_df.head()

In [None]:
# Number of distinct values in the raw 'Source' column.
len(all_data_df["Source"].unique())

In [None]:
# Preview the first rows of the sample submission.
ss.head()

In [None]:
# Split each submission ID at its first underscore only: the part before it
# becomes 'date', everything after it becomes 'source'.
id_parts = ss["ID"].str.split("_", n=1, expand=True)
ss["date"] = id_parts[0]
ss["source"] = id_parts[1]

# Number of distinct sources requested by the sample submission.
len(ss["source"].unique())

In [None]:
# Break 'Source' into its two parts: 'consumer_device' receives the full
# 'consumer_device_<n>' label, while 'data_user' receives only the digits
# that follow '_data_user_'.
source_parts = all_data_df['Source'].str.extract(r'(consumer_device_\d+)_data_user_(\d+)')
all_data_df['consumer_device'] = source_parts[0]
all_data_df['data_user'] = source_parts[1]

# Preview the frame with the two new columns (optional)
all_data_df.head()

In [None]:
# Devices that do not appear in the test SampleSubmission.
devices_to_drop = [
    "consumer_device_3",
    "consumer_device_5",
    "consumer_device_11",
    "consumer_device_14",
    "consumer_device_15",
    "consumer_device_17",
    "consumer_device_24",
    "consumer_device_25",
    "consumer_device_27",
    "consumer_device_33",
    "consumer_device_4",
    "consumer_device_9",
]

In [None]:
# Select the rows whose 'consumer_device' appears in devices_to_drop.
# NOTE(review): this KEEPS the listed devices rather than removing them, and
# 'filtered_df' is not read by the later cells visible here - confirm whether
# an exclusion (~mask) was intended.
is_listed_device = all_data_df['consumer_device'].isin(devices_to_drop)
filtered_df = all_data_df[is_listed_device]

# Preview the selection.
filtered_df.head()

In [None]:
# Daily kwh total per Source.

import pandas as pd

# Parse 'date_time' (a no-op if it is already datetime) and derive the
# calendar day as a plain date in a new 'Date' column.
all_data_df['date_time'] = pd.to_datetime(all_data_df['date_time'])
all_data_df['Date'] = all_data_df['date_time'].dt.date

# One row per (Source, Date) pair holding the summed 'kwh'.
aggregated_data = all_data_df.groupby(['Source', 'Date'], as_index=False)['kwh'].sum()

# Preview the daily totals.
aggregated_data.head()


In [None]:
# Select exactly consumer_device_10. Matching on the prefix including the
# trailing underscore avoids also picking up devices whose number merely
# starts with "10" (e.g. consumer_device_100), which a substring match would.
consumer_10_data = aggregated_data[aggregated_data['Source'].str.startswith('consumer_device_10_')]

# One line per Source (i.e. per data_user of this device), in order of appearance.
plt.figure(figsize=(12, 6))
for data_user, user_data in consumer_10_data.groupby('Source', sort=False):
    plt.plot(user_data['Date'], user_data['kwh'], label=data_user)

plt.xlabel('Date')
plt.ylabel('kwh')
plt.title('kwh Consumption for consumer_device_10 per data_user')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# First and last calendar day present in the aggregated daily data.
date_col = aggregated_data['Date']
min_date = date_col.min()
max_date = date_col.max()

print(f"Minimum date_time: {min_date}")
print(f"Maximum date_time: {max_date}")

In [None]:
# Fill missing date values with 0 kwh

# Full daily calendar over the observed period; every Source is expanded onto it.
date_rng = pd.date_range(start=min_date, end=max_date, freq='D')

# Convert 'Date' once, up front, so it can be merged against date_rng.
daily = aggregated_data.assign(Date=pd.to_datetime(aggregated_data['Date']))

# Collect one gap-filled frame per Source and concatenate a single time at the
# end; concatenating inside the loop re-copies the accumulated frame each pass.
source_frames = []
for source, source_data in daily.groupby('Source', sort=False):
    # Complete calendar for the current Source
    source_date_rng = pd.DataFrame({'Date': date_rng})
    source_date_rng['Source'] = source

    # Left-merge keeps every calendar day; days without a reading get 0 kwh
    filled = pd.merge(source_date_rng, source_data, on=['Date', 'Source'], how='left')
    filled['kwh'] = filled['kwh'].fillna(0)
    source_frames.append(filled)

# pd.concat rejects an empty list, so fall back to an empty frame when there are no sources.
complete_data = pd.concat(source_frames, ignore_index=True) if source_frames else pd.DataFrame()

In [None]:
# Preview the gap-filled daily data.
complete_data.head()

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Function to process and forecast per unique consumer_device_x and data_user_y
def forecast_arima(all_data, forecast_horizon=30, output_template=None):
    """Forecast daily kwh for each consumer_device / data_user pair with ARIMA(5,1,0).

    Parameters
    ----------
    all_data : pandas.DataFrame
        Frame with 'Source', 'Date' and 'kwh' columns. 'Source' values are
        parsed with the pattern ``consumer_device_<n>_data_user_<m>``.
        The frame is copied, so the caller's object is not modified.
    forecast_horizon : int, default 30
        Number of daily steps to forecast after the last date of each series.
    output_template : pandas.DataFrame, optional
        Frame with an 'ID' column. When given, the forecasts are left-joined
        onto it by 'ID' (any existing 'kwh' column is dropped first) and IDs
        without a forecast get kwh = 0.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID' (``<YYYY-MM-DD>_consumer_device_<n>_data_user_<m>``) and
        'kwh', or the template's columns plus 'kwh' when a template is given.
        Series whose model fails are reported on stdout and skipped.
    """
    # Work on a copy: the steps below add columns and change the 'Date' dtype,
    # which must not leak into the caller's frame.
    data = all_data.copy()
    data['Date'] = pd.to_datetime(data['Date'])

    # Extract the numeric device and user identifiers from 'Source'
    data[['consumer_device', 'data_user']] = data['Source'].str.extract(
        r'consumer_device_(\d+)_data_user_(\d+)')

    # Each group must be in chronological order before it becomes a time series
    data = data.sort_values(by=['consumer_device', 'data_user', 'Date'])

    forecast_results = []

    for (consumer_device, data_user), group in data.groupby(["consumer_device", "data_user"]):
        # Regular daily series; dates missing from the index are filled with
        # the last known value (.ffill() replaces the deprecated
        # fillna(method='ffill')).
        series = group.set_index("Date")["kwh"].asfreq('D').ffill()

        # Best effort per series: one failing model must not abort the rest.
        try:
            fitted_model = ARIMA(series, order=(5, 1, 0)).fit()  # ARIMA(5,1,0) as a baseline
            forecast_values = fitted_model.forecast(steps=forecast_horizon)
        except Exception as e:
            print(f"Error processing {consumer_device}_{data_user}: {e}")
        else:
            # Forecast dates start the day after the last observation
            forecast_dates = pd.date_range(start=series.index[-1] + pd.Timedelta(days=1),
                                           periods=forecast_horizon, freq='D')
            forecast_results.append(pd.DataFrame({
                "ID": [f"{date.strftime('%Y-%m-%d')}_consumer_device_{consumer_device}_data_user_{data_user}"
                        for date in forecast_dates],
                "kwh": forecast_values
            }))

    # pd.concat raises on an empty list, so build an empty, correctly-typed
    # frame when no series produced a forecast.
    if forecast_results:
        forecast_df = pd.concat(forecast_results, ignore_index=True)
    else:
        forecast_df = pd.DataFrame({"ID": pd.Series(dtype="object"),
                                    "kwh": pd.Series(dtype="float64")})

    if output_template is None:
        return forecast_df

    # Align to the template's IDs. Only 'kwh' is zero-filled so that other
    # template columns are never overwritten.
    template = output_template.drop(columns=['kwh'], errors='ignore')
    final_output = template.merge(forecast_df, on='ID', how='left')
    final_output['kwh'] = final_output['kwh'].fillna(0)
    return final_output


In [None]:
# Forecast 30 days ahead for every source in the gap-filled data and align the
# result to the IDs of the sample submission.
forecast = forecast_arima(all_data=complete_data, forecast_horizon=30, output_template=ss)

In [None]:
# Preview the aligned forecast.
forecast.head()

In [None]:
# Safeguard: replace any NaN left in the 'kwh' column with 0.
# forecast_arima already zero-fills when an output template is supplied,
# so this is expected to be a no-op for the call above.
forecast["kwh"] = forecast["kwh"].fillna(0)


In [None]:
# Row counts of the gap-filled data, the forecast and the sample submission.
len(complete_data), len(forecast), len(ss)

In [None]:
from pathlib import Path

# Create the output folder first: to_csv does not create missing directories
# and would otherwise fail on a fresh checkout.
submission_path = Path("./submissions/starter_notebook_forecast.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Write only the two columns kept for the submission file, without the index.
forecast[["ID", "kwh"]].to_csv(submission_path, index=False)

In [None]:
# Compare the IDs produced by the forecast with the IDs of the sample submission.

# Sets give cheap membership tests and difference operations.
forecast_ids = set(forecast['ID'])
ss_ids = set(ss['ID'])

# One-sided differences in each direction.
forecast_only_ids = forecast_ids.difference(ss_ids)
ss_only_ids = ss_ids.difference(forecast_ids)

# Report IDs that the forecast has but the sample submission lacks.
print("IDs in 'forecast' but not in 'ss':")
print(forecast_only_ids)

# Report IDs that the sample submission has but the forecast lacks.
print("\nIDs in 'ss' but not in 'forecast':")
print(ss_only_ids)

# Total number of mismatching IDs across both directions.
print(f"\nNumber of IDs that differ: {len(forecast_only_ids) + len(ss_only_ids)}")
