# In [1]:
import pandas as pd
import numpy as np

# Load the training and test tables from CSV files; the relative paths are
# resolved against the current working directory.
train_data, test_data = map(pd.read_csv, ('train_data.csv', 'test_data.csv'))

# In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import plotly.graph_objects as go

def create_features(df, date_column, hour_column, predictor_columns):
    """Build the model feature matrix from a table keyed by date and hour-ending.

    The hour column is read as "hour ending" (1..24): hour-ending ``h`` is
    mapped to the timestamp at which that hour starts, ``date + (h - 1) hours``.
    Hour-ending 24 therefore maps to 23:00 on the *same* calendar date, so all
    24 rows of a day share that day's ``dayofweek`` and ``month``.

    The input frame is not modified; in particular the original hour column is
    left untouched even when it is itself named ``'hour'``.

    Args:
        df: Source table containing ``date_column``, ``hour_column`` and every
            column named in ``predictor_columns``.
        date_column: Name of the column holding values ``pd.to_datetime`` can parse.
        hour_column: Name of the numeric hour-ending column.
        predictor_columns: Names of additional columns to pass through as features.

    Returns:
        A new DataFrame with columns ``['dayofweek', 'hour', 'month']`` followed
        by ``predictor_columns``, indexed like ``df``. ``hour`` is the derived
        hour-of-day (0..23 for hour-ending 1..24).
    """
    # Start of the hour-ending interval; no extra day shift is needed because
    # subtracting one hour already keeps hour-ending 24 inside the same date.
    interval_start = pd.to_datetime(df[date_column]) + pd.to_timedelta(df[hour_column] - 1, unit='h')

    # assign() works on a copy, so overwriting a raw 'hour' column here never
    # leaks back into the caller's frame.
    enriched = df.assign(
        dayofweek=interval_start.dt.dayofweek,
        hour=interval_start.dt.hour,
        month=interval_start.dt.month,
    )
    return enriched[['dayofweek', 'hour', 'month'] + list(predictor_columns)]

def adjust_predictions_to_target_average(predictions, date_column, target_prediction_column, date, target_average_load):
    """Shift one day's predictions so that their mean equals a target value.

    Every prediction on ``date`` is moved by the same constant,
    ``target_average_load - current_mean``, which preserves the shape of the
    daily profile while making its average exactly ``target_average_load``.
    Rows on other dates are left unchanged.

    The input frame is not modified. Both returned frames carry a ``'date'``
    column of ``datetime.date`` values derived from ``date_column`` (replacing
    that column in the copies if it is itself named ``'date'``).

    Args:
        predictions: Table containing ``date_column`` and ``target_prediction_column``.
        date_column: Name of the column holding values ``pd.to_datetime`` can parse.
        target_prediction_column: Name of the numeric column to adjust.
        date: The day to adjust, in any form ``pd.to_datetime`` accepts.
        target_average_load: Desired mean of the adjusted predictions on ``date``.

    Returns:
        A tuple ``(adjusted_predictions, adjusted_day_predictions)``: the full
        adjusted copy and the subset of it for ``date``. If no row falls on
        ``date`` the copy is returned unadjusted together with an empty subset.
    """
    target_day = pd.to_datetime(date).date()

    adjusted_predictions = predictions.copy()
    adjusted_predictions['date'] = pd.to_datetime(adjusted_predictions[date_column]).dt.date
    on_target_day = adjusted_predictions['date'] == target_day

    if on_target_day.any():
        current_average = adjusted_predictions.loc[on_target_day, target_prediction_column].mean()
        # The gap between the two means is already a per-row offset: adding it
        # to every row moves the mean by exactly that amount. Dividing it by
        # the row count would leave the mean short of the target.
        adjusted_predictions.loc[on_target_day, target_prediction_column] += target_average_load - current_average

    # Return both the adjusted predictions and the subset for the specified day
    return adjusted_predictions, adjusted_predictions[on_target_day]

def visualize_predictions(predictions, adjusted_predictions, date_column, target_prediction_column, date, mae, mse, training_window, target_average_load):
    """Plot one day's initial and adjusted predictions as two line traces.

    Args:
        predictions: Table with a ``'datetime'`` column and
            ``target_prediction_column``; the day's rows are picked by the
            calendar date of ``'datetime'``.
        adjusted_predictions: Table with ``'datetime'``,
            ``target_prediction_column`` and a ``'date'`` column of
            ``datetime.date`` values; the day's rows are picked via ``'date'``.
        date_column: Accepted for signature compatibility; not used here.
        target_prediction_column: Name of the column plotted on the y axis.
        date: Day to display, in any form ``pd.to_datetime`` accepts.
        mae: Value shown as "MAE" in the title, formatted to two decimals.
        mse: Value shown as "MSE" in the title, formatted to two decimals.
        training_window: Text shown as the training window in the title.
        target_average_load: Value shown as the inputted average in the title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    target_day = pd.to_datetime(date).date()

    # Sort chronologically so the points of each trace are drawn in time order
    initial_sorted = predictions.sort_values(by='datetime')
    adjusted_sorted = adjusted_predictions.sort_values(by='datetime')

    initial_day = initial_sorted[initial_sorted['datetime'].dt.date == target_day]
    adjusted_day = adjusted_sorted[adjusted_sorted['date'] == target_day]

    traces = [
        go.Scatter(
            x=initial_day['datetime'],
            y=initial_day[target_prediction_column],
            mode='lines+markers',
            name='Initial Prediction',
        ),
        go.Scatter(
            x=adjusted_day['datetime'],
            y=adjusted_day[target_prediction_column],
            mode='lines+markers',
            name='Adjusted Prediction',
        ),
    ]

    title_text = f"<b>Predictions for {target_prediction_column} on {date}</b><br>MAE: {mae:.2f}, MSE: {mse:.2f}, Inputted Avg: {target_average_load}, Training Window: {training_window}"
    figure = go.Figure(
        data=traces,
        layout=go.Layout(title=title_text, xaxis_title='Time', yaxis_title='Prediction', hovermode='closest'),
    )
    figure.show()

def full_workflow(train_data, test_data, date_column, hour_column, target_column, predictor_columns, target_average_load, date):
    """Train a random forest, predict the test table, rescale one day and plot it.

    Steps: build features for both tables, fit a ``RandomForestRegressor``
    (100 trees, ``random_state=42``) on the training target, predict the test
    rows, shift the predictions of ``date`` so their mean equals
    ``target_average_load``, and display the initial and adjusted curves.

    Side effects on ``test_data``: a ``'datetime'`` column (start of each
    hour-ending interval, ``date + (hour - 1) hours``) and a
    ``'<target_column> Prediction'`` column are added. The date and hour
    columns of both inputs are left as they were, so the function can be run
    repeatedly on the same frames with identical results.

    Args:
        train_data: Training table with the date, hour, target and predictor columns.
        test_data: Table to predict, with the date, hour and predictor columns.
        date_column: Name of the column holding values ``pd.to_datetime`` can parse.
        hour_column: Name of the numeric hour-ending column.
        target_column: Name of the column to learn and predict.
        predictor_columns: Names of extra feature columns passed to ``create_features``.
        target_average_load: Desired mean of the predictions on ``date``.
        date: The day to adjust and plot.

    Returns:
        The adjusted predictions for ``date`` as returned by
        ``adjust_predictions_to_target_average``.
    """
    prediction_column = target_column + ' Prediction'

    # Build features on copies so that nothing create_features writes into the
    # frame it receives can alter the hour column used for timestamps below.
    train_features = create_features(train_data.copy(), date_column, hour_column, predictor_columns)
    test_features = create_features(test_data.copy(), date_column, hour_column, predictor_columns)
    train_target = train_data[target_column]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, train_target)

    # Hour-ending h starts at date + (h - 1) hours, so hour-ending 24 is 23:00
    # on the same date and needs no additional day shift.
    test_data['datetime'] = pd.to_datetime(test_data[date_column]) + pd.to_timedelta(test_data[hour_column] - 1, unit='h')
    test_data[prediction_column] = model.predict(test_features)

    # NOTE: both error metrics are computed on the training rows (in-sample fit),
    # not on the test rows.
    train_predictions = model.predict(train_features)
    mae = mean_absolute_error(train_target, train_predictions)
    mse = mean_squared_error(train_target, train_predictions)

    adjusted_predictions, adjusted_day_predictions = adjust_predictions_to_target_average(
        test_data, date_column, prediction_column, date, target_average_load
    )

    training_window = f"{train_data[date_column].iloc[0]} to {train_data[date_column].iloc[-1]}"

    visualize_predictions(
        test_data, adjusted_predictions, date_column, prediction_column,
        date, mae, mse, training_window, target_average_load,
    )

    return adjusted_day_predictions

# Example usage: predict 'TOTAL Actual' from temperature, then shift the
# predictions for 2024-02-10 to an average of 50 and plot them.
adjusted_day_predictions = full_workflow(
    train_data,
    test_data,
    date_column='date',
    hour_column='hour',
    target_column='TOTAL Actual',
    predictor_columns=['TOTAL Actual Temperature'],
    target_average_load=50,
    date='2024-02-10',
)

