In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Define Functions
def preprocess_data(df):
    """Aggregate raw consumption records and split them into features/target.

    The caller's DataFrame is not modified: the parsed date and its
    year/month/day parts are attached to a new frame rather than written
    back into ``df``.

    Args:
        df: DataFrame with at least the columns 'Date', 'Census Section',
            'District', 'Use', 'Accumulated Consumption', 'Max Temperature',
            'Min Temperature', 'Precipitation', 'Tourist Accommodations'
            and 'Hotel Overnight Stays'.

    Returns:
        Tuple ``(X, y, df_agg)``. ``df_agg`` has one row per
        (Year, Month, Day, Census Section, District, Use) group, ``X`` is
        the feature columns of ``df_agg`` and ``y`` is its
        'Accumulated Consumption' column.
    """
    # Derive the date parts on a new frame so the input stays untouched.
    dates = pd.to_datetime(df['Date'])
    records = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # One row per day, census section, district and use type.
    group_keys = ['Year', 'Month', 'Day', 'Census Section', 'District', 'Use']
    df_agg = records.groupby(group_keys).agg({
        'Accumulated Consumption': 'sum',
        'Max Temperature': 'mean',
        'Min Temperature': 'mean',
        'Precipitation': 'sum',
        'Tourist Accommodations': 'mean',
        # Aggregated for inspection via df_agg; not part of the feature list.
        'Hotel Overnight Stays': 'mean',
    }).reset_index()

    # Select relevant features
    features = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature', 'Precipitation', 'Tourist Accommodations']
    target = 'Accumulated Consumption'
    X = df_agg[features]
    y = df_agg[target]

    return X, y, df_agg

def train_model(X, y):
    """Fit a random-forest regressor on an 80/20 split and print its scores.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # A fixed seed keeps the 20 % hold-out identical between runs.
    features_fit, features_holdout, target_fit, target_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    regressor = RandomForestRegressor(random_state=42)
    regressor.fit(features_fit, target_fit)

    # Report the fit on seen data first, then on the held-out rows.
    fit_score = regressor.score(features_fit, target_fit)
    print(f"Training Score: {fit_score:.3f}")
    holdout_score = regressor.score(features_holdout, target_holdout)
    print(f"Test Score: {holdout_score:.3f}")

    return regressor

def predict_total_consumption(model, year, month, tourist_data,
                              max_temperature=30, min_temperature=20, precipitation=1):
    """Predict the total water consumption of one calendar month.

    One feature row is built for every real day of the month (28-31 rows,
    leap years included), the model predicts each day, and the daily
    predictions are summed.

    Args:
        model: Trained model exposing ``predict(DataFrame)``.
        year: Year for prediction.
        month: Month for prediction (1-12).
        tourist_data: Value used for the 'Tourist Accommodations' feature on
            every day of the month.
            NOTE(review): this was previously described as the monthly
            tourist total, but it is passed unchanged as a per-day feature;
            confirm the scale matches the training data.
        max_temperature: Max temperature assumed for every day. Defaults to
            the former hard-coded example value of 30.
        min_temperature: Min temperature assumed for every day. Defaults to
            the former hard-coded example value of 20.
        precipitation: Precipitation assumed for every day. Defaults to the
            former hard-coded example value of 1.

    Returns:
        Sum of the daily predictions for the month.
    """
    # Actual length of the requested month, not a fixed 30 days.
    n_days = pd.Period(year=year, month=month, freq='M').days_in_month

    # Column order mirrors the feature list the model was trained on.
    prediction_data = pd.DataFrame({
        'Year': [year] * n_days,
        'Month': [month] * n_days,
        'Day': list(range(1, n_days + 1)),
        'Max Temperature': [max_temperature] * n_days,
        'Min Temperature': [min_temperature] * n_days,
        'Precipitation': [precipitation] * n_days,
        'Tourist Accommodations': [tourist_data] * n_days,
    })

    # Predict each day, then collapse to a single monthly figure.
    daily_predictions = model.predict(prediction_data)
    return daily_predictions.sum()

# Example Usage
if __name__ == "__main__":
    # Load the merged dataset (replace with your own file path).
    data_file = "../data/local_data/merged_cleaned_data_NEW.csv"
    df = pd.read_csv(data_file)

    # Build the aggregated features/target, then fit the model.
    X, y, df_agg = preprocess_data(df)
    model = train_model(X, y)

    # Save the model for later use.
    # NOTE(review): the temperature-only example further down this file
    # writes to the same file name; confirm which model should be kept.
    with open("water_consumption_model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Example prediction for June 2024
    year = 2024
    month = 6
    tourist_data = 1500000  # Example number of tourists

    # Predict total consumption for the month
    total_consumption = predict_total_consumption(model, year, month, tourist_data)
    # The unit was printed as mis-decoded text; "\u00b3" is the superscript
    # three, written as an escape so the file's encoding cannot garble it.
    print(f"Predicted total water consumption for {year}-{month:02d}: {total_consumption:.2f} m\u00b3")


In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Define Functions
def preprocess_data(df):
    """Split raw records into date/temperature features and the target.

    The caller's DataFrame is not modified: the parsed date and its
    year/month/day parts are attached to a new frame rather than written
    back into ``df``.

    Args:
        df: DataFrame with at least the columns 'Date', 'Max Temperature',
            'Min Temperature' and 'Accumulated Consumption'.

    Returns:
        Tuple ``(X, y)``: the feature columns (Year, Month, Day,
        Max Temperature, Min Temperature) and the
        'Accumulated Consumption' column, one row per input row.
    """
    # Derive the date parts on a new frame so the input stays untouched.
    dates = pd.to_datetime(df['Date'])
    records = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # Select relevant features
    features = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature']
    target = 'Accumulated Consumption'
    X = records[features]
    y = records[target]

    return X, y

def train_model(X, y):
    """Train a random-forest regressor and print train/test scores.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # Hold out 20 % of the rows; the seed makes the split reproducible.
    partitions = train_test_split(X, y, test_size=0.2, random_state=42)
    X_fit, X_eval, y_fit, y_eval = partitions

    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_fit, y_fit)

    # Score on the rows used for fitting, then on the held-out rows.
    score_on_fit = forest.score(X_fit, y_fit)
    print(f"Training Score: {score_on_fit:.3f}")
    score_on_eval = forest.score(X_eval, y_eval)
    print(f"Test Score: {score_on_eval:.3f}")

    return forest

def predict_monthly_consumption(model, year, month, temperature_data):
    """Predict daily water consumption for every day of one calendar month.

    Args:
        model: Trained model exposing ``predict(DataFrame)``.
        year: Year for prediction.
        month: Month for prediction (1-12).
        temperature_data: A dictionary with keys:
            - 'Max Temperature': max daily temperatures, one per day
            - 'Min Temperature': min daily temperatures, one per day
            Each sequence must have exactly as many entries as the month has
            days (28-31).

    Returns:
        Per-day predictions exactly as returned by ``model.predict``
        (a NumPy array for scikit-learn estimators).

    Raises:
        KeyError: If a temperature key is missing from ``temperature_data``.
        ValueError: If a temperature sequence does not contain one value per
            day of the requested month.
    """
    n_days = pd.Period(year=year, month=month, freq='M').days_in_month

    # Fail early with a message naming the offending column instead of
    # pandas' generic "All arrays must be of the same length" error.
    # Scalars are let through: pandas broadcasts them to every day.
    for column in ('Max Temperature', 'Min Temperature'):
        values = temperature_data[column]
        if np.ndim(values) > 0 and len(values) != n_days:
            raise ValueError(
                f"temperature_data[{column!r}] has {len(values)} values, "
                f"but {year}-{month:02d} has {n_days} days"
            )

    # Column order mirrors the feature list the model was trained on.
    prediction_data = pd.DataFrame({
        'Year': [year] * n_days,
        'Month': [month] * n_days,
        'Day': list(range(1, n_days + 1)),
        'Max Temperature': temperature_data['Max Temperature'],
        'Min Temperature': temperature_data['Min Temperature'],
    })

    # Predict daily consumption
    return model.predict(prediction_data)

# Example Usage
if __name__ == "__main__":
    # Source CSV; replace with the path to your own copy of the dataset.
    data_file = "../data/local_data/merged_cleaned_data_NEW.csv"
    df = pd.read_csv(data_file)

    # Features/target first, then the fitted model.
    X, y = preprocess_data(df)
    model = train_model(X, y)

    # Keep the fitted model on disk for later use.
    with open("water_consumption_model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Example prediction for June 2024: one temperature value per day.
    year, month = 2024, 6
    temperature_data = {
        'Max Temperature': [
            30, 31, 32, 33, 34, 35, 36, 36, 37, 38,
            39, 40, 41, 40, 39, 38, 37, 36, 35, 34,
            33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
        ],
    }
    # Each example minimum sits exactly 10 degrees below that day's maximum.
    temperature_data['Min Temperature'] = [
        daily_max - 10 for daily_max in temperature_data['Max Temperature']
    ]

    # Predict and show the per-day consumption.
    predictions = predict_monthly_consumption(model, year, month, temperature_data)
    print(f"Predicted daily water consumption for {year}-{month:02d}:")
    print(predictions)
