In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def preprocess_data(df):
    """
    Preprocess the raw data into features (X) and target (y).

    The caller's DataFrame is not modified: the derived columns are written to
    a new frame, so the function gives the same result when it is run again on
    the same raw data (for example when a notebook cell is re-executed).

    Args:
        df: Raw DataFrame containing the columns 'Date', 'Census Section',
            'Max Temperature', 'Min Temperature', 'Precipitation',
            'Tourist Accommodations', 'Hotel Overnight Stays' and
            'Accumulated Consumption'.

    Returns:
        Tuple (X, y, df_agg):
            X: feature columns of the aggregated frame.
            y: 'Accumulated Consumption' column of the aggregated frame.
            df_agg: aggregated frame with one row per
                (Year, Month, Day, Census Section).

    Raises:
        KeyError: if one of the columns listed above is missing.
    """
    dates = pd.to_datetime(df['Date'])

    # assign() returns a new DataFrame, so the input frame stays untouched.
    df = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # Total number of tourists (safe to set in place: `df` is now our own copy).
    df['Tourist Count'] = df['Tourist Accommodations'] + df['Hotel Overnight Stays']

    # Collapse all rows that share a date and census section into one row
    # (per the original author's note, this removes the per-'Use' breakdown).
    df_agg = df.groupby(['Year', 'Month', 'Day', 'Census Section']).agg({
        'Max Temperature': 'max',  # Maximum of Max Temperature
        'Min Temperature': 'min',  # Minimum of Min Temperature
        'Precipitation': 'sum',   # Total precipitation
        'Tourist Count': 'sum',   # Total tourists
        'Accumulated Consumption': 'sum'  # Total accumulated consumption
    }).reset_index()

    # Select relevant features
    features = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature',
                'Precipitation', 'Census Section', 'Tourist Count']
    target = 'Accumulated Consumption'

    X = df_agg[features]
    y = df_agg[target]

    return X, y, df_agg




def train_model(X, y):
    """
    Fit a random forest regressor and report its score on both parts of a
    random hold-out split.

    Args:
        X: Feature table.
        y: Target values aligned with X.

    Returns:
        The fitted RandomForestRegressor (trained on the 80% training part only).
    """
    # Fixed seed: the same rows land in the 20% hold-out on every run.
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    features_fit, features_holdout, target_fit, target_holdout = parts

    forest = RandomForestRegressor(random_state=42)
    forest.fit(features_fit, target_fit)

    # Report the estimator's score() on the data it saw, then on the hold-out.
    fit_score = forest.score(features_fit, target_fit)
    print(f"Training Score: {fit_score:.3f}")
    holdout_score = forest.score(features_holdout, target_holdout)
    print(f"Test Score: {holdout_score:.3f}")

    return forest

def predict_daily_consumptions(model, year, month, daily_data):
    """
    Predict daily water consumption for the given month using the provided daily data.

    Args:
        model: Trained machine learning model exposing ``predict``.
        year: Year for prediction.
        month: Month for prediction.
        daily_data: DataFrame containing daily feature values. It must include:
            - Year, Month, Day
            - Max Temperature, Min Temperature
            - Precipitation
            - Census Section
            - Tourist Count

    Returns:
        List with one prediction per row of ``daily_data`` that falls in the
        requested year and month, in the row order of ``daily_data``.

    Raises:
        ValueError: if any required column is missing from ``daily_data``.
    """
    required_columns = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature',
                        'Precipitation', 'Census Section', 'Tourist Count']

    # Validate BEFORE filtering: the filter itself reads 'Year' and 'Month', so
    # checking afterwards would surface a bare KeyError for those two columns
    # instead of the descriptive ValueError below.
    missing_columns = [col for col in required_columns if col not in daily_data.columns]
    if missing_columns:
        raise ValueError(
            f"Missing required columns in input data. Expected: {required_columns}. "
            f"Missing: {missing_columns}"
        )

    # Keep only the rows of the requested month, restricted to the model's
    # feature columns in the order listed above.
    in_requested_month = (daily_data['Year'] == year) & (daily_data['Month'] == month)
    prediction_data = daily_data.loc[in_requested_month, required_columns]

    # Predict daily consumption
    daily_predictions = model.predict(prediction_data)

    # Return daily predictions as a list
    return list(daily_predictions)


# Example Usage
if __name__ == "__main__":
    # Load your dataset
    data_file = "../data/local_data/merged_cleaned_data_NEW.csv"  # Replace with your file path
    df = pd.read_csv(data_file)

    # Preprocess the data
    X, y, df_processed = preprocess_data(df)

    # Train the model
    model = train_model(X, y)

    # Save the model for later use
    with open("water_consumption_model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Seeded generator: the synthetic inputs below (and therefore the printed
    # predictions) are identical on every run of this cell.
    rng = np.random.default_rng(42)
    n_days = 30  # June has 30 days

    # Generate synthetic daily data for June 2024
    daily_data = pd.DataFrame({
        'Year': [2024] * n_days,
        'Month': [6] * n_days,
        'Day': list(range(1, n_days + 1)),
        'Max Temperature': rng.uniform(24, 26, n_days),  # Example temperatures based on given data
        'Min Temperature': rng.uniform(16, 19, n_days),
        'Precipitation': rng.uniform(0, 1.5, n_days),  # Example precipitation values
        'Census Section': [801901001] * n_days,  # Example Census Section
        'Tourist Accommodations': rng.uniform(11000, 18000, n_days),  # Adjusted range
        'Hotel Overnight Stays': rng.uniform(15000, 19000, n_days),   # Adjusted range
    })

    # Add Tourist Count as the sum of Tourist Accommodations and Hotel Overnight Stays
    daily_data['Tourist Count'] = daily_data['Tourist Accommodations'] + daily_data['Hotel Overnight Stays']

    # Predict daily consumptions
    daily_consumptions = predict_daily_consumptions(model, year=2024, month=6, daily_data=daily_data)
    print(f"Predicted daily water consumptions for June 2024: {daily_consumptions}")



Training Score: 0.060
Test Score: -0.064
Predicted daily water consumptions for June 2024: [np.float64(21982.422721576673), np.float64(20612.876341380845), np.float64(21502.153092352095), np.float64(21464.547047258304), np.float64(20553.21049469975), np.float64(21245.88306673882), np.float64(21715.182976551227), np.float64(21628.46994949495), np.float64(21333.429413059166), np.float64(21374.489749278506), np.float64(21285.73395165946), np.float64(20665.982636641143), np.float64(21606.282440392937), np.float64(21397.87407495283), np.float64(21579.20903571429), np.float64(20714.276208791212), np.float64(20198.473003968258), np.float64(20475.33988014763), np.float64(20853.09783216783), np.float64(22063.897704184703), np.float64(20337.56975649351), np.float64(21052.109169108666), np.float64(21660.970275613283), np.float64(20839.432923076925), np.float64(20501.62074847375), np.float64(20525.518014707515), np.float64(20150.979485930744), np.float64(20419.710732961492), np.float64(20697.34664

Chronological Model

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def preprocess_data(df):
    """
    Preprocess the raw data into features (X) and target (y).

    The caller's DataFrame is not modified; the derived columns live in a new
    frame, which is returned as the third element.

    Args:
        df: Raw DataFrame containing the columns 'Date', 'Max Temperature',
            'Min Temperature', 'Precipitation', 'Census Section',
            'Tourist Accommodations', 'Hotel Overnight Stays' and
            'Accumulated Consumption'.

    Returns:
        Tuple (X, y, processed):
            X: feature columns of the processed frame.
            y: 'Accumulated Consumption' column of the processed frame.
            processed: new frame with 'Date' converted to datetime and the
                added 'Year', 'Month', 'Day' and 'Tourist Count' columns.

    Raises:
        KeyError: if one of the columns listed above is missing.
    """
    dates = pd.to_datetime(df['Date'])

    # assign() returns a new DataFrame, so the input frame stays untouched.
    df = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # Total number of tourists (safe to set in place: `df` is now our own copy).
    df['Tourist Count'] = df['Tourist Accommodations'] + df['Hotel Overnight Stays']

    # Select relevant features
    features = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature',
                'Precipitation', 'Census Section', 'Tourist Count']
    target = 'Accumulated Consumption'

    X = df[features]
    y = df[target]

    return X, y, df


def train_model(X_train, y_train):
    """
    Fit a random forest regressor on the given training data.

    Args:
        X_train: Feature table used for fitting.
        y_train: Target values aligned with X_train.

    Returns:
        The fitted RandomForestRegressor.
    """
    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_train, y_train)

    # Only the in-sample score is reported here; the held-out evaluation is
    # left to the caller.
    in_sample_score = forest.score(X_train, y_train)
    print(f"Training Score: {in_sample_score:.3f}")

    return forest


def split_chronologically(df, months_to_test=6):
    """
    Hold out the most recent months of the data as a test set.

    The cutoff is the latest 'Date' in the frame minus `months_to_test`
    calendar months. Rows dated on or before the cutoff form the training set;
    rows dated after it form the test set.

    Args:
        df: Processed DataFrame with a datetime 'Date' column plus the feature
            and target columns.
        months_to_test: Number of months to reserve for testing.

    Returns:
        X_train, X_test, y_train, y_test: Chronologically split features and target,
        each ordered by date.
    """
    feature_cols = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature',
                    'Precipitation', 'Census Section', 'Tourist Count']
    target_col = 'Accumulated Consumption'

    ordered = df.sort_values(by='Date')
    cutoff = ordered['Date'].max() - pd.DateOffset(months=months_to_test)

    # Two explicit comparisons (rather than one mask and its negation) so rows
    # with a missing date end up in neither part.
    older = ordered[ordered['Date'] <= cutoff]
    recent = ordered[ordered['Date'] > cutoff]

    return older[feature_cols], recent[feature_cols], older[target_col], recent[target_col]


def calculate_loss(y_true, y_pred):
    """
    Calculate the Mean Absolute Error and Mean Squared Error.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and pandas Series are compared by position rather than by index
    label.

    Args:
        y_true: Observed values (list, array or Series).
        y_pred: Predicted values, in the same order as y_true.

    Returns:
        Tuple (mae, mse).
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    mae = np.mean(np.abs(errors))  # Mean Absolute Error
    mse = np.mean(errors ** 2)  # Mean Squared Error
    return mae, mse


def predict_daily_consumptions(model, X_test):
    """
    Run the model on a feature table and hand back the predictions as a list.

    Args:
        model: Trained machine learning model exposing ``predict``.
        X_test: Feature DataFrame to predict on (the notebook passes the
            held-out last 6 months).

    Returns:
        List with one predicted consumption value per row of X_test.
    """
    return [prediction for prediction in model.predict(X_test)]


# Example Usage
if __name__ == "__main__":
    # Load the merged dataset from disk
    data_file = "../data/local_data/merged_cleaned_data_NEW.csv"  # Replace with your file path
    df = pd.read_csv(data_file)
    
    # Preprocess the data. X and y are not used below: the chronological split
    # selects the feature and target columns again from df_processed.
    X, y, df_processed = preprocess_data(df)

    # Make sure 'Date' is a datetime column before splitting.
    # preprocess_data already stores 'Date' as datetime in the frame it
    # returns, so this assignment does not change the values.
    df_processed['Date'] = pd.to_datetime(df['Date'])

    # Split the data chronologically (last 6 months as test set)
    X_train, X_test, y_train, y_test = split_chronologically(df_processed, months_to_test=6)

    # Train the model on the older part only
    model = train_model(X_train, y_train)

    # Predict on the test set (last 6 months)
    y_pred = predict_daily_consumptions(model, X_test)

    # Calculate the loss on the held-out months.
    # y_test.values is passed (not the Series) so the comparison with the
    # plain list y_pred is purely positional.
    mae, mse = calculate_loss(y_test.values, y_pred)
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    
    # Save the model for later use.
    # NOTE(review): the other cells of this notebook write to the same file
    # name, so whichever cell runs last overwrites the stored model — confirm
    # this is intended.
    with open("water_consumption_model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Side-by-side view of actual vs. predicted values; only the first
    # 10 rows are printed.
    comparison = pd.DataFrame({
        'Actual': y_test.values,
        'Predicted': y_pred
    })
    print(comparison.head(10))


Training Score: 0.007
Mean Absolute Error (MAE): 23672.23
Mean Squared Error (MSE): 19569408340.50
   Actual     Predicted
0   30786  22754.874631
1   13887  14762.285120
2   15372  14762.285120
3    3162   7845.744835
4    1295   7845.744835


Model Just by Temperature

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Define Functions
def preprocess_data(df):
    """
    Build the date and temperature feature table (X) and the target (y).

    The caller's DataFrame is not modified; the date-derived columns are added
    to a new frame.

    Args:
        df: Raw DataFrame containing the columns 'Date', 'Max Temperature',
            'Min Temperature' and 'Accumulated Consumption'.

    Returns:
        Tuple (X, y):
            X: DataFrame with 'Year', 'Month', 'Day', 'Max Temperature' and
               'Min Temperature'.
            y: 'Accumulated Consumption' column.

    Raises:
        KeyError: if one of the columns listed above is missing.
    """
    dates = pd.to_datetime(df['Date'])

    # assign() returns a new DataFrame, so the input frame stays untouched.
    df = df.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
    )

    # Select relevant features
    features = ['Year', 'Month', 'Day', 'Max Temperature', 'Min Temperature']
    target = 'Accumulated Consumption'
    X = df[features]
    y = df[target]

    return X, y

def train_model(X, y):
    """
    Fit a random forest regressor and report its score on both parts of a
    random hold-out split.

    Args:
        X: Feature table.
        y: Target values aligned with X.

    Returns:
        The fitted RandomForestRegressor (trained on the 80% training part only).
    """
    # Fixed seed: the same rows land in the 20% hold-out on every run.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    regressor = RandomForestRegressor(random_state=42)
    regressor.fit(fit_X, fit_y)

    # Report the estimator's score() on the data it saw, then on the hold-out.
    score_on_fit = regressor.score(fit_X, fit_y)
    print(f"Training Score: {score_on_fit:.3f}")
    score_on_holdout = regressor.score(holdout_X, holdout_y)
    print(f"Test Score: {score_on_holdout:.3f}")

    return regressor

def predict_monthly_consumption(model, year, month, temperature_data):
    """
    Predict daily water consumption for a given year, month, and temperature data.

    Args:
        model: Trained machine learning model exposing ``predict``.
        year: Year for prediction.
        month: Month for prediction (1-12).
        temperature_data: A dictionary with keys:
            - 'Max Temperature': max daily temperatures for the month
            - 'Min Temperature': min daily temperatures for the month
            Each entry must hold one value per calendar day of the month
            (a single scalar is also accepted and applied to every day).

    Returns:
        Whatever ``model.predict`` returns for the month's feature table
        (one prediction per day; an array for scikit-learn models).

    Raises:
        KeyError: if a temperature key is missing.
        ValueError: if a temperature sequence does not have one value per day
            of the month.
    """
    # Number of calendar days in the requested month (leap years included).
    n_days = pd.Period(year=year, month=month, freq='M').days_in_month

    # Fail early with a message that names the offending entry, instead of
    # pandas' generic "arrays must be of the same length" error.
    for column in ('Max Temperature', 'Min Temperature'):
        values = temperature_data[column]
        if np.ndim(values) > 0 and len(values) != n_days:
            raise ValueError(
                f"'{column}' has {len(values)} values but "
                f"{year}-{month:02d} has {n_days} days"
            )

    # Column order matches the feature order used when training the model.
    prediction_data = pd.DataFrame({
        'Year': [year] * n_days,
        'Month': [month] * n_days,
        'Day': list(range(1, n_days + 1)),
        'Max Temperature': temperature_data['Max Temperature'],
        'Min Temperature': temperature_data['Min Temperature'],
    })

    # Predict daily consumption
    predictions = model.predict(prediction_data)
    return predictions

# Example Usage
if __name__ == "__main__":
    # Load the merged dataset from disk
    data_file = "../data/local_data/merged_cleaned_data_NEW.csv"  # Replace with your file path
    df = pd.read_csv(data_file)
    
    # Preprocess the data into date/temperature features and the target
    X, y = preprocess_data(df)
    
    # Train the model (train_model prints the training and test scores)
    model = train_model(X, y)
    
    # Save the model for later use.
    # NOTE(review): the other cells of this notebook write to the same file
    # name, so whichever cell runs last overwrites the stored model — confirm
    # this is intended.
    with open("water_consumption_model.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
    
    # Example prediction for June 2024
    year = 2024
    month = 6
    # Hand-written example temperatures: 30 values per list, one for each day of June.
    temperature_data = {
        'Max Temperature': [30, 31, 32, 33, 34, 35, 36, 36, 37, 38, 39, 40, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24],
        'Min Temperature': [20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14],
    }
    
    # Predict daily consumption and print the raw prediction values
    predictions = predict_monthly_consumption(model, year, month, temperature_data)
    print(f"Predicted daily water consumption for {year}-{month:02d}:")
    print(predictions)
