# Import Required Libraries
Import the necessary libraries: ucimlrepo, pandas, numpy, matplotlib, plotly, sklearn, xgboost, seaborn, multiprocessing, logging, and time.

In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import seaborn as sns
from multiprocessing import Pool
import logging
import time

# Configure Logging
Configure the logging settings to display information with timestamps.

In [None]:
# Root logger: INFO and above, each record rendered as "<time> - <level> - <message>"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Define fit_model Function
Define the fit_model function that trains an XGBoost model on a chunk of the dataset. Splitting the data into chunks improves memory efficiency and processing speed, especially for large datasets that cannot fit into RAM.

In [None]:
# Define fit_model Function
def fit_model(df_chunk):
    """
    Fit an XGBoost regressor to one slice of the training data.

    Parameters:
    df_chunk (DataFrame): Rows holding the target column "y", the timestamp
        column "ds", and the feature columns (every remaining column).

    Returns:
    XGBRegressor: Model fitted on the rows of the chunk whose target is usable.
    """
    target = df_chunk["y"]

    # A row is usable when its target is present, finite, and below the float64 maximum
    usable = target.notna() & np.isfinite(target) & (target < np.finfo(np.float64).max)

    # Everything except the target and the timestamp is fed to the model
    features = df_chunk.drop(columns=["y", "ds"]).loc[usable]
    target = target.loc[usable]

    # Regressor is created with the library's default hyperparameters
    regressor = XGBRegressor()
    regressor.fit(features, target)
    return regressor

# Define main Function
Define the main function that orchestrates the data loading, preprocessing, model training, and evaluation.

In [None]:
# Define main Function
def main():
    """
    Orchestrates the data loading, preprocessing, model training, and evaluation.

    The pipeline fetches UCI repository dataset id 235, builds a datetime
    index from its "Date" and "Time" columns, derives calendar features from
    the timestamps, trains one XGBoost model per training chunk in a process
    pool, averages the per-model predictions on the held-out tail of the data,
    and finally plots and logs the forecast quality (MAE, MSE, RMSE, MAPE).

    Returns:
    None. The function returns early, after logging an error, when the actual
    and forecasted series share no timestamps or cannot be brought to the
    same length.
    """
    start_time = time.time()
    logging.info("Starting the main function")

    # Load dataset
    dataset = fetch_ucirepo(id=235)
    X = dataset.data.features
    y = dataset.data.targets

    # Combine data
    df = pd.concat([X, y], axis=1)

    # Create date column (source dates are day-first)
    df["date"] = pd.to_datetime(df["Date"] + " " + df["Time"], dayfirst=True)

    # Drop unnecessary columns
    df.drop(columns=["Date", "Time"], inplace=True)
    df.set_index("date", inplace=True)

    # Fill missing values. ffill()/bfill() replace fillna(method=...), which is
    # deprecated and announced to raise in a future pandas version.
    df.ffill(inplace=True)
    df.bfill(inplace=True)

    # Convert 'Global_active_power' to numeric
    # NOTE(review): values that only become NaN through this coercion are not
    # covered by the fill above; they are removed by the dropna() further down.
    df["Global_active_power"] = pd.to_numeric(df["Global_active_power"], errors="coerce")

    # Prepare dataset for model
    df_xgboost = df[["Global_active_power"]].reset_index()
    df_xgboost.columns = ["ds", "y"]

    # Add date features
    df_xgboost["hour"] = df_xgboost["ds"].dt.hour
    df_xgboost["day"] = df_xgboost["ds"].dt.day
    df_xgboost["month"] = df_xgboost["ds"].dt.month
    df_xgboost["year"] = df_xgboost["ds"].dt.year

    # Clean NaN and infinite values
    df_xgboost.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_xgboost.dropna(inplace=True)

    # Split into train and test sets (first 80% of rows train, the rest test)
    train_size = int(len(df_xgboost) * 0.8)
    train_df = df_xgboost.iloc[:train_size]
    test_df = df_xgboost.iloc[train_size:]

    # Split train set into chunks. Row positions are split instead of the
    # DataFrame itself: np.array_split on a DataFrame goes through the
    # deprecated DataFrame.swapaxes. Chunk sizes are the same as before.
    num_chunks = 4
    chunk_positions = np.array_split(np.arange(len(train_df)), num_chunks)
    train_chunks = [train_df.iloc[positions] for positions in chunk_positions]

    # Train model in parallel (one worker process per chunk)
    logging.info("Starting model training with multiprocessing")
    with Pool(num_chunks) as pool:
        models = pool.map(fit_model, train_chunks)

    # Make predictions; the feature matrix is identical for every model, so it
    # is built once instead of once per model
    logging.info("Making predictions")
    test_features = test_df.drop(columns=["ds", "y"])
    forecasts = [model.predict(test_features) for model in models]
    # The ensemble forecast is the element-wise mean over the per-chunk models
    forecast = pd.DataFrame({"ds": test_df["ds"], "yhat": np.mean(forecasts, axis=0)})

    # Align actual and forecasted values
    common_dates = test_df.set_index("ds").index.intersection(forecast.set_index("ds").index)
    if len(common_dates) == 0:
        logging.error("No common dates found between actual and forecasted values. Check the data.")
        return

    y_true = test_df.set_index("ds").loc[common_dates, "y"]
    y_pred = forecast.set_index("ds").loc[common_dates, "yhat"]

    logging.info(f"Before align - y_true: {len(y_true)}, y_pred: {len(y_pred)}")

    y_true, y_pred = y_true.align(y_pred, join='inner')
    y_true.dropna(inplace=True)
    y_pred.dropna(inplace=True)

    # The independent dropna() calls can leave different lengths; in that case
    # predictions are re-sampled onto the actual timestamps
    if len(y_true) != len(y_pred):
        y_pred = y_pred.reindex(y_true.index, method='nearest')

    logging.info(f"After align - y_true: {len(y_true)}, y_pred: {len(y_pred)}")

    if len(y_true) != len(y_pred):
        logging.error(f"Final y_true length: {len(y_true)}, y_pred length: {len(y_pred)}")
        return

    # Visualize the forecast
    fig = px.line(forecast, x="ds", y="yhat", title="Electricity Consumption Forecast")
    fig.show()

    # Plot: Actual vs Predicted
    plt.figure(figsize=(12, 6))
    plt.plot(y_true, label="Actual Values", color="blue")
    plt.plot(y_pred, label="Predicted Values", color="red", linestyle="dashed")
    plt.xlabel("Date")
    plt.ylabel("Energy Consumption")
    plt.title("Actual vs. Predicted Energy Consumption")
    plt.legend()
    plt.show()

    # Residuals Analysis
    residuals = y_true - y_pred
    logging.info(f"Residuals:\n{residuals}")

    plt.figure(figsize=(12, 6))
    plt.plot(residuals, label="Residuals", color="purple")
    plt.axhline(y=0, color="black", linestyle="dashed")
    plt.xlabel("Date")
    plt.ylabel("Residual Values")
    plt.title("Residual Analysis")
    plt.legend()
    plt.show()

    # Error Distribution Plot
    sns.histplot(residuals, bins=30, kde=True, color="orange")
    plt.xlabel("Error (Actual - Predicted)")
    plt.ylabel("Frequency")
    plt.title("Error Distribution")
    plt.show()

    # Calculate Error Metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # MAPE is undefined where the actual value is 0, so those points are left
    # out instead of turning the whole metric into inf/NaN. With no zero
    # actuals the result equals the plain formula.
    actual = y_true.to_numpy(dtype=float)
    predicted = y_pred.to_numpy(dtype=float)
    nonzero = actual != 0
    if not nonzero.all():
        logging.warning(f"MAPE skips {int((~nonzero).sum())} points whose actual value is 0")
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    else:
        mape = float("nan")

    # Log Error Metrics
    logging.info(f"MAE: {mae:.4f}")
    logging.info(f"MSE: {mse:.4f}")
    logging.info(f"RMSE: {rmse:.4f}")
    logging.info(f"MAPE: {mape:.2f}%")

    end_time = time.time()
    logging.info(f"Finished the main function in {end_time - start_time:.2f} seconds")

if __name__ == '__main__':
    main()


Columns (2,3,4,5,6,7) have mixed types. Specify dtype option on import or set low_memory=False.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



# 1)Load Dataset
Load the dataset using the fetch_ucirepo function and extract features and targets.

In [None]:
# Fetch dataset id 235 from the UCI repository
dataset = fetch_ucirepo(id=235)

# Pull out the feature and target tables
X, y = dataset.data.features, dataset.data.targets

# Place features and targets side by side in one DataFrame
df = pd.concat((X, y), axis="columns")

# Preview the combined data
df.head()

# 2)Combine and Preprocess Data
Working on the combined DataFrame from the previous step, create a date column, and preprocess the data by filling missing values and converting data types.

In [None]:
# Create a date column by combining 'Date' and 'Time' columns (dates are day-first)
df["date"] = pd.to_datetime(df["Date"] + " " + df["Time"], dayfirst=True)

# Drop the original 'Date' and 'Time' columns as they are no longer needed
df.drop(columns=["Date", "Time"], inplace=True)

# Set the 'date' column as the index of the DataFrame
df.set_index("date", inplace=True)

# Fill missing values using forward fill and then backward fill.
# ffill()/bfill() replace fillna(method=...), which is deprecated and
# announced to raise in a future pandas version.
df.ffill(inplace=True)
df.bfill(inplace=True)

# Convert 'Global_active_power' column to numeric, coercing errors to NaN
# NOTE(review): values that only become NaN through this coercion are not
# covered by the fill above -- confirm whether the fill should run afterwards.
df["Global_active_power"] = pd.to_numeric(df["Global_active_power"], errors="coerce")

# Display the first few rows of the preprocessed DataFrame
df.head()

# 3) Prepare Dataset for Model
Prepare the dataset for the XGBoost model by creating a new DataFrame with the necessary columns and adding date features.

In [None]:
# Keep only the target series, expose the datetime index as a column, and
# rename the two columns to 'ds' (date) and 'y' (target variable)
df_xgboost = (
    df[["Global_active_power"]]
    .reset_index()
    .set_axis(["ds", "y"], axis="columns")
)

# Derive the calendar features from the timestamp column
df_xgboost = df_xgboost.assign(
    hour=lambda frame: frame["ds"].dt.hour,
    day=lambda frame: frame["ds"].dt.day,
    month=lambda frame: frame["ds"].dt.month,
    year=lambda frame: frame["ds"].dt.year,
)

# Treat infinities as missing, then discard every incomplete row
df_xgboost.replace([np.inf, -np.inf], np.nan, inplace=True)
df_xgboost.dropna(inplace=True)

# Preview the prepared data
df_xgboost.head()

# 4)Split Data into Train and Test Sets
Split the dataset into training and testing sets, and further split the training set into chunks for parallel processing.

In [None]:
# Determine the size of the training set (80% of the data)
train_size = int(len(df_xgboost) * 0.8)

# Split the data by position: the first 80% of rows train, the remainder tests
train_df = df_xgboost.iloc[:train_size]
test_df = df_xgboost.iloc[train_size:]

# Display the number of rows in the training and testing sets
print(f"Training set size: {len(train_df)} rows")
print(f"Testing set size: {len(test_df)} rows")

# Split the training set into chunks for parallel processing.
# Row positions are split instead of the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes. The resulting
# chunk sizes are the same as before.
num_chunks = 4
chunk_positions = np.array_split(np.arange(len(train_df)), num_chunks)
train_chunks = [train_df.iloc[positions] for positions in chunk_positions]

# Display the number of rows in each chunk
for i, chunk in enumerate(train_chunks, start=1):
    print(f"Chunk {i} size: {len(chunk)} rows")

# 5) Train Model in Parallel

Train the XGBoost model in parallel using the multiprocessing library to speed up training by utilizing multiple CPU cores efficiently.

In [None]:
# Fit one XGBoost model per training chunk, each in its own worker process
logging.info("Starting model training with multiprocessing")

# The pool is sized to the number of chunks; map() returns the models in chunk order
with Pool(processes=num_chunks) as pool:
    models = pool.map(func=fit_model, iterable=train_chunks)

# Report how many models came back
print(f"Number of models trained: {len(models)}")

# 6)Make Predictions
Make predictions on the test set using the trained models and combine the forecasts.

In [None]:
# Make predictions on the test set using the trained models
logging.info("Making predictions")

# The feature matrix is identical for every model, so it is built once
# instead of once per model
test_features = test_df.drop(columns=["ds", "y"])
forecasts = [model.predict(test_features) for model in models]

# Combine the forecasts by averaging the predictions from all models
forecast = pd.DataFrame({"ds": test_df["ds"], "yhat": np.mean(forecasts, axis=0)})

# Display the first few rows of the forecasted values
forecast.head()

# 7)Align Actual and Forecasted Values
Align the actual and forecasted values based on common dates and handle any discrepancies.

In [None]:
# Index both tables by timestamp and keep the timestamps present in both
actual_by_date = test_df.set_index("ds")
forecast_by_date = forecast.set_index("ds")
common_dates = actual_by_date.index.intersection(forecast_by_date.index)

if len(common_dates) > 0:
    # Actual and forecasted values restricted to the shared timestamps
    y_true = actual_by_date.loc[common_dates, "y"]
    y_pred = forecast_by_date.loc[common_dates, "yhat"]

    logging.info(f"Before align - y_true: {len(y_true)}, y_pred: {len(y_pred)}")

    # Inner-align the two series, then drop missing values from each
    y_true, y_pred = y_true.align(y_pred, join='inner')
    y_true = y_true.dropna()
    y_pred = y_pred.dropna()

    # If the drops left different lengths, re-sample the predictions onto the
    # actual timestamps using the nearest available prediction
    if len(y_true) != len(y_pred):
        y_pred = y_pred.reindex(y_true.index, method='nearest')

    logging.info(f"After align - y_true: {len(y_true)}, y_pred: {len(y_pred)}")

    # Report a mismatch that survived the re-sampling
    if len(y_true) != len(y_pred):
        logging.error(f"Final y_true length: {len(y_true)}, y_pred length: {len(y_pred)}")
else:
    logging.error("No common dates found between actual and forecasted values. Check the data.")

# 8)Visualize the Forecast
Visualize the forecasted values using Plotly.

In [None]:
# Interactive Plotly line chart of the averaged forecast over time
fig = px.line(
    data_frame=forecast,
    x="ds",
    y="yhat",
    title="Electricity Consumption Forecast",
)
fig.show()


# 9)Plot Actual vs Predicted
Plot the actual vs predicted values using Matplotlib.

In [None]:
# Overlay the actual and predicted series on a single 12x6 Matplotlib axes
comparison_ax = plt.figure(figsize=(12, 6)).add_subplot(1, 1, 1)
comparison_ax.plot(y_true, label="Actual Values", color="blue")
comparison_ax.plot(y_pred, label="Predicted Values", color="red", linestyle="dashed")
comparison_ax.set_xlabel("Date")
comparison_ax.set_ylabel("Energy Consumption")
comparison_ax.set_title("Actual vs. Predicted Energy Consumption")
comparison_ax.legend()
plt.show()

# 10)Residuals Analysis
Analyze the residuals by plotting them and examining their distribution.

In [None]:
# Residual = actual value minus predicted value, per timestamp
residuals = y_true.sub(y_pred)

# Record the residual series in the log
logging.info(f"Residuals:\n{residuals}")

# Residuals over time, with a dashed zero line as the reference
residual_ax = plt.figure(figsize=(12, 6)).add_subplot(1, 1, 1)
residual_ax.plot(residuals, label="Residuals", color="purple")
residual_ax.axhline(y=0, color="black", linestyle="dashed")
residual_ax.set_xlabel("Date")
residual_ax.set_ylabel("Residual Values")
residual_ax.set_title("Residual Analysis")
residual_ax.legend()
plt.show()


# 11)Error Distribution Plot
Plot the error distribution using Seaborn.

In [None]:
# Histogram (30 bins) of the residuals with a kernel density estimate on top
error_ax = sns.histplot(data=residuals, bins=30, kde=True, color="orange")
error_ax.set_xlabel("Error (Actual - Predicted)")
error_ax.set_ylabel("Frequency")
error_ax.set_title("Error Distribution")
plt.show()

# 12) Calculate and Log Error Metrics
Calculate error metrics such as MAE, MSE, RMSE, and MAPE, and log them.

In [None]:
# Calculate and Log Error Metrics

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_true, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_true, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate Mean Absolute Percentage Error (MAPE).
# MAPE is undefined where the actual value is 0, so those points are left out
# instead of turning the whole metric into inf/NaN. With no zero actuals the
# result equals the plain formula.
actual = y_true.to_numpy(dtype=float)
predicted = y_pred.to_numpy(dtype=float)
nonzero = actual != 0
if not nonzero.all():
    logging.warning(f"MAPE skips {int((~nonzero).sum())} points whose actual value is 0")
if nonzero.any():
    mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
else:
    mape = float("nan")

# Log the calculated error metrics
logging.info(f"MAE: {mae:.4f}")
logging.info(f"MSE: {mse:.4f}")
logging.info(f"RMSE: {rmse:.4f}")
logging.info(f"MAPE: {mape:.2f}%")

# Display the error metrics
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

# 13) Start the main function

In [None]:
# Run the whole pipeline only when this code executes as the top-level module
if __name__ == "__main__":
    main()