# 🏡 Site Energy Usage Intensity (EUI) Prediction

<img src = "https://www.clearias.com/up/energy-efficiency.jpg">

#### What is the project about?

### Section 1 : Setting Up The Project

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt

from copy import deepcopy
from collections import defaultdict

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split, LeaveOneGroupOut

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### Section 2 : Helper Functions

In [2]:
def read_data(train_path, test_path):
    """Read in train and test data for a kaggle competition.

    Parameters
    ----------
    train_path : str
        The path to the training data (CSV).
    test_path : str
        The path to the test data (CSV).

    Returns
    -------
    train_df, test_df : pandas DataFrames
        The train and test datasets, in that order.
    """
    # Read from the caller-supplied locations rather than fixed file names,
    # so the function works regardless of the current working directory.
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

In [3]:
def get_duplicates(df, drop_cols=None):
    """Return the rows of a dataframe that repeat an earlier row.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to check.
    drop_cols : str or list of str, optional
        Columns to leave out of the duplicate comparison.  They are still
        present in the returned rows.

    Returns
    -------
    pandas DataFrame
        The rows of ``df`` flagged by ``duplicated()`` (i.e. every
        occurrence after the first), with their original index.
    """
    # Compare on the full frame unless some columns should be ignored.
    compared = df if drop_cols is None else df.drop(columns=drop_cols)
    is_repeat = compared.duplicated()
    return df[is_repeat]


In [4]:
def remove_duplicates(df, drop_cols=None):
    """Return a copy of a dataframe with repeated rows removed.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to clean.  It is not modified.
    drop_cols : str or list of str, optional
        Columns to leave out of the duplicate comparison.  They are kept
        in the returned dataframe.

    Returns
    -------
    pandas DataFrame
        The first occurrence of each row, with a fresh 0..n-1 index.
    """
    # Decide which columns take part in the duplicate check.
    compared = df if drop_cols is None else df.drop(columns=drop_cols)
    first_occurrence = ~compared.duplicated()

    # Boolean indexing yields a new frame, so the input stays untouched.
    return df[first_occurrence].reset_index(drop=True)


In [5]:
def split_building_data(df, groups):
    """Partition the WiDS 2022 dataset by groups of facility types.

    Parameters
    ----------
    df : pandas DataFrame
        The WiDS train or test dataset.  Must have a ``facility_type`` column.
    groups : dict (key: str, value: set)
        Maps each desired group name to the set of facility types that
        belong to that group.

    Returns
    -------
    dfs : dict
        One pandas DataFrame per group name, holding the rows whose
        ``facility_type`` is in that group, re-indexed from 0.
    """
    dfs = {}

    for name in groups:
        # The local must be called `group`: the query string refers to it
        # through pandas' `@variable` syntax.
        group = groups[name]
        dfs[name] = df.query("facility_type in @group").reset_index(drop=True)

    return dfs


In [6]:
# Scope to refactor
def get_manual_facility_groups():
    """Return the hand-built facility groups used in my final WiDS solution.

    Returns
    -------
    dict (key: str, value: set)
        Maps each group name to the set of facility types assigned to it.
        A fresh dictionary (with fresh sets) is built on every call.
    """
    # Ordered (group name, member facility types) table; each member list is
    # converted to a set on return.
    members_by_group = [
        (
            "2to4_5plus_Mixed",
            [
                "2to4_Unit_Building",
                "5plus_Unit_Building",
                "Mixed_Use_Predominantly_Residential",
            ],
        ),
        (
            "Commercial_Education_Mixed_Industrial_Parking",
            [
                "Education_College_or_university",
                "Education_Other_classroom",
                "Education_Preschool_or_daycare",
                "Education_Uncategorized",
                "Commercial_Other",
                "Commercial_Unknown",
                "Mixed_Use_Commercial_and_Residential",
                "Mixed_Use_Predominantly_Commercial",
                "Industrial",
                "Parking_Garage",
            ],
        ),
        (
            "Food_Grocery",
            [
                "Food_Sales",
                "Food_Service_Other",
                "Food_Service_Restaurant_or_cafeteria",
                "Food_Service_Uncategorized",
                "Grocery_store_or_food_market",
            ],
        ),
        (
            "Health",
            [
                "Health_Care_Inpatient",
                "Health_Care_Outpatient_Clinic",
                "Health_Care_Outpatient_Uncategorized",
                "Health_Care_Uncategorized",
            ],
        ),
        ("Laboratory_Data", ["Laboratory", "Data_Center"]),
        (
            "Lodging",
            [
                "Lodging_Dormitory_or_fraternity_sorority",
                "Lodging_Hotel",
                "Lodging_Other",
                "Lodging_Uncategorized",
            ],
        ),
        ("Multifamily", ["Multifamily_Uncategorized"]),
        (
            "Office_Nursing",
            [
                "Office_Bank_or_other_financial",
                "Office_Medical_non_diagnostic",
                "Office_Mixed_use",
                "Office_Uncategorized",
                "Nursing_Home",
            ],
        ),
        (
            "Public",
            [
                "Public_Assembly_Drama_theater",
                "Public_Assembly_Entertainment_culture",
                "Public_Assembly_Library",
                "Public_Assembly_Movie_Theater",
                "Public_Assembly_Other",
                "Public_Assembly_Recreation",
                "Public_Assembly_Social_meeting",
                "Public_Assembly_Stadium",
                "Public_Assembly_Uncategorized",
                "Public_Safety_Courthouse",
                "Public_Safety_Fire_or_police_station",
                "Public_Safety_Penitentiary",
                "Public_Safety_Uncategorized",
            ],
        ),
        ("Religious", ["Religious_worship"]),
        (
            "Retail",
            [
                "Retail_Enclosed_mall",
                "Retail_Strip_shopping_mall",
                "Retail_Uncategorized",
                "Retail_Vehicle_dealership_showroom",
            ],
        ),
        (
            "Warehouse_Service",
            [
                "Warehouse_Distribution_or_Shipping_center",
                "Warehouse_Nonrefrigerated",
                "Warehouse_Refrigerated",
                "Warehouse_Selfstorage",
                "Warehouse_Uncategorized",
                "Service_Drycleaning_or_Laundry",
                "Service_Uncategorized",
                "Service_Vehicle_service_repair_shop",
            ],
        ),
    ]

    return {group_name: set(members) for group_name, members in members_by_group}


In [7]:
def count_missing(df):
    """Summarise the missing values in each column of a dataframe.

    Parameters
    ----------
    df : pandas DataFrame
        The dataframe to inspect.

    Returns
    -------
    pandas DataFrame
        Indexed by column name, sorted by descending missing count, with
        two columns: ``count`` (number of missing values) and ``percent``
        (``count`` divided by the number of rows, i.e. a fraction in 0-1).
        Columns with no missing values are left out.
    """
    na_counts = df.isna().sum().sort_values(ascending=False)

    summary = pd.DataFrame(na_counts, columns=["count"])
    summary["percent"] = summary["count"] / df.shape[0]

    # Keep only the columns that actually have something missing.
    has_missing = summary["count"] != 0
    return summary[has_missing]