In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
df = pd.read_csv("data/train.csv")
df.drop(columns=["id"], inplace=True)

In [3]:
df.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,,248.682615
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.5,...,0,14,0,0,0,1.0,,1.0,12.0,26.50015
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.5,...,0,14,0,0,0,1.0,,1.0,12.0,24.693619
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.5,...,0,14,0,0,0,1.0,,1.0,12.0,48.406926
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,,3.899395


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75757 entries, 0 to 75756
Data columns (total 63 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year_Factor                75757 non-null  int64  
 1   State_Factor               75757 non-null  object 
 2   building_class             75757 non-null  object 
 3   facility_type              75757 non-null  object 
 4   floor_area                 75757 non-null  float64
 5   year_built                 73920 non-null  float64
 6   energy_star_rating         49048 non-null  float64
 7   ELEVATION                  75757 non-null  float64
 8   january_min_temp           75757 non-null  int64  
 9   january_avg_temp           75757 non-null  float64
 10  january_max_temp           75757 non-null  int64  
 11  february_min_temp          75757 non-null  int64  
 12  february_avg_temp          75757 non-null  float64
 13  february_max_temp          75757 non-null  int

In [5]:
TARGET_COLUMN = "site_eui"

In [6]:
numeric_data = df.select_dtypes(include=[np.number])
categorical_data = df.select_dtypes(exclude=[np.number])

In [7]:
numeric_features = numeric_data.columns.tolist()
numeric_features.remove(TARGET_COLUMN)
categorical_features = categorical_data.columns.tolist()

In [8]:
print(numeric_features)

['Year_Factor', 'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION', 'january_min_temp', 'january_avg_temp', 'january_max_temp', 'february_min_temp', 'february_avg_temp', 'february_max_temp', 'march_min_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp', 'april_avg_temp', 'april_max_temp', 'may_min_temp', 'may_avg_temp', 'may_max_temp', 'june_min_temp', 'june_avg_temp', 'june_max_temp', 'july_min_temp', 'july_avg_temp', 'july_max_temp', 'august_min_temp', 'august_avg_temp', 'august_max_temp', 'september_min_temp', 'september_avg_temp', 'september_max_temp', 'october_min_temp', 'october_avg_temp', 'october_max_temp', 'november_min_temp', 'november_avg_temp', 'november_max_temp', 'december_min_temp', 'december_avg_temp', 'december_max_temp', 'cooling_degree_days', 'heating_degree_days', 'precipitation_inches', 'snowfall_inches', 'snowdepth_inches', 'avg_temp', 'days_below_30F', 'days_below_20F', 'days_below_10F', 'days_below_0F', 'days_above_80F', 'days_above_90F', 'da

In [9]:
categorical_features

['State_Factor', 'building_class', 'facility_type']

## Model

In [10]:
from sklearn.model_selection import (
    cross_val_score,
    cross_validate,
    train_test_split,
    RandomizedSearchCV
)

from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer,
    PolynomialFeatures
)

from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from catboost import CatBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import StackingClassifier

  from pandas import MultiIndex, Int64Index


In [11]:
import time

In [12]:
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=123
)

X_train, y_train = train_df.drop(columns=[TARGET_COLUMN]), train_df[TARGET_COLUMN]
X_val, y_val = val_df.drop(columns=[TARGET_COLUMN]), val_df[TARGET_COLUMN]

In [13]:
for dt in [X_train, X_val, y_train, y_val]:
    print(dt.shape)

(68181, 62)
(7576, 62)
(68181,)
(7576,)


In [14]:
def cross_val_scores(model, X_train, y_train, X_val, y_val, return_train_score=False):

    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)

    score_dict = {
        "r2_val": model.score(X_val, y_val),
        "mse_val": mean_squared_error(y_val, y_val_pred),
        "mape_val": mean_absolute_percentage_error(y_val, y_val_pred)
    }

    if return_train_score:
        y_train_pred = model.predict(X_train)

        score_dict["r2_train"] = model.score(X_train, y_train)
        score_dict["mse_train"] = mean_squared_error(y_train, y_train_pred)
        score_dict["mape_train"] = mean_absolute_percentage_error(y_train, y_train_pred)

    scores_result = pd.Series(score_dict)

    return model, scores_result

In [15]:
pipe_numeric_feats = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

column_transformer = make_column_transformer(
    (pipe_numeric_feats, numeric_features),
    (OneHotEncoder(), categorical_features)
)

In [16]:
pipe_lr = make_pipeline(column_transformer, Ridge(max_iter=10000))
pipe_dt = make_pipeline(column_transformer, DecisionTreeRegressor())
# pipe_nb = make_pipeline(
#    column_transformer,
# #    FunctionTransformer(lambda x: x.todense(), accept_sparse=True),
#    GaussianNB())
# pipe_svc = make_pipeline(column_transformer, SVC())
pipe_rf = make_pipeline(column_transformer, RandomForestRegressor())
pipe_xgb = make_pipeline(column_transformer, XGBRegressor(verbosity=0))
pipe_lgbm = make_pipeline(column_transformer, LGBMRegressor())
pipe_catboost = make_pipeline(column_transformer, CatBoostRegressor(verbose=False))

In [17]:
models = {
    "Logistic Regression": pipe_lr,
    "Decision Tree": pipe_dt,
    "Random Forest": pipe_rf,
    "XGB": pipe_xgb,
    "LGBM": pipe_lgbm,
    "Cat Boost": pipe_catboost
}

In [18]:
results = {}

for name, model in models.items():
    print(f"Start {name}!")
    _, results[name] = cross_val_scores(
        model,
        X_train,
        y_train,
        X_val,
        y_val,
        return_train_score=True
    )

    print(f"Done {name}!")


Start Logistic Regression!
Done Logistic Regression!
Start Decision Tree!
Done Decision Tree!
Start Random Forest!
Done Random Forest!
Start XGB!
Done XGB!
Start LGBM!
Done LGBM!
Start Cat Boost!
Done Cat Boost!


In [19]:
pd.DataFrame(results)

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,XGB,LGBM,Cat Boost
r2_val,0.384811,0.188294,0.552406,0.531974,0.494534,0.529375
mse_val,1910.059142,2520.213329,1389.706644,1453.143337,1569.389215,1461.211709
mape_val,0.515553,0.423858,0.378454,0.424709,0.451351,0.426646
r2_train,0.355856,0.999416,0.936742,0.668158,0.535998,0.629726
mse_train,2206.647047,2.001399,216.703697,1136.793266,1589.53423,1268.44718
mape_train,0.59292,0.001816,0.159051,0.441472,0.496587,0.459088
