In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
#training data
train_data = pd.read_csv('train.csv')
df_train=train_data.copy()

In [3]:
#test data
test_data = pd.read_csv('test.csv')
df_test=test_data.copy()


# data preprocessing

In [4]:
# data preprocessing
# numerical data: impute using median
# numerical to cat: 'year_built','Year_Factor' 
# combined cat:'year_built'--'old','new';
#             'facility_type'.value_counts()<1000--'other'


In [5]:
# transform some numerical varibles to cat 
# numerical to cat: 'year_built','Year_Factor' 
# for missing 'year_built', we will use median value
df_train['year_built'].fillna(df_train['year_built'].median(),inplace=True)
df_test['year_built'].fillna(df_train['year_built'].median(),inplace=True)

In [6]:
# transform some of the numerical to cat values
df_train['Year_Factor']=df_train['Year_Factor'].astype('object',copy=False)
df_train['year_built']=df_train['year_built'].astype('object',copy=False)
df_test['Year_Factor']=df_test['Year_Factor'].astype('object',copy=False)
df_test['year_built']=df_test['year_built'].astype('object',copy=False)
cat_attribs=df_train.select_dtypes(include=['object']).columns
print(cat_attribs)

Index(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'year_built'],
      dtype='object')


In [7]:
# combine cat varibles 'year_built' and 'facility_type'
df_train['year_built']=df_train['year_built'].apply(lambda x: 'old' if x<=1951 else 'new')

df_test['year_built']=df_test['year_built'].apply(lambda x: 'old' if x<=1951 else 'new')


In [8]:
cat_type=df_train['facility_type'].value_counts()[df_train['facility_type'].value_counts()>= 100].index

df_train.loc[df_train['facility_type'].isin((df_train['facility_type'].value_counts()
                            [df_train['facility_type'].value_counts() < 100]).index), 'facility_type'] = 'other'
print(cat_type)   

Index(['Multifamily_Uncategorized', 'Office_Uncategorized',
       'Education_Other_classroom', 'Lodging_Hotel', '2to4_Unit_Building',
       'Commercial_Other', '5plus_Unit_Building', 'Warehouse_Nonrefrigerated',
       'Retail_Uncategorized', 'Education_College_or_university',
       'Nursing_Home', 'Education_Uncategorized',
       'Mixed_Use_Commercial_and_Residential',
       'Lodging_Dormitory_or_fraternity_sorority',
       'Warehouse_Distribution_or_Shipping_center', 'Warehouse_Selfstorage',
       'Grocery_store_or_food_market', 'Office_Medical_non_diagnostic',
       'Health_Care_Inpatient', 'Religious_worship', 'Industrial',
       'Warehouse_Uncategorized', 'Mixed_Use_Predominantly_Commercial',
       'Parking_Garage', 'Office_Bank_or_other_financial',
       'Public_Assembly_Library', 'Public_Safety_Fire_or_police_station',
       'Public_Assembly_Other', 'Service_Vehicle_service_repair_shop',
       'Retail_Enclosed_mall', 'Warehouse_Refrigerated',
       'Retail_Strip_sh

In [9]:
# combine the sparse cat for test data as well
df_test.loc[~df_test['facility_type'].isin(cat_type), 'facility_type'] = 'other'
print(sorted(df_train['facility_type'].unique()))
print(sorted(df_test['facility_type'].unique()))
print(sorted(df_train['facility_type'].unique())==sorted(df_test['facility_type'].unique()))

['2to4_Unit_Building', '5plus_Unit_Building', 'Commercial_Other', 'Education_College_or_university', 'Education_Other_classroom', 'Education_Preschool_or_daycare', 'Education_Uncategorized', 'Grocery_store_or_food_market', 'Health_Care_Inpatient', 'Industrial', 'Laboratory', 'Lodging_Dormitory_or_fraternity_sorority', 'Lodging_Hotel', 'Mixed_Use_Commercial_and_Residential', 'Mixed_Use_Predominantly_Commercial', 'Multifamily_Uncategorized', 'Nursing_Home', 'Office_Bank_or_other_financial', 'Office_Medical_non_diagnostic', 'Office_Uncategorized', 'Parking_Garage', 'Public_Assembly_Entertainment_culture', 'Public_Assembly_Library', 'Public_Assembly_Other', 'Public_Safety_Fire_or_police_station', 'Religious_worship', 'Retail_Enclosed_mall', 'Retail_Strip_shopping_mall', 'Retail_Uncategorized', 'Service_Vehicle_service_repair_shop', 'Warehouse_Distribution_or_Shipping_center', 'Warehouse_Nonrefrigerated', 'Warehouse_Refrigerated', 'Warehouse_Selfstorage', 'Warehouse_Uncategorized', 'other']

In [10]:
# compute energy_star ana values with median of each facility type
df_train.energy_star_rating.isnull().value_counts()
df_train.energy_star_rating.head(10)

0     11.0
1     45.0
2     97.0
3     46.0
4    100.0
5      NaN
6     56.0
7     99.0
8     98.0
9     83.0
Name: energy_star_rating, dtype: float64

In [11]:
df_train.groupby("facility_type")["energy_star_rating"].median()

facility_type
2to4_Unit_Building                           76.0
5plus_Unit_Building                          81.0
Commercial_Other                             76.5
Education_College_or_university              52.0
Education_Other_classroom                    68.0
Education_Preschool_or_daycare                NaN
Education_Uncategorized                       7.0
Grocery_store_or_food_market                 54.0
Health_Care_Inpatient                        57.0
Industrial                                   87.0
Laboratory                                    1.0
Lodging_Dormitory_or_fraternity_sorority     64.0
Lodging_Hotel                                47.0
Mixed_Use_Commercial_and_Residential         74.0
Mixed_Use_Predominantly_Commercial           79.0
Multifamily_Uncategorized                    60.0
Nursing_Home                                 36.0
Office_Bank_or_other_financial               75.5
Office_Medical_non_diagnostic                55.0
Office_Uncategorized                

In [12]:
# fill the training data 'enery_star' with the median value of each 'facility-type'
df_train["energy_star_rating"] = df_train.groupby("facility_type")["energy_star_rating"].transform(lambda x: x.fillna(x.median()))
#df_train.energy_star_rating.isnull().value_counts()
df_train.energy_star_rating.head(10)

0     11.0
1     45.0
2     97.0
3     46.0
4    100.0
5     83.0
6     56.0
7     99.0
8     98.0
9     83.0
Name: energy_star_rating, dtype: float64

In [13]:
df_test.energy_star_rating[10:15]

10    44.0
11    60.0
12     NaN
13    82.0
14    89.0
Name: energy_star_rating, dtype: float64

In [14]:
df_train.columns

Index(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION',
       'january_min_temp', 'january_avg_temp', 'january_max_temp',
       'february_min_temp', 'february_avg_temp', 'february_max_temp',
       'march_min_temp', 'march_avg_temp', 'march_max_temp', 'april_min_temp',
       'april_avg_temp', 'april_max_temp', 'may_min_temp', 'may_avg_temp',
       'may_max_temp', 'june_min_temp', 'june_avg_temp', 'june_max_temp',
       'july_min_temp', 'july_avg_temp', 'july_max_temp', 'august_min_temp',
       'august_avg_temp', 'august_max_temp', 'september_min_temp',
       'september_avg_temp', 'september_max_temp', 'october_min_temp',
       'october_avg_temp', 'october_max_temp', 'november_min_temp',
       'november_avg_temp', 'november_max_temp', 'december_min_temp',
       'december_avg_temp', 'december_max_temp', 'cooling_degree_days',
       'heating_degree_days', 'precipitation_inches', 'snowfall_inc

# pipelines

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
#from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

In [16]:
#training data
y_train=df_train.site_eui
# drop useless variables such as 'id'
X_train=df_train.drop(['site_eui','id'],axis=1)

In [17]:
# test data
X_test=df_test.drop('id',axis=1)
print(X_train.columns==X_test.columns)


[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]


In [18]:
cat_attr=X_train.select_dtypes(include=['object']).columns
num_attr=X_train.select_dtypes(exclude=['object']).columns
print(cat_attr)
print(num_attr)

Index(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'year_built'],
      dtype='object')
Index(['floor_area', 'energy_star_rating', 'ELEVATION', 'january_min_temp',
       'january_avg_temp', 'january_max_temp', 'february_min_temp',
       'february_avg_temp', 'february_max_temp', 'march_min_temp',
       'march_avg_temp', 'march_max_temp', 'april_min_temp', 'april_avg_temp',
       'april_max_temp', 'may_min_temp', 'may_avg_temp', 'may_max_temp',
       'june_min_temp', 'june_avg_temp', 'june_max_temp', 'july_min_temp',
       'july_avg_temp', 'july_max_temp', 'august_min_temp', 'august_avg_temp',
       'august_max_temp', 'september_min_temp', 'september_avg_temp',
       'september_max_temp', 'october_min_temp', 'october_avg_temp',
       'october_max_temp', 'november_min_temp', 'november_avg_temp',
       'november_max_temp', 'december_min_temp', 'december_avg_temp',
       'december_max_temp', 'cooling_degree_days', 'heating_degree_days',
       'precip

In [19]:
# preprocessing for numerical data
num_transformer=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', RobustScaler())
])

# Preprocessing for categorical data
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# buldle preprocessing for numerical and cat data
preprocessor= ColumnTransformer(
    transformers=[
        ('num',num_transformer,num_attr),
        ('cat',cat_transformer,cat_attr)
    ]
)

In [20]:
# define the model
from xgboost import XGBRegressor
xgb_reg=XGBRegressor(n_estimators=100,learning_rate=0.1,early_stopping_rounds=5,verbosity = 0)

In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model',xgb_reg)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)
# training error
eui_train_prediction=my_pipeline.predict(X_train)
train_rmse=np.sqrt(mean_squared_error(y_train,eui_train_prediction))
# Preprocessing of validation data, get predictions
prediction=my_pipeline.predict(X_test)


In [22]:
#cv
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
cv_scores=cross_val_score(xgb_reg, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
xgb_rmse_scores=np.sqrt(-cv_scores)
print('Scores:', xgb_rmse_scores)
print('Mean Scores:', xgb_rmse_scores.mean())
print('Standard Deviation:', xgb_rmse_scores.std())

Scores: [56.63757213 46.79889362 49.39152963 49.83796569 43.89568321 37.53565742
 30.08672134 38.26048733 48.52715697 64.08236055]
Mean Scores: 46.50540278884937
Standard Deviation: 9.273825154974917


In [23]:
train_rmse
#39.14488849932373

39.14488849932373

In [24]:
prediction[0:10]

array([256.34363, 192.48141, 230.40848, 248.4103 , 261.3797 , 264.278  ,
       251.30862, 225.36647, 170.95361, 286.35666], dtype=float32)

In [25]:
df_results = pd.DataFrame(columns = ['id', 'site_eui'])
df_results.id=df_test.id
df_results.site_eui=prediction
df_results.head()
df_results.to_csv('df_prediction_xgboost.csv', index=None,sep=',')
# the RMSE is 45.530 after I submitted the prediction to Kaggle competition 

In [26]:
# error analysis
df_train['eui_train_prediction']=pd.DataFrame(eui_train_prediction)
df_train['eui_train_prediction_error']=abs(df_train.site_eui-df_train.eui_train_prediction)
df_train.to_csv('df_train_error.csv', index=None,sep=',')

In [27]:
# #CV
# from sklearn.model_selection import cross_val_score
# scores_xgb = cross_val_score(
#     xgb_reg, X_train,y_train, cv=10, scoring="neg_mean_squared_error"
# )
# xgb_rmse_scores=np.sqrt(-scores_xgb)
# print('RF Scores:', xgb_rmse_scores)
# print('RF Mean Scores:', xgb_rmse_scores.mean())
# print('Standard Deviation:', xgb_rmse_scores.std())