In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found there.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Module-level palette; the correlation heatmap helper reads this name.
cmap = sns.color_palette(palette="Spectral")
#!pip install catboost
from catboost import CatBoostRegressor

In [None]:
# Load the competition train/test splits and the auxiliary facility-type table.
train = pd.read_csv(filepath_or_buffer="../input/widsdatathon2022/train.csv")
test = pd.read_csv(filepath_or_buffer="../input/widsdatathon2022/test.csv")
facility_type_category = pd.read_csv(
    filepath_or_buffer="../input/facility-type/facility_type_catgry.csv"
)

In [None]:
# Render each loaded frame with the notebook's rich display, in load order.
for frame in (train, test, facility_type_category):
    display(frame)

## Checking for missing values

In [None]:
# Print dtype and non-null count per column of the training frame.
train.info()

There are null values in the columns 'year_built', 'energy_star_rating', 'direction_max_wind_speed', 'direction_peak_wind_speed', 'max_wind_speed' and 'days_with_fog'.

# Deep clean

In [None]:
# training set
# Treat year_built == 0 as missing. The result is assigned back to the column instead
# of calling replace/fillna with inplace=True on train['year_built']: that chained
# in-place form is deprecated and does not write through under pandas Copy-on-Write.
train['year_built'] = train['year_built'].replace(0, np.nan)

# Record which rows had a real year_built BEFORE imputing; taking notna() after the
# fill (as this cell did previously) yields True on every row.
year_built_observed = train['year_built'].notna()

# Impute the gaps with the mean of the observed years in this frame.
A = train['year_built'].mean()
train['year_built'] = train['year_built'].fillna(A)

# Engineered features: ELEVATION times floor_area, and ELEVATION times sqrt(floor_area).
train['vol'] = train['ELEVATION'] * train['floor_area']
train['face'] = train['ELEVATION'] * np.sqrt(train['floor_area'])

# Presence indicators (True = value was present) for the columns that contain nulls.
train['days_with_fog_B'] = train['days_with_fog'].notna()
train['year_built_B'] = year_built_observed
train['energy_star_rating_B'] = train['energy_star_rating'].notna()
train['direction_max_wind_speed_B'] = train['direction_max_wind_speed'].notna()
train['direction_peak_wind_speed_B'] = train['direction_peak_wind_speed'].notna()
train['max_wind_speed_B'] = train['max_wind_speed'].notna()

In [None]:
# test set
# Treat year_built == 0 as missing. The result is assigned back to the column instead
# of calling replace/fillna with inplace=True on test['year_built']: that chained
# in-place form is deprecated and does not write through under pandas Copy-on-Write.
test['year_built'] = test['year_built'].replace(0, np.nan)

# Record which rows had a real year_built BEFORE imputing; taking notna() after the
# fill (as this cell did previously) yields True on every row.
year_built_observed = test['year_built'].notna()

# Impute the gaps with the mean of the observed years in this frame.
# NOTE(review): this uses the test split's own mean, as the original cell did;
# consider reusing the training mean so both splits are imputed with the same value.
A = test['year_built'].mean()
test['year_built'] = test['year_built'].fillna(A)

# Engineered features: ELEVATION times floor_area, and ELEVATION times sqrt(floor_area).
test['vol'] = test['ELEVATION'] * test['floor_area']
test['face'] = test['ELEVATION'] * np.sqrt(test['floor_area'])

# Presence indicators (True = value was present) for the columns that contain nulls.
test['days_with_fog_B'] = test['days_with_fog'].notna()
test['year_built_B'] = year_built_observed
test['energy_star_rating_B'] = test['energy_star_rating'].notna()
test['direction_max_wind_speed_B'] = test['direction_max_wind_speed'].notna()
test['direction_peak_wind_speed_B'] = test['direction_peak_wind_speed'].notna()
test['max_wind_speed_B'] = test['max_wind_speed'].notna()

# Dealing with categorical variables (categories need to be merged)

In [None]:
# A cell only echoes its last expression, so the first two tables were silently
# discarded; display() renders all three frequency tables.
display(
    train['State_Factor'].value_counts(),
    train['facility_type'].value_counts(),
    facility_type_category['facility_type_category'].value_counts(),
)

In [None]:
# A cell only echoes its last expression, so the first table was silently discarded;
# display() renders both frequency tables.
display(
    test['State_Factor'].value_counts(),
    test['facility_type'].value_counts(),
)
# code the category and compare the distribution

In [None]:
# An earlier row-by-row attempt at attaching facility_type_category lived here as a
# bare triple-quoted string, which the notebook echoed back as cell output. It compared
# a row number against the whole facility_type column and was never executed; the
# category is attached with pd.merge on 'facility_type' instead.

In [None]:
# train
# Attach the facility_type_category label to every training row.
# A left join keeps exactly the training rows, in their original order. The previous
# outer join also emitted facility types that exist only in the lookup table (rows
# with no site_eui) and ordered the result by the join key.
df1 = pd.DataFrame(train)
# Keep one lookup row per facility_type so the join cannot multiply training rows.
df2 = pd.DataFrame(facility_type_category).drop_duplicates(subset='facility_type')
train2 = pd.merge(df1, df2, on='facility_type', how='left')
assert len(train2) == len(train), "merge must not add or drop training rows"
train2

In [None]:
# test
# Attach the facility_type_category label to every test row.
# A left join keeps exactly the test rows, in their original order. That matters
# because predictions made from test2 are later paired with ids taken from test; the
# previous outer join ordered rows by the join key and could add lookup-only rows.
df3 = pd.DataFrame(test)
# Keep one lookup row per facility_type so the join cannot multiply test rows.
df2 = pd.DataFrame(facility_type_category).drop_duplicates(subset='facility_type')
test2 = pd.merge(df3, df2, on='facility_type', how='left')
assert len(test2) == len(test), "merge must not add or drop test rows"
test2

# Check the distribution of numerical variables in training data

In [None]:
# Collect the names of all numeric-dtype columns in the training frame.
num = train.select_dtypes(include='number').columns.tolist()
print(f'There are {len(num)} numerical columns in the dataset')
num


In [None]:
# Hand-picked columns for the plots below (replaces the auto-detected list):
# raw measurements, the target, the presence indicators and the engineered features.
num = [
    'floor_area',
    'energy_star_rating',
    'ELEVATION',
    'cooling_degree_days',
    'heating_degree_days',
    'precipitation_inches',
    'snowfall_inches',
    'snowdepth_inches',
    'avg_temp',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog',
    'site_eui',
    'direction_max_wind_speed_B',
    'direction_peak_wind_speed_B',
    'max_wind_speed_B',
    'days_with_fog_B',
    'year_built_B',
    'year_built',
    'vol',
    'face',
]

In [None]:
# Check distribution (num)
# Size the grid from the number of columns instead of a hard-coded 32x2 layout,
# keeping the original 2.5 inches of height per row.
n_grid_cols = 2
n_grid_rows = -(-len(num) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 2.5 * n_grid_rows), squeeze=False
)
panels = axes.ravel()
for ax, col in zip(panels, num):
    # histplot replaces the deprecated distplot; stat='density' plus kde=True keeps
    # the same normalised-histogram-with-density-curve view. The float cast lets the
    # boolean indicator columns be drawn as 0/1.
    values = train[col].dropna().astype(float)
    sns.histplot(values, kde=True, stat='density', color='blue', ax=ax)
    ax.set_title(f'Distribution of {col}')
# Hide any grid cell left over when len(num) is odd.
for ax in panels[len(num):]:
    ax.set_visible(False)
# Show the plot
fig.tight_layout()
plt.show()

# transformation + scaling

In [None]:
num_trans = ['floor_area','energy_star_rating','ELEVATION', 'cooling_degree_days',
 'heating_degree_days',
 'precipitation_inches',
 'snowfall_inches',
 'snowdepth_inches',
 'avg_temp',
 'days_with_fog','vol','face'
 ]


def _signed_log1p(values):
    """Return sign(x) * log(1 + |x|) element-wise; missing values stay missing.

    Plain np.log yields -inf at 0 and NaN for negative inputs, and those problems go
    unnoticed here because warnings are silenced globally. This variant is finite for
    every finite input and maps 0 to 0.
    """
    return np.sign(values) * np.log1p(np.abs(values))


# Add a "<column>_log" companion for every listed column, on both splits.
for each in num_trans:
    newvar = each+"_log"
    train2[newvar] = _signed_log1p(train2[each])
    test2[newvar] = _signed_log1p(test2[each])

# Correlation between explanatory variables

In [None]:
def plt_corr(df, figsize):
    """Draw a heatmap of the pairwise correlations between the columns of ``df``.

    The upper triangle and the diagonal are masked, so each pair appears once.

    Parameters
    ----------
    df : DataFrame whose ``corr()`` matrix is plotted.
    figsize : value forwarded to ``plt.figure(figsize=...)``.

    Uses the module-level ``cmap`` palette. Shows the figure and returns None.
    """
    correlations = df.corr()

    # True on and above the diagonal; heatmap hides cells where the mask is True.
    hidden_cells = np.triu(np.ones(correlations.shape, dtype=bool))

    plt.figure(figsize=figsize)
    sns.set(font_scale=1)
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        cmap=cmap,
        center=0,
        linewidths=1,
        fmt=".2f",
    )

    plt.title('Correlation between numerical features')
    plt.show()

In [None]:
# Figure size handed to plt_corr, which forwards it to plt.figure.
figsize = (100, 150)
plt_corr(df=train[num], figsize=figsize)

# Relationship vs target

In [None]:
# Mean site_eui, overall or per group
def calc_EUI(dataframe, column_names=None):
    """Return the mean of ``site_eui``, rounded to two decimals.

    Parameters
    ----------
    dataframe : DataFrame containing a ``site_eui`` column.
    column_names : optional grouping key(s) passed to ``DataFrame.groupby``.
        ``None`` or an empty list/tuple means "no grouping".

    Returns
    -------
    A scalar (overall mean) when no grouping is requested, otherwise a Series of
    per-group means indexed by the grouping key(s). Groups whose mean is NaN
    (no non-missing ``site_eui``) are reported as 0.
    """
    # `is None` rather than `!= None`; an empty key list is treated like None because
    # groupby cannot group by zero keys.
    no_grouping = column_names is None or (
        isinstance(column_names, (list, tuple)) and len(column_names) == 0
    )

    if no_grouping:
        # Overall mean across the whole frame
        mean_EUI = dataframe['site_eui'].mean()
    else:
        # Mean EUI per group
        mean_EUI = dataframe.groupby(column_names)['site_eui'].mean()
        # Fill missing values with 0
        mean_EUI = mean_EUI.fillna(0)

    return round(mean_EUI, 2)

# Mean site_eui over the whole training frame, used as the baseline.
original_EUI = calc_EUI(dataframe=train)
print(f'The average EUI for the full dataset is : {original_EUI}')

In [None]:
# Mean EUI as a function of each column in `num`, one panel per column.
plt.figure(figsize=(15, 66), dpi=80)

for panel_no, col in enumerate(num, start=1):
    # Mean site_eui for every distinct value of this column
    eui = calc_EUI(train, [col])

    plt.subplot(22, 3, panel_no)
    plt.plot(eui.index, eui)
    plt.ylabel('EUI')
    plt.title(f'EUI vs {col}')
    # Dashed reference line at the dataset-wide mean
    plt.axhline(y=original_EUI, color='orange', linestyle='--', alpha=0.7)

# Show the plot
plt.tight_layout()
plt.show()

# Resampling (Reweight)

# Model Specification

In [None]:
# ***Add code here to build your predictive models
#Final_cols = ['energy_star_rating','ELEVATION', 'cooling_degree_days',
# 'heating_degree_days',
# 'precipitation_inches',
# 'snowfall_inches',
# 'snowdepth_inches',
# 'avg_temp',
# 'days_with_fog_B','year_built','face','site_eui','State_Factor','facility_type_category']

#Final_col2 = ['energy_star_rating','ELEVATION', 'cooling_degree_days',
 #'heating_degree_days',
 #'precipitation_inches',
 #'snowfall_inches',
 #'snowdepth_inches',
 #'avg_temp',
 #'days_with_fog_B','year_built','face','site_eui','facility_type_category']

#Final_DF=train2[Final_cols]
#Final_DF2=train2[Final_col2]
#Final_DF=Final_DF.dropna()
#Final_DF2=Final_DF2.dropna()

In [None]:
#from imblearn.over_sampling import SMOTE
#smote = SMOTE(random_state=888)
#X_resampled, y_resampled = smote.fit_resample(Final_DF2, Final_DF['State_Factor'])

In [None]:
# Continuous / indicator columns fed to the model as-is.
feature_cols_con = ['energy_star_rating','ELEVATION', 'cooling_degree_days',
 'heating_degree_days',
 'precipitation_inches',
 'snowfall_inches',
 'snowdepth_inches',
 'avg_temp','direction_max_wind_speed',
 'direction_peak_wind_speed',
 'max_wind_speed',
 'days_with_fog',
 'direction_max_wind_speed_B',
 'direction_peak_wind_speed_B',
 'max_wind_speed_B',
 'days_with_fog_B','year_built_B','year_built','face']
# Categorical columns, one-hot encoded below.
feature_cols_cat = ['building_class']

# --- training matrix ---
# Rows without a target cannot be fitted; drop them and renumber so the continuous
# and one-hot blocks are concatenated on the same 0..n-1 index.
train_rows = train2[train2['site_eui'].notna()].reset_index(drop=True)

features_con = train_rows[feature_cols_con]
features_cat = pd.get_dummies(train_rows[feature_cols_cat])
features = pd.concat([features_con, features_cat], axis=1)

X = features # Features
y = train_rows['site_eui'] # Target variable

# --- test matrix ---
# prepare test set
features_con = test2[feature_cols_con].reset_index(drop=True)
features_cat = pd.get_dummies(test2[feature_cols_cat]).reset_index(drop=True)
features = pd.concat([features_con, features_cat], axis=1)
# Encoding each split on its own can produce different dummy columns when a category
# is absent from one split. Force the training layout: missing dummies become 0,
# unseen ones are dropped, and the column order matches X.
features = features.reindex(columns=X.columns, fill_value=0)
X_test = features

In [None]:
# Hyper-parameters for the CatBoost regressor, kept as named constants.
SEED = 2022
MODEL_TASK_TYPE = 'CPU'#'GPU'
MODEL_ITERATIONS = 28000
MODEL_RL = 0.025
MODEL_MAX_DEPTH = 12
MODEL_LOSS_FUNCTION = 'RMSE'
MODEL_EVAL_METRIC ='RMSE'
MODEL_ESR = 10
MODEL_VERBOSE = 1000

catboost_params = {
    'verbose': MODEL_VERBOSE,
    'early_stopping_rounds': MODEL_ESR,
    'random_seed': SEED,
    'max_depth': MODEL_MAX_DEPTH,
    'task_type': MODEL_TASK_TYPE,
    'learning_rate': MODEL_RL,
    'iterations': MODEL_ITERATIONS,
    'loss_function': MODEL_LOSS_FUNCTION,
    'eval_metric': MODEL_EVAL_METRIC,
}

model = CatBoostRegressor(**catboost_params)
# NOTE(review): fit() receives no eval_set, so early_stopping_rounds presumably never
# triggers and all iterations run - confirm against the CatBoost docs.
model.fit(X, y)

# Predict and Submission

In [None]:
#submission
pred_test = model.predict(X_test)

# X_test is built from test2, so predictions follow test2's row order. Pair each
# prediction with the id from the same test2 row rather than with test['id'], whose
# order can differ after the facility-type merge.
sub = pd.DataFrame({'id': test2['id'].to_numpy(), 'site_eui': pred_test})
# Discard rows that do not belong to a test building (no id) and any id repeated by
# the merge, then put the rows back in the order of the original test file.
sub = sub.dropna(subset=['id']).drop_duplicates(subset='id')
sub = (
    sub.set_index('id')
       .reindex(test['id'].to_numpy())
       .rename_axis('id')
       .reset_index()
)
sub['id'] = sub['id'].astype(test['id'].dtype)

sub.to_csv('submission.csv', index=False)
sub.head()