In [1]:
import pandas as pd
import lightgbm  as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

dir_loc = '/media/td/Samsung_T5/kaggle_data/ASHRAE_2019-10-15/ashrae-energy-prediction'
building_metadata_file_name = 'building_metadata.csv'
sample_submission_file_name = 'sample_submission.csv'
test_file_name = 'test.csv'
train_file_name = 'train.csv'
weather_test_file_name = 'weather_test.csv'
weather_train_file_name = 'weather_train.csv'

## File characteristics

In [2]:
building_metadata_df = pd.read_csv(f'{dir_loc}/{building_metadata_file_name}')

In [3]:
building_metadata_df.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [4]:
building_metadata_df.shape

(1449, 6)

In [5]:
building_metadata_df.columns

Index(['site_id', 'building_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count'],
      dtype='object')

In [6]:
building_metadata_df['site_id'].value_counts()

3     274
13    154
2     135
15    124
9     124
0     105
14    102
4      91
5      89
8      70
1      51
6      44
12     36
10     30
7      15
11      5
Name: site_id, dtype: int64

In [7]:
building_metadata_df['building_id'].value_counts()

1448    1
475     1
477     1
478     1
479     1
       ..
966     1
967     1
968     1
969     1
0       1
Name: building_id, Length: 1449, dtype: int64

In [8]:
building_metadata_df['primary_use'].value_counts(normalize = True)

Education                        0.378882
Office                           0.192547
Entertainment/public assembly    0.126984
Public services                  0.107660
Lodging/residential              0.101449
Other                            0.017253
Healthcare                       0.015873
Parking                          0.015183
Warehouse/storage                0.008972
Manufacturing/industrial         0.008282
Retail                           0.007591
Services                         0.006901
Technology/science               0.004141
Food sales and service           0.003451
Utility                          0.002761
Religious worship                0.002070
Name: primary_use, dtype: float64

In [9]:
building_metadata_df['square_feet'].describe()

count      1449.000000
mean      92111.776398
std      110769.950997
min         283.000000
25%       23012.000000
50%       57673.000000
75%      115676.000000
max      875000.000000
Name: square_feet, dtype: float64

In [10]:
building_metadata_df['year_built'].describe()

count     675.000000
mean     1967.957037
std        31.054030
min      1900.000000
25%      1949.000000
50%      1970.000000
75%      1995.000000
max      2017.000000
Name: year_built, dtype: float64

In [11]:
building_metadata_df['floor_count'].describe()

count    355.000000
mean       3.740845
std        3.333683
min        1.000000
25%        1.000000
50%        3.000000
75%        5.000000
max       26.000000
Name: floor_count, dtype: float64

In [12]:
building_metadata_df.isnull().sum(axis = 0) / building_metadata_df.shape[0]

site_id        0.000000
building_id    0.000000
primary_use    0.000000
square_feet    0.000000
year_built     0.534161
floor_count    0.755003
dtype: float64

In [13]:
sample_submission_df = pd.read_csv(f'{dir_loc}/{sample_submission_file_name}')
sample_submission_df.head()

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [14]:
test_df = pd.read_csv(f'{dir_loc}/{test_file_name}')
train_df = pd.read_csv(f'{dir_loc}/{train_file_name}')

train_df['train_test'] = 'train'
test_df['train_test'] = 'test'
train_test_df = pd.concat([test_df, train_df])
train_test_df['building_id'] = train_test_df['building_id'].astype(np.int16)
train_test_df['meter'] = train_test_df['meter'].astype(np.int8)

del test_df, train_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [15]:
weather_test_df = pd.read_csv(f'{dir_loc}/{weather_test_file_name}')
weather_test_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6
1,0,2017-01-01 01:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1
2,0,2017-01-01 02:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1
3,0,2017-01-01 03:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1
4,0,2017-01-01 04:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6


In [16]:
weather_train_df = pd.read_csv(f'{dir_loc}/{weather_train_file_name}')
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [17]:
weather_test_df.describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,277243.0,277139.0,136795.0,276916.0,181655.0,255978.0,264873.0,276783.0
mean,7.484654,14.276863,2.187317,7.487658,1.095395,1016.195762,180.426023,3.548873
std,4.616959,10.741605,2.620627,10.153973,9.163511,7.940392,110.456872,2.317184
min,0.0,-28.1,0.0,-31.6,-1.0,972.0,0.0,0.0
25%,3.0,7.2,0.0,0.6,0.0,1011.6,80.0,2.1
50%,7.0,15.0,2.0,8.4,0.0,1016.3,190.0,3.1
75%,12.0,22.2,4.0,15.0,0.0,1020.8,280.0,4.6
max,15.0,48.3,9.0,26.7,597.0,1050.1,360.0,24.2


In [18]:
weather_train_df.describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,139773.0,139718.0,70600.0,139660.0,89484.0,129155.0,133505.0,139469.0
mean,7.478977,14.418106,2.149306,7.350158,0.983047,1016.158038,180.526632,3.560527
std,4.604744,10.626595,2.59915,9.790235,8.463678,7.629684,111.523629,2.335874
min,0.0,-28.9,0.0,-35.0,-1.0,968.2,0.0,0.0
25%,3.0,7.2,0.0,0.6,0.0,1011.8,80.0,2.1
50%,7.0,15.0,2.0,8.3,0.0,1016.4,190.0,3.1
75%,11.0,22.2,4.0,14.4,0.0,1020.8,280.0,5.0
max,15.0,47.2,9.0,26.1,343.0,1045.5,360.0,19.0


In [19]:
weather_df = pd.concat([weather_train_df, weather_test_df])

In [20]:
weather_df.isnull().sum(axis = 0) / weather_df.shape[0]

site_id               0.000000
timestamp             0.000000
air_temperature       0.000381
cloud_coverage        0.502669
dew_temperature       0.001055
precip_depth_1_hr     0.349812
sea_level_pressure    0.076455
wind_direction        0.044694
wind_speed            0.001832
dtype: float64

In [21]:
weather_train_df.isnull().sum(axis = 0) / weather_train_df.shape[0]

site_id               0.000000
timestamp             0.000000
air_temperature       0.000393
cloud_coverage        0.494895
dew_temperature       0.000808
precip_depth_1_hr     0.359791
sea_level_pressure    0.075966
wind_direction        0.044844
wind_speed            0.002175
dtype: float64

In [22]:
weather_test_df.isnull().sum(axis = 0) / weather_test_df.shape[0]

site_id               0.000000
timestamp             0.000000
air_temperature       0.000375
cloud_coverage        0.506588
dew_temperature       0.001179
precip_depth_1_hr     0.344781
sea_level_pressure    0.076702
wind_direction        0.044618
wind_speed            0.001659
dtype: float64

In [23]:
del weather_test_df, weather_train_df

In [24]:
weather_df['air_temperature'].describe()

count    416857.000000
mean         14.324204
std          10.703390
min         -28.900000
25%           7.200000
50%          15.000000
75%          22.200000
max          48.300000
Name: air_temperature, dtype: float64

In [25]:
weather_df['cloud_coverage'].describe()

count    207395.000000
mean          2.174377
std           2.613391
min           0.000000
25%           0.000000
50%           2.000000
75%           4.000000
max           9.000000
Name: cloud_coverage, dtype: float64

In [26]:
weather_df[weather_df['cloud_coverage'].isnull()].describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,209621.0,209482.0,0.0,209318.0,116077.0,187854.0,202752.0,209231.0
mean,7.280988,12.67867,,7.494326,2.236714,1015.596414,184.63151,3.714822
std,4.539802,10.636721,,10.27656,12.7827,7.849045,107.925446,2.301223
min,0.0,-28.8,,-35.0,-1.0,972.0,0.0,0.0
25%,4.0,5.6,,0.6,0.0,1011.1,90.0,2.1
50%,7.0,13.5,,8.9,0.0,1015.8,190.0,3.6
75%,11.0,20.6,,15.0,0.0,1020.3,280.0,5.1
max,15.0,46.7,,26.7,597.0,1046.5,360.0,21.6


In [27]:
weather_df[~weather_df['cloud_coverage'].isnull()].describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,207395.0,207375.0,207395.0,207258.0,155062.0,197279.0,195626.0,207021.0
mean,7.68668,15.986456,2.174377,7.38827,0.176188,1016.741779,176.136004,3.389003
std,4.676727,10.511988,2.613391,9.782034,3.947172,7.785679,113.571794,2.334338
min,0.0,-28.9,0.0,-31.7,-1.0,968.2,0.0,0.0
25%,3.0,8.9,0.0,0.6,0.0,1012.3,80.0,2.1
50%,8.0,16.1,2.0,7.9,0.0,1016.8,180.0,3.1
75%,12.0,23.9,4.0,14.4,0.0,1021.3,280.0,4.6
max,15.0,48.3,9.0,26.7,442.0,1050.1,360.0,24.2


Weather is warmer where cloud coverage is null. It is possible that cloud coverage is not included in clear sky weather. Filling nans with 0s for now + including a flag.

In [28]:
weather_df['dew_temperature'].describe()

count    416576.000000
mean          7.441560
std          10.033695
min         -35.000000
25%           0.600000
50%           8.300000
75%          15.000000
max          26.700000
Name: dew_temperature, dtype: float64

In [29]:
weather_df['wind_direction']

0           0.0
1          70.0
2           0.0
3           0.0
4         250.0
          ...  
277238    150.0
277239    140.0
277240    140.0
277241    140.0
277242    140.0
Name: wind_direction, Length: 417016, dtype: float64

In [30]:
weather_df[weather_df['precip_depth_1_hr'].isnull()].describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,145877.0,145843.0,52333.0,145748.0,0.0,118217.0,144378.0,145836.0
mean,8.162562,10.017472,3.154854,5.484872,,1016.008018,202.635221,4.121204
std,4.662359,9.213586,3.193189,8.718865,,9.500688,98.301414,2.423031
min,0.0,-28.8,0.0,-35.0,,968.2,0.0,0.0
25%,5.0,4.7,0.0,1.0,,1010.7,120.0,2.6
50%,7.0,10.6,2.0,7.0,,1016.6,220.0,3.6
75%,12.0,16.2,7.0,11.7,,1022.2,280.0,5.7
max,15.0,39.4,9.0,25.6,,1045.5,360.0,23.0


In [31]:
weather_df[~weather_df['precip_depth_1_hr'].isnull()].describe()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
count,271139.0,271014.0,155062.0,270828.0,271139.0,266916.0,254000.0,270416.0
mean,7.117003,16.64182,1.843469,8.494566,1.058317,1016.26066,167.854811,3.246224
std,4.544138,10.732103,2.293401,10.524978,8.938746,6.974033,115.452008,2.208037
min,0.0,-28.9,0.0,-31.7,-1.0,973.2,0.0,0.0
25%,3.0,9.4,0.0,0.6,0.0,1012.0,60.0,2.1
50%,8.0,17.8,2.0,10.0,0.0,1016.2,170.0,3.1
75%,10.0,24.4,4.0,17.2,0.0,1020.3,270.0,4.6
max,15.0,48.3,9.0,26.7,597.0,1050.1,360.0,24.2


Not sure how to input precip_depth_1_hr. It is missing in colder weather. Might be invalid in snowy conditions. I will fill with median and include a flag but that seems incorrect. I should make a sample of weather_df with otherwise similar characteristics to the nan values and fill with the median of that sample.


In [32]:
train_test_df.groupby('building_id')['meter'].count()

building_id
0       26304
1       26304
2       26304
3       26304
4       26304
        ...  
1444    24965
1445    24969
1446    24992
1447    24991
1448    24972
Name: meter, Length: 1449, dtype: int64

In [33]:
#train_test_df.groupby('building_id')['meter'].nunique().sort_values('building_id', ascending = False)

In [34]:
#multiple_meters_df = train_test_df[train_test_df.duplicated(subset=['building_id', 'timestamp'], keep=False)]

In [35]:
#multiple_meters_df

In [36]:
#multiple_meters_df.sort_values(['building_id', 'timestamp']).head(10)

In [37]:
weather_df.columns

Index(['site_id', 'timestamp', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed'],
      dtype='object')

## Data processing

In [38]:
building_metadata_df['floor_count'] = building_metadata_df['floor_count'].fillna(building_metadata_df['floor_count'].median())
building_metadata_df['year_built'] = building_metadata_df['year_built'].fillna(building_metadata_df['year_built'].mean())


weather_df = weather_df.sort_values(['site_id', 'timestamp'])

weather_df['air_temperature'] = weather_df['air_temperature'].fillna(method='ffill')
weather_df['dew_temperature'] = weather_df['dew_temperature'].fillna(method='ffill')
weather_df['wind_speed'] = weather_df['wind_speed'].fillna(method='ffill')

weather_df['air_temperature'] = weather_df['air_temperature'].fillna(weather_df['air_temperature'].median())
weather_df['cloud_coverage_present_flag'] = weather_df['cloud_coverage'].apply(lambda x: 0 if pd.isna(x) else 1)
weather_df['cloud_coverage'] = weather_df['cloud_coverage'].fillna(0)
weather_df['dew_temperature'] = weather_df['dew_temperature'].fillna(weather_df['dew_temperature'].median())
weather_df['precip_depth_1_hr_present_flag'] = weather_df['precip_depth_1_hr'].apply(lambda x: 0 if pd.isna(x) else 1)
weather_df['precip_depth_1_hr'] = weather_df['precip_depth_1_hr'].fillna(weather_df['precip_depth_1_hr'].median())
weather_df['wind_direction_present_flag'] = weather_df['wind_direction'].apply(lambda x: 0 if pd.isna(x) else 1)
weather_df['wind_speed'] = weather_df['wind_speed'].fillna(weather_df['wind_speed'].median())

In [39]:
import math

weather_df['wind_direction_north'] = weather_df['wind_direction'].apply(lambda x:  math.sin(math.radians(x)) if not pd.isna(x) else x)
weather_df['wind_direction_east'] = weather_df['wind_direction'].apply(lambda x:  math.sin(math.radians(x)) if not pd.isna(x) else x)

weather_df['wind_direction'] = weather_df['wind_direction'].fillna(0)
weather_df['wind_direction_north'] = weather_df['wind_direction_north'].fillna(0)
weather_df['wind_direction_east'] = weather_df['wind_direction_east'].fillna(0)


In [40]:
primary_use_le = LabelEncoder()
building_metadata_df['primary_use'] = primary_use_le.fit_transform(building_metadata_df['primary_use'])


In [41]:
train_test_df['timestamp_dt'] = pd.to_datetime(train_test_df['timestamp'])


In [42]:
train_test_df['hour_of_day'] = train_test_df['timestamp_dt'].dt.hour
train_test_df['day_of_week'] = train_test_df['timestamp_dt'].dt.dayofweek
train_test_df['month_of_year'] = train_test_df['timestamp_dt'].dt.month
train_test_df['day_of_month'] = train_test_df['timestamp_dt'].dt.day
train_test_df = train_test_df.drop('timestamp_dt', axis = 1)



In [43]:
train_test_df = train_test_df.merge(building_metadata_df, how='left')
del building_metadata_df
train_test_df = train_test_df.merge(weather_df, how='left')
del weather_df

In [44]:
train_test_df.shape

(61913700, 27)

In [45]:
train_test_df.columns

Index(['building_id', 'meter', 'meter_reading', 'row_id', 'timestamp',
       'train_test', 'hour_of_day', 'day_of_week', 'month_of_year',
       'day_of_month', 'site_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'cloud_coverage_present_flag',
       'precip_depth_1_hr_present_flag', 'wind_direction_present_flag',
       'wind_direction_north', 'wind_direction_east'],
      dtype='object')

## Time series analysis

Can time series meter values be used for test data?

In [46]:
timeseries_columns = ['air_temperature', 'cloud_coverage', 'dew_temperature', 
                      'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed', 'wind_direction_north', 'wind_direction_east']

def n_day_func(df, n, func = 'avg'):
    if func == 'avg':
        df_group = df.groupby(['building_id', 'meter']).rolling(window=n)[timeseries_columns].mean().fillna(method='ffill')
    if func == 'std':
        df_group = df.groupby(['building_id', 'meter']).rolling(window=n)[timeseries_columns].std().fillna(method='ffill')
    if func == 'diff':
        df_group = df[timeseries_columns + ['building_id', 'meter', 'timestamp']]
        df_group = df_group.sort_values(by=['building_id', 'meter', 'timestamp'])
        df_group = df[timeseries_columns].diff(periods = n).fillna(method='ffill')
  
    df_group = df_group.fillna(df_group.median()).astype('float32')
    
    df_group.columns = [f"f_{n}_hour_{func}_" + str(i) for i in df_group.columns]
    df_group.index = df.index
    df_group = df.join(df_group)
    return df_group
    
    

In [47]:
train_test_df = train_test_df.sort_values('timestamp')
train_test_df.shape

(61913700, 27)

In [48]:
import gc
gc.collect()

40

In [49]:
train_test_df = n_day_func(train_test_df, 24, func = 'std')
gc.collect()

0

In [50]:
for i in [-7*24, -24, -1, 1, 24, 7*24]:
    print(i)
    train_test_df = n_day_func(train_test_df, i, func = 'diff')
    gc.collect()

-168
-24
-1
1
24
168


In [51]:
train_test_df.columns

Index(['building_id', 'meter', 'meter_reading', 'row_id', 'timestamp',
       'train_test', 'hour_of_day', 'day_of_week', 'month_of_year',
       'day_of_month', 'site_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'cloud_coverage_present_flag',
       'precip_depth_1_hr_present_flag', 'wind_direction_present_flag',
       'wind_direction_north', 'wind_direction_east',
       'f_24_hour_std_air_temperature', 'f_24_hour_std_cloud_coverage',
       'f_24_hour_std_dew_temperature', 'f_24_hour_std_precip_depth_1_hr',
       'f_24_hour_std_sea_level_pressure', 'f_24_hour_std_wind_speed',
       'f_24_hour_std_wind_direction_north',
       'f_24_hour_std_wind_direction_east', 'f_-168_hour_diff_air_temperature',
       'f_-168_hour_diff_cloud_coverage', 'f_-168_hour_diff_dew_temperature',
       'f_-168_hour_diff_precip_depth_

In [52]:
# train_df['f_3_hour_avg_air_temperature'].describe()

In [53]:
train_test_df.columns

Index(['building_id', 'meter', 'meter_reading', 'row_id', 'timestamp',
       'train_test', 'hour_of_day', 'day_of_week', 'month_of_year',
       'day_of_month', 'site_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'cloud_coverage_present_flag',
       'precip_depth_1_hr_present_flag', 'wind_direction_present_flag',
       'wind_direction_north', 'wind_direction_east',
       'f_24_hour_std_air_temperature', 'f_24_hour_std_cloud_coverage',
       'f_24_hour_std_dew_temperature', 'f_24_hour_std_precip_depth_1_hr',
       'f_24_hour_std_sea_level_pressure', 'f_24_hour_std_wind_speed',
       'f_24_hour_std_wind_direction_north',
       'f_24_hour_std_wind_direction_east', 'f_-168_hour_diff_air_temperature',
       'f_-168_hour_diff_cloud_coverage', 'f_-168_hour_diff_dew_temperature',
       'f_-168_hour_diff_precip_depth_

## Baseline model

In [54]:
invalid_features = ['train_test', 'meter_reading', 'timestamp', 'row_id']
feature_cols = [i for i in train_test_df.columns if i not in invalid_features]
target = 'meter_reading'


In [55]:
train_df = train_test_df[train_test_df['train_test'] == 'train']
train_test_df = train_test_df[train_test_df['train_test'] == 'test']
test_df = train_test_df.copy()
del train_test_df

In [56]:
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    "learning_rate": 0.1,
    "num_leaves":256
}
num_boost_round = 1000000
early_stopping_rounds = 10



split_train_df, split_val_df = train_test_split(train_df, random_state = 1)

lgtrain = lgb.Dataset(split_train_df[feature_cols], split_train_df[target])
lgvalid = lgb.Dataset(split_val_df[feature_cols], split_val_df[target])

model = lgb.train(lgbm_params, lgtrain,
                 valid_sets=[lgtrain, lgvalid],
                 num_boost_round=num_boost_round,
                 valid_names=['train', 'valid'],
                 early_stopping_rounds=early_stopping_rounds,
                 verbose_eval=10)

test_df['meter_reading'] = model.predict(test_df[feature_cols], num_iteration=model.best_iteration)
test_df = test_df[['row_id', 'meter_reading']]
test_df['row_id'] = test_df['row_id'].astype('int32')
test_df = test_df.sort_values('row_id')
test_df['meter_reading'] = test_df['meter_reading'].apply(lambda x: max(x, 0))
test_df.to_csv(f'{dir_loc}/preds.csv.zip', index = False, compression = 'zip')

fi = model.feature_importance(iteration=model.best_iteration, importance_type='gain')
feature_importance_df = pd.DataFrame.from_dict([{'column': i, 'feature_importance':j} for i, j in zip(feature_cols, fi)])

Training until validation scores don't improve for 10 rounds
[10]	train's l2: 3.71768e+09	valid's l2: 3.55302e+09
[20]	train's l2: 9.02197e+08	valid's l2: 1.28182e+09
[30]	train's l2: 4.02939e+08	valid's l2: 9.46725e+08
[40]	train's l2: 2.56279e+08	valid's l2: 8.68018e+08
[50]	train's l2: 1.83873e+08	valid's l2: 8.48612e+08
[60]	train's l2: 1.44798e+08	valid's l2: 8.37954e+08
[70]	train's l2: 1.22036e+08	valid's l2: 8.25431e+08
[80]	train's l2: 1.08232e+08	valid's l2: 8.22029e+08
[90]	train's l2: 9.90828e+07	valid's l2: 8.18648e+08
[100]	train's l2: 9.3997e+07	valid's l2: 8.16871e+08
[110]	train's l2: 8.84816e+07	valid's l2: 8.13906e+08
[120]	train's l2: 8.56132e+07	valid's l2: 8.134e+08
[130]	train's l2: 8.25165e+07	valid's l2: 8.13243e+08
[140]	train's l2: 7.9712e+07	valid's l2: 8.12175e+08
[150]	train's l2: 7.45513e+07	valid's l2: 8.11135e+08
[160]	train's l2: 7.19887e+07	valid's l2: 8.09614e+08
[170]	train's l2: 7.11067e+07	valid's l2: 8.09358e+08
[180]	train's l2: 6.96598e+07	vali

In [57]:
feature_importance_df.sort_values('feature_importance', ascending = False).head(50)

Unnamed: 0,column,feature_importance
4,month_of_year,9.767871e+17
1,meter,2.644581e+17
0,building_id,1.990903e+17
5,day_of_month,1.235557e+17
13,dew_temperature,1.148393e+17
8,square_feet,6.650722e+16
3,day_of_week,3.4741e+16
11,air_temperature,2.355488e+16
15,sea_level_pressure,1.621491e+16
2,hour_of_day,1.465659e+16
