<a href="https://colab.research.google.com/github/saurabh-parkar/Time-Series-Forecasting-for-Energy-Prediction/blob/master/lgbm_and_classical_time_series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [0]:
train_df = pd.read_csv("/content/drive/My Drive/Ashrae data/train.csv")

# Remove outliers
train_df = train_df [ train_df['building_id'] != 1099 ]
train_df = train_df.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

building_df = pd.read_csv('/content/drive/My Drive/Ashrae data/building_metadata.csv')
weather_df = pd.read_csv('/content/drive/My Drive/Ashrae data/weather_train.csv')

In [0]:
weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
weather_df["day"] = weather_df["datetime"].dt.day
weather_df["week"] = weather_df["datetime"].dt.week
weather_df["month"] = weather_df["datetime"].dt.month
air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
#weather_df.update(air_temperature_filler,overwrite=False)

In [0]:
def fill_weather_dataset(weather_df):
  
  # Find Missing Dates
  time_format = "%Y-%m-%d %H:%M:%S"
  start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
  end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
  total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
  hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

  missing_hours = []
  for site_id in range(16):
      site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
      new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
      new_rows['site_id'] = site_id
      weather_df = pd.concat([weather_df,new_rows])

      weather_df = weather_df.reset_index(drop=True)           

  # Add new Features
  weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
  weather_df["day"] = weather_df["datetime"].dt.day
  weather_df["week"] = weather_df["datetime"].dt.week
  weather_df["month"] = weather_df["datetime"].dt.month

  # Reset Index for Fast Update
  weather_df = weather_df.set_index(['site_id','day','month'])

  air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
  weather_df.update(air_temperature_filler,overwrite=False)

  # Step 1
  cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
  # Step 2
  cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

  weather_df.update(cloud_coverage_filler,overwrite=False)

  due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
  weather_df.update(due_temperature_filler,overwrite=False)

  # Step 1
  sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
  # Step 2
  sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

  weather_df.update(sea_level_filler,overwrite=False)

  wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
  weather_df.update(wind_direction_filler,overwrite=False)

  wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
  weather_df.update(wind_speed_filler,overwrite=False)

  # Step 1
  precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
  # Step 2
  precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

  weather_df.update(precip_depth_filler,overwrite=False)

  weather_df = weather_df.reset_index()
  weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
      
  return weather_df

In [0]:
def features_engineering(df):
    
    # Sort by timestamp
    df.sort_values("timestamp")
    df.reset_index(drop=True)
    
    # Add more features
    df["timestamp"] = pd.to_datetime(df["timestamp"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["timestamp"].dt.hour
    df["weekend"] = df["timestamp"].dt.weekday
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]
    df["is_holiday"] = (df.timestamp.isin(holidays)).astype(int)
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    # Remove Unused Columns
    drop = ["year_built","floor_count"]
    df = df.drop(drop, axis=1)
    gc.collect()
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [0]:
weather_df = fill_weather_dataset(weather_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [0]:
weather_df

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,timestamp,wind_direction,wind_speed
0,0,25.000000,6.000000,20.000000,-0.173913,1019.700000,2016-01-01 00:00:00,0.000000,0.000000
1,0,24.400000,4.285714,21.100000,-1.000000,1020.200000,2016-01-01 01:00:00,70.000000,1.500000
2,0,22.800000,2.000000,21.100000,0.000000,1020.200000,2016-01-01 02:00:00,0.000000,0.000000
3,0,21.100000,2.000000,20.600000,0.000000,1020.100000,2016-01-01 03:00:00,0.000000,0.000000
4,0,20.000000,2.000000,20.000000,-1.000000,1020.000000,2016-01-01 04:00:00,250.000000,2.600000
...,...,...,...,...,...,...,...,...,...
140539,15,-3.534783,4.000000,-7.013043,-1.000000,1019.817391,2016-12-09 10:00:00,290.869565,6.565217
140540,15,-5.856522,1.500000,-8.982609,0.066667,1015.613043,2016-12-17 07:00:00,157.391304,5.695652
140541,15,-8.865217,0.000000,-12.730435,3.800000,1032.981818,2016-12-20 06:00:00,135.652174,3.500000
140542,15,2.630435,1.200000,-2.760870,-1.000000,1020.921739,2016-12-24 05:00:00,200.869565,4.847826


In [0]:
train_df = train_df.merge(building_df, left_on='building_id',right_on='building_id',how='left')
train_df = train_df.merge(weather_df,how='left',left_on=['site_id','timestamp'],right_on=['site_id','timestamp'])
del weather_df
gc.collect()

0

In [0]:
train_df_temp = features_engineering(train_df)

In [0]:
train_df_temp.head(10)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,105,0,2016-01-01,23.3036,1,0,10.832181,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
1,106,0,2016-01-01,0.3746,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
2,106,3,2016-01-01,0.0,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
3,107,0,2016-01-01,175.184,1,0,11.487946,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
4,108,0,2016-01-01,91.2653,1,0,11.309352,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
5,109,0,2016-01-01,80.93,1,0,10.950736,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
6,109,3,2016-01-01,0.0,1,0,10.950736,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
7,110,0,2016-01-01,86.2283,1,0,10.233331,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
8,111,0,2016-01-01,167.392,1,0,11.681309,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
9,112,0,2016-01-01,10.2748,1,0,10.379939,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1


In [0]:
train_df = train_df_temp.drop(columns="timestamp")

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,105,0,2016-01-01,23.3036,1,0,10.832181,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
1,106,0,2016-01-01,0.3746,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
2,106,3,2016-01-01,0.0,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
3,107,0,2016-01-01,175.184,1,0,11.487946,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
4,108,0,2016-01-01,91.2653,1,0,11.309352,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1


In [0]:
train_df_temp.to_csv('/content/drive/My Drive/Ashrae data/train_tidy_temp.csv')

---

In [0]:
train_df1 = pd.read_csv("C:\\Users\\Sanjan\\Desktop\\Ashrae data\\train_tidy_temp.csv")

---

In [0]:
train_df1.head()

Unnamed: 0.1,Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,0,105,0,2016-01-01 00:00:00,23.3036,1,0,10.832181,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
1,1,106,0,2016-01-01 00:00:00,0.3746,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
2,2,106,3,2016-01-01 00:00:00,0.0,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
3,3,107,0,2016-01-01 00:00:00,175.184,1,0,11.487946,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
4,4,108,0,2016-01-01 00:00:00,91.2653,1,0,11.309352,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1


In [0]:
train_df1 = train_df1.drop('Unnamed: 0',axis=1)
train_df1 = train_df1.drop('timestamp',axis=1)

#train_df1.columns

In [0]:
target = np.log1p(train_df1["meter_reading"])
features = train_df1.drop('meter_reading', axis = 1)
#del train_df
gc.collect()

27

In [0]:
categorical_features = ["building_id", "site_id", "meter", "primary_use", "is_holiday", "weekend"]
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

kf = KFold(n_splits=3)
models = []
for train_index,test_index in kf.split(features):
    train_features = features.loc[train_index]
    train_target = target.loc[train_index]
    
    test_features = features.loc[test_index]
    test_target = target.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.09098	valid_1's rmse: 1.24354
[50]	training's rmse: 0.88089	valid_1's rmse: 1.12603
[75]	training's rmse: 0.813926	valid_1's rmse: 1.11295
[100]	training's rmse: 0.767996	valid_1's rmse: 1.11261
[125]	training's rmse: 0.736803	valid_1's rmse: 1.11387
Early stopping, best iteration is:
[96]	training's rmse: 0.773661	valid_1's rmse: 1.11217
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.09239	valid_1's rmse: 1.21257
[50]	training's rmse: 0.881723	valid_1's rmse: 1.07465
[75]	training's rmse: 0.820672	valid_1's rmse: 1.0512
[100]	training's rmse: 0.78257	valid_1's rmse: 1.04488
[125]	training's rmse: 0.753443	valid_1's rmse: 1.04311
[150]	training's rmse: 0.733083	valid_1's rmse: 1.04351
[175]	training's rmse: 0.7191	valid_1's rmse: 1.04278
[200]	training's rmse: 0.708292	valid_1's rmse: 1.04344
[225]	training's rmse: 0.698643	valid_1's rmse: 1.0437
Early stopping, be

In [0]:
del features, target
gc.collect()

27

In [0]:
test_df = pd.read_csv('/content/drive/My Drive/Ashrae data/test.csv')

In [0]:
row_ids = test_df["row_id"]
test_df.drop("row_id", axis=1, inplace=True)

In [0]:
test_df.head()

Unnamed: 0,building_id,meter,timestamp
0,0,0,2017-01-01 00:00:00
1,1,0,2017-01-01 00:00:00
2,2,0,2017-01-01 00:00:00
3,3,0,2017-01-01 00:00:00
4,4,0,2017-01-01 00:00:00


In [0]:
test_df = test_df.merge(building_df,left_on='building_id',right_on='building_id',how='left')
del building_df
gc.collect()

11

In [0]:
weather_df_test = pd.read_csv('/content/drive/My Drive/Ashrae data/weather_test.csv')
weather_df_test = fill_weather_dataset(weather_df_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [0]:
weather_df_test.head()

Unnamed: 0,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,timestamp,wind_direction,wind_speed
0,0,17.8,4.0,11.7,0.282609,1021.4,2017-01-01 00:00:00,100.0,3.6
1,0,17.8,2.0,12.8,0.0,1022.0,2017-01-01 01:00:00,130.0,3.1
2,0,16.1,0.0,12.8,0.0,1021.9,2017-01-01 02:00:00,140.0,3.1
3,0,17.2,0.0,13.3,0.0,1022.2,2017-01-01 03:00:00,140.0,3.1
4,0,16.7,2.0,13.3,0.0,1022.3,2017-01-01 04:00:00,130.0,2.6


In [0]:
test_df = test_df.merge(weather_df_test,how='left',on=['timestamp','site_id'])
del weather_df_test


In [0]:
test_df = features_engineering(test_df)

In [0]:
test_df.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,0,0,2017-01-01,0,0,8.913685,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
1,1,0,2017-01-01,0,0,7.908755,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
2,2,0,2017-01-01,0,0,8.589886,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
3,3,0,2017-01-01,0,0,10.072639,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
4,4,0,2017-01-01,0,0,11.666573,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0


In [0]:
test_df.to_csv('/content/drive/My Drive/Ashrae data/test_tidy_temp.csv')

---

In [0]:
test_df1 = pd.read_csv("C:\\Users\\Sanjan\\Desktop\\Ashrae data\\test_tidy_temp.csv")

---

In [0]:
test_df1.head()

Unnamed: 0.1,Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,0,0,0,2017-01-01 00:00:00,0,0,8.913685,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
1,1,1,0,2017-01-01 00:00:00,0,0,7.908755,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
2,2,2,0,2017-01-01 00:00:00,0,0,8.589886,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
3,3,3,0,2017-01-01 00:00:00,0,0,10.072639,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
4,4,4,0,2017-01-01 00:00:00,0,0,11.666573,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0


In [0]:
test_df_final = test_df1.drop(columns=["timestamp","Unnamed: 0"])
#test_df_final = test_df1.drop(columns="Unnamed: 0")

In [0]:
test_df_final.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,0,0,0,0,8.913685,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
1,1,0,0,0,7.908755,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
2,2,0,0,0,8.589886,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
3,3,0,0,0,10.072639,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0
4,4,0,0,0,11.666573,17.8,4.0,11.7,0.282609,1021.4,100.0,3.6,0,6,0


In [0]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(test_df_final, num_iteration=model.best_iteration)) / len(models)
    else:
        results += np.expm1(model.predict(test_df_final, num_iteration=model.best_iteration)) / len(models)
    del model
    gc.collect()

MemoryError: 

In [0]:
del test_df, models
gc.collect()

In [0]:
results_df = pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(results, 0, a_max=None)})
del row_ids,results
gc.collect()

In [0]:
results_df.head(20)

In [0]:
---

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
features.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
0,105,0,1,0,10.832181,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
1,106,0,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
2,106,3,1,0,8.589514,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
3,107,0,1,0,11.487946,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1
4,108,0,1,0,11.309352,3.8,0.0,2.4,0.0,1020.9,240.0,3.1,0,4,1


In [0]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [0]:
categorical_features = ["building_id", "site_id", "meter", "primary_use", "is_holiday", "weekend"]
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

kf = KFold(n_splits=2)
models = []
for train_index,test_index in kf.split(X_train):
    train_features = X_train.loc[train_index]
    train_target = y_train.loc[train_index]
    
    test_features = X_train.loc[test_index]
    test_target = y_train.loc[test_index]
    
    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)
    
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=25)
    models.append(model)
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  from ipykernel import kernelapp as app
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  app.launch_new_instance()
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://panda

Training until validation scores don't improve for 25 rounds
[25]	training's rmse: 1.00587	valid_1's rmse: 1.31641
[50]	training's rmse: 0.671074	valid_1's rmse: 1.13378
[75]	training's rmse: 0.600446	valid_1's rmse: 1.09624
[100]	training's rmse: 0.56345	valid_1's rmse: 1.07481
[125]	training's rmse: 0.539268	valid_1's rmse: 1.06359
[150]	training's rmse: 0.521301	valid_1's rmse: 1.05889
[175]	training's rmse: 0.507554	valid_1's rmse: 1.05715
[200]	training's rmse: 0.497924	valid_1's rmse: 1.05653
[225]	training's rmse: 0.489873	valid_1's rmse: 1.05625
[250]	training's rmse: 0.482602	valid_1's rmse: 1.05577
[275]	training's rmse: 0.476801	valid_1's rmse: 1.05538
Early stopping, best iteration is:
[273]	training's rmse: 0.477337	valid_1's rmse: 1.05537
Training until validation scores don't improve for 25 rounds
[25]	training's rmse: 0.990594	valid_1's rmse: 1.29745
[50]	training's rmse: 0.650505	valid_1's rmse: 1.10654
[75]	training's rmse: 0.578102	valid_1's rmse: 1.07649
[100]	train

In [0]:
results = []
for model in models:
    if  results == []:
        results = np.expm1(model.predict(X_test, num_iteration=model.best_iteration)) / len(models)
    else:
        results += np.expm1(model.predict(X_test, num_iteration=model.best_iteration)) / len(models)
    #del model
    gc.collect()

  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
ypred = model.predict(X_test)

In [0]:
X_test.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,weekend,is_holiday
8019080,1157,1,13,0,10.521588,18.9,2.6,16.1,8.0,1013.6,137.727273,2.1,21,4,0
4188120,1400,1,15,4,9.960293,15.0,4.0,4.4,-1.0,1012.0,160.0,8.2,16,3,0
1871015,410,0,3,0,11.068418,2.8,5.230769,-3.3,-1.0,1022.4,330.0,9.3,13,4,0
9994823,246,3,2,6,12.218179,41.1,2.0,7.2,0.0,1006.7,280.0,5.1,1,5,0
4935167,187,1,2,0,10.696571,26.1,6.5,5.0,0.0,1008.3,330.0,4.1,4,4,0


In [0]:
ypred1 = pd.DataFrame(ypred)

In [0]:
results_df = pd.DataFrame(np.clip(results, 0, a_max=None))
#del row_ids,results
gc.collect()

150

In [0]:
results

array([ 101.81443961,   20.16503409,   84.23674461, ...,   15.8572591 ,
         91.45562228, 1032.99685354])

In [0]:
pd.DataFrame(y_test.head(10))

Unnamed: 0,meter_reading
8019080,4.952491
4188120,2.853898
1871015,4.460722
9994823,4.454238
4935167,2.482621
14844812,4.541911
5191104,3.610918
19693720,0.255882
7131039,4.945207
12976929,4.184181


In [0]:
results_df.head(20)

Unnamed: 0,0
0,101.81444
1,20.165034
2,84.236745
3,91.553787
4,3.35734
5,84.069351
6,34.922003
7,3.442654
8,93.987892
9,48.600104


In [0]:
ypred1.head(10)

Unnamed: 0,0
0,4.229118
1,3.18498
2,4.334928
3,4.656039
4,0.965189
5,4.444804
6,3.612776
7,0.161807
8,4.590721
9,3.728169


In [0]:
y_test.head(10)

8019080     4.952491
4188120     2.853898
1871015     4.460722
9994823     4.454238
4935167     2.482621
14844812    4.541911
5191104     3.610918
19693720    0.255882
7131039     4.945207
12976929    4.184181
Name: meter_reading, dtype: float64

In [0]:
from sklearn.metrics import mean_squared_error

In [0]:
print("RMSE for LGBM ",mean_squared_error(y_test, ypred1))
print("MAE for LGBM ",mean_absolute_error(y_test,ypred1))

RMSE for LGBM  1.2141314687001845
MAE for LGBM  0.5525333636964069


In [0]:
metadata = pd.read_csv("C:\\Users\\Sanjan\\Desktop\\Ashrae data\\building_metadata.csv")

In [0]:
metadata.head(10)

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,
5,0,5,Education,8000,2000.0,
6,0,6,Lodging/residential,27926,1981.0,
7,0,7,Education,121074,1989.0,
8,0,8,Education,60809,2003.0,
9,0,9,Office,27000,2010.0,


In [0]:
weather = pd.read_csv("C:\\Users\\Sanjan\\Desktop\\Ashrae data\\weather_train.csv")

In [0]:
weather.head(10)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6
5,0,2016-01-01 05:00:00,19.4,,19.4,0.0,,0.0,0.0
6,0,2016-01-01 06:00:00,21.1,6.0,21.1,-1.0,1019.4,0.0,0.0
7,0,2016-01-01 07:00:00,21.1,,21.1,0.0,1018.8,210.0,1.5
8,0,2016-01-01 08:00:00,20.6,,20.0,0.0,1018.1,0.0,0.0
9,0,2016-01-01 09:00:00,21.1,,20.6,0.0,1019.0,290.0,1.5


In [0]:
train = pd.read_csv("C:\\Users\\Sanjan\\Desktop\\Ashrae data\\train.csv")

In [0]:
train.head(10)

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0
5,5,0,2016-01-01 00:00:00,0.0
6,6,0,2016-01-01 00:00:00,0.0
7,7,0,2016-01-01 00:00:00,0.0
8,8,0,2016-01-01 00:00:00,0.0
9,9,0,2016-01-01 00:00:00,0.0


In [0]:
from sklearn.linear_model import LinearRegression

In [0]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,mean_squared_log_error

In [0]:
reg = LinearRegression().fit(X_train, y_train)

In [0]:
linear_reg_pred = reg.predict(X_test)

In [0]:
print("RMSE using Linear Regression",mean_squared_error(y_test,linear_reg_pred))
print("MAE using Linear Regression",mean_absolute_error(y_test,linear_reg_pred))

RMSE using Linear Regression 3.386148944672328
MAE using Linear Regression 1.3331572009259258


In [0]:
from sklearn.tree import DecisionTreeRegressor

In [0]:
regfit = DecisionTreeRegressor().fit(X_train,y_train)

In [0]:
dt_pred = regfit.predict(X_test)

In [0]:
print("RMSE using Decision Tree Regressor",mean_squared_error(y_test,dt_pred))
print("MAE using Decision Tree Regressor",mean_absolute_error(y_test,dt_pred))

RMSE using Decision Tree Regressor 0.7987171738338351
MAE using Decision Tree Regressor 0.3010360468774003
