In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import gc
from numpy import percentile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
import pickle
import lightgbm as lgb
from zipfile import ZipFile
from IPython.display import clear_output
import pdb

In [2]:
# Unpack every member of the competition archive into the 'energy_prediction'
# directory (relative to the kernel's current working directory).
with ZipFile('/content/drive/MyDrive/ashrae-energy-prediction.zip', mode='r') as archive:
    archive.extractall(path='energy_prediction')

In [3]:
# Load the extracted CSV files into DataFrames.
# NOTE(review): the frames named `test` and `weather_test` are read from
# train.csv and weather_train.csv respectively -- confirm this is intended.
# NOTE(review): final_func_1 / final_func_2 below drop a `row_id` column from
# the merged frame; verify that the file loaded into `test` provides one.
# NOTE(review): these absolute paths only match the extraction cell above when
# the kernel's working directory is /content -- confirm.
test=pd.read_csv('/content/energy_prediction/train.csv')
building_metadata=pd.read_csv('/content/energy_prediction/building_metadata.csv')
weather_test=pd.read_csv('/content/energy_prediction/weather_train.csv')

In [4]:
# Restore the trained lgbm model that final_func_1 / final_func_2 read as the
# notebook-level name `lgbm_model`.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load model files that come from a trusted source.
with open('/content/drive/MyDrive/lgbm_model.pkl', mode='rb') as model_file:
    lgbm_model = pickle.load(model_file)

In [38]:
def final_func_1(test, building_metadata, weather_test, model=None):
  '''Build the feature matrix from the three raw frames and return predictions.

  Parameters
  ----------
  test : pandas.DataFrame
      One row per requested prediction. Must contain `building_id` and
      `timestamp`; a `row_id` column is dropped when present.
  building_metadata : pandas.DataFrame
      Joined to `test` on `building_id`. Must contain `site_id`,
      `square_feet` and `primary_use`.
  weather_test : pandas.DataFrame
      Joined on (`site_id`, `timestamp`) after its timestamps have been
      aligned per site. Must contain `air_temperature`.
  model : object with a `predict(DataFrame)` method, optional
      Defaults to the notebook-level `lgbm_model`.

  Returns
  -------
  list
      Predictions rounded to 4 decimals, one per row of the merged frame
      (same order as `test` when the weather keys are unique).

  Notes
  -----
  None of the input frames is modified, so the function can be called
  repeatedly with the same objects and gives the same answer each time.
  Dummy columns are only created for `primary_use` / season values that
  occur in the input. NOTE(review): confirm the resulting columns match the
  ones the model was trained on.
  '''

  def align_weather_timestamps(weather, reference_peak_hour=14):
    '''Return a copy of `weather` whose timestamps are shifted, per site, so
    that the hour with the highest mean daily temperature rank falls on
    `reference_peak_hour`.

    The default of 14 follows the notebook's original assumption that sites
    1, 5 and 12 carry correct timestamps and peak at 14:00.
    '''
    if weather.empty:
      return weather.copy()

    # Work on an explicit copy so the rank column never touches `weather`.
    temps = weather[['site_id', 'timestamp', 'air_temperature']].copy()

    # Rank the hourly temperatures within each (site, calendar day) chunk.
    temps['temp_rank'] = (
        temps.groupby(['site_id', temps['timestamp'].dt.date])['air_temperature']
        .rank(method='average')
    )

    # site_id x hour-of-day table holding the mean rank of that hour.
    hourly_rank = (
        temps.groupby(['site_id', temps['timestamp'].dt.hour])['temp_rank']
        .mean()
        .unstack(level=1)
    )

    # Sites without a single usable temperature cannot be aligned.
    hourly_rank = hourly_rank.dropna(how='all')
    if hourly_rank.empty:
      return weather.copy()

    # idxmax yields the hour *label* of the peak and keeps site_id as the
    # index, so the lookup below is by site id rather than by row position.
    site_offsets = hourly_rank.idxmax(axis=1) - reference_peak_hour

    # Sites with no offset are left unshifted.
    hours_to_shift = weather['site_id'].map(site_offsets).fillna(0)
    shifted = weather['timestamp'] - pd.to_timedelta(hours_to_shift, unit='h')
    return weather.assign(timestamp=shifted)

  if model is None:
    # Fall back to the model loaded earlier in the notebook.
    model = lgbm_model

  # Attach the building metadata to every test row (row order of `test` is kept).
  features = building_metadata.merge(test, on='building_id', how='right')

  # Parse timestamps on new frames only; the caller's frames stay untouched.
  # (pandas >= 2.0 infers a consistent format by default.)
  features['timestamp'] = pd.to_datetime(features['timestamp'])
  weather = weather_test.assign(timestamp=pd.to_datetime(weather_test['timestamp']))
  weather = align_weather_timestamps(weather)

  # Attach the aligned weather readings.
  features = features.merge(weather, on=['site_id', 'timestamp'], how='left')

  # Remove features with many NaN values and unimportant features; columns
  # that are already absent (e.g. no `row_id`) are simply skipped.
  features = features.drop(
      columns=['cloud_coverage', 'precip_depth_1_hr', 'floor_count', 'year_built',
               'site_id', 'wind_direction', 'sea_level_pressure', 'row_id'],
      errors='ignore',
  )

  # Log-transform the building size.
  features['square_feet'] = np.log1p(features['square_feet'])

  # Calendar features. The column creation order below fixes the column
  # order of the matrix handed to the model, so it must not be rearranged.
  stamp = features['timestamp']
  features['Month'] = stamp.dt.month
  features['Day'] = stamp.dt.day
  features['Hour'] = stamp.dt.hour

  # One-hot encode primary_use.
  features = pd.get_dummies(features, columns=['primary_use'])

  features['day_of_week'] = features['timestamp'].dt.dayofweek.astype(np.uint8)

  # Meteorological seasons; every month outside 3..11 falls back to 'Winter'.
  month = features['Month']
  features['season'] = np.select(
      [month.isin([3, 4, 5]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
      ['Spring', 'Summer', 'Autumn'],
      default='Winter',
  )

  # One-hot encode season.
  features = pd.get_dummies(features, columns=['season'])

  features = features.drop(columns=['timestamp'])

  # Predict and round to 4 decimals.
  test_pred = np.round(model.predict(features), 4)

  return list(test_pred)



In [39]:
def final_func_2(test, building_metadata, weather_test, y_actual):
  '''Return the RMSLE of the model's predictions against `y_actual`.

  Parameters
  ----------
  test, building_metadata, weather_test : pandas.DataFrame
      Raw frames, passed on to `final_func_1`, which builds the features
      and returns the predictions rounded to 4 decimals.
  y_actual : array-like
      Target values on their original scale, one per prediction.

  Returns
  -------
  float
      Root mean squared error between `log1p(y_actual)` (rounded to 4
      decimals) and the predictions. The predictions are used as returned,
      i.e. they are treated as already being on the log1p scale.
  '''
  # Reuse the prediction pipeline instead of keeping a second copy of it.
  # The weather frame is passed as a copy so the caller's frame is never
  # modified, whatever final_func_1 does with the frame it receives.
  test_pred = final_func_1(test, building_metadata, weather_test.copy())

  # Convert the target variable to logarithmic values.
  y_act = np.round(np.log1p(y_actual), 4)

  # RMSLE = sqrt(MSE) on the log scale. The square root is taken explicitly
  # because the `squared=False` argument of mean_squared_error is not
  # available in recent scikit-learn releases.
  RMSLE = np.sqrt(MSE(y_act, test_pred))

  return RMSLE
