In [0]:
#Kaggle link: https://www.kaggle.com/c/ashrae-energy-prediction
import zipfile
import pandas as pd
with zipfile.ZipFile("/content/sample_data/ashrae-energy-prediction-final-test.zip") as z:
    with z.open("ashrae-energy-prediction/building_metadata.csv") as f:
        building_meta = pd.read_csv(f)
    with z.open("ashrae-energy-prediction/train.csv") as f:
        train_data = pd.read_csv(f)
    with z.open("ashrae-energy-prediction/weather_train.csv") as f:
        weather_train = pd.read_csv(f)
    with z.open("ashrae-energy-prediction/weather_test.csv") as f:
        weather_test = pd.read_csv(f)
    with z.open("ashrae-energy-prediction/test.csv") as f:
        test_data = pd.read_csv(f)

In [0]:
#weather_train.isnull().sum()

In [0]:
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'])

In [0]:
for site in weather_train['site_id'].unique():
  #site_weather = weather_train[weather_train['site_id'] == site]
  weather_train[weather_train['site_id'] == site] = weather_train[weather_train['site_id'] == site].interpolate(method="linear")

In [0]:
weather_train_interpolated = pd.DataFrame([])

In [0]:
startDate = pd.to_datetime("2016-01-01 00:00:00")
endDate = pd.to_datetime("2016-12-31 23:00:00")

In [0]:
for site in weather_train['site_id'].unique():
  site_weather = weather_train[weather_train['site_id'] == site]
  site_weather = site_weather.set_index('timestamp')
  site_weather_reindexed = site_weather.resample("1H")
  #new_index = pd.date_range(startDate,endDate,freq="1H")
  #site_weather_reindexed = site_weather.reindex(new_index)
  site_weather_reindexed = site_weather_reindexed.interpolate(method="linear")
  weather_train_interpolated = weather_train_interpolated.append(site_weather_reindexed)

In [0]:
weather_train_interpolated = weather_train_interpolated.reset_index()

In [0]:
#weather_train_interpolated.isnull().sum()

In [0]:
#Filling missing values for sea_level_pressure with mean and precip_depth_1_hr, cloud_coverage with mode
weather_train_interpolated.loc[(weather_train_interpolated['sea_level_pressure'].isnull()), 'sea_level_pressure'] = weather_train_interpolated['sea_level_pressure'].mean()
#weather_train_interpolated.loc[(weather_train_interpolated['precip_depth_1_hr'].isnull()), 'precip_depth_1_hr'] = 0.0
weather_train_interpolated.loc[(weather_train_interpolated['cloud_coverage'].isnull()), 'cloud_coverage'] = 0.0

In [0]:
#weather_train_interpolated.isnull().sum()

In [0]:
#building_meta.isnull().sum()

In [0]:
building_meta.loc[(building_meta['floor_count'].isnull()), 'floor_count'] = 1
building_meta.loc[(building_meta['year_built'].isnull()), 'year_built'] = 1976.0

In [0]:
#Creating new feature called total area based on floor count and square feet
building_meta['total_area'] = building_meta['floor_count'] * building_meta['square_feet']

In [0]:
#Dropping square feet and floor count column
building_meta.drop(columns=['square_feet','floor_count'],inplace=True)

In [0]:
#train_data.isnull().sum()

In [0]:
#building_meta = pd.get_dummies(building_meta, columns=['primary_use'])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
building_meta['primary_use']= label_encoder.fit_transform(building_meta['primary_use'])

In [0]:
merged_train_building = pd.merge(train_data, building_meta, on='building_id')

In [0]:
#merged_train_building.head()

In [0]:
#merged_train_building.dtypes
merged_train_building['timestamp'] = pd.to_datetime(merged_train_building['timestamp'])

In [0]:
#merged_train_building.head()

In [0]:
#weather_train_interpolated.head()

In [0]:
#merged_train_building.shape

In [0]:
#weather_train_interpolated.shape

In [0]:
train = pd.merge(merged_train_building, weather_train_interpolated, on=['site_id','timestamp'])

In [0]:
site_zero_buildings = train[train['site_id'] == 0]['building_id'].values.flatten().tolist()

In [0]:
kbtu_conversion_factor = 0.293071
train.loc[(train['building_id'].isin(site_zero_buildings)) & (train['meter'] == 0),'meter_reading'] = train.loc[(train['building_id'].isin(site_zero_buildings)) & (train['meter'] == 0),'meter_reading']*kbtu_conversion_factor

In [0]:
train['weekday'] = train['timestamp'].dt.weekday

In [0]:
train['weekday'] = train['weekday'].isin([0,1,2,3,4])

In [0]:
#Extracting hour from timestamp
train['hour'] = train['timestamp'].dt.hour

In [0]:
#Extracting month from timestamp
train['month'] = train['timestamp'].dt.month

In [0]:
#Extracting year from timestamp
train['year'] = train['timestamp'].dt.year

In [0]:
#Creating new feature called building age based on this previous year and the year which building was build
train['building_age'] = train['year'] - train['year_built']

In [0]:
#Drop unwanted columns related to year
#train = train.drop(columns=['year_built', 'year'])
del train['year_built']
del train['year']

**Data Normalization**

In [0]:
from sklearn.preprocessing import PowerTransformer
at_pt = PowerTransformer(method='yeo-johnson')
train['air_temperature'] = at_pt.fit_transform(train[['air_temperature']])
train['air_temperature'].skew()

-0.15778677535064728

In [0]:
cc_pt = PowerTransformer(method='yeo-johnson')
train['cloud_coverage'] = cc_pt.fit_transform(train[['cloud_coverage']])
train['cloud_coverage'].skew()

-0.003063201517380995

In [0]:
# dew_pt = PowerTransformer(method='yeo-johnson')
# train['dew_temperature'] = dew_pt.fit_transform(train[['dew_temperature']])
# print(train['dew_temperature'].skew())

In [0]:
# pre_pt = PowerTransformer(method='yeo-johnson')
# train['precip_depth_1_hr'] = pre_pt.fit_transform(train[['precip_depth_1_hr']])
# print(train['precip_depth_1_hr'].skew())

In [0]:
ws_pt = PowerTransformer(method='yeo-johnson')
train['wind_speed'] = ws_pt.fit_transform(train[['wind_speed']])
print(train['wind_speed'].skew())

-0.021589399707228635


In [0]:
ta_pt = PowerTransformer(method='yeo-johnson')
train['total_area'] = ta_pt.fit_transform(train[['total_area']])
print(train['total_area'].skew())

0.01976005042213119


In [0]:
ba_pt = PowerTransformer(method='yeo-johnson')
train['building_age'] = ba_pt.fit_transform(train[['building_age']])
print(train['building_age'].skew())

0.24334356637168034


In [0]:
mr_pt = PowerTransformer(method='yeo-johnson')
train['meter_reading'] = mr_pt.fit_transform(train[['meter_reading']])
print(train['meter_reading'].skew())

0.0025713174838154786


In [0]:
#train.head()

In [0]:
#One hot encoding
#train = pd.get_dummies(train, columns=['primary_use'])

In [0]:
#train.head()

In [0]:
target = train['meter_reading']
del train['meter_reading']

In [0]:
#Splitting
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.05, random_state=9)
print(X_train.shape)
print(X_test.shape)

(19204236, 17)
(1010750, 17)


In [0]:
def df_clean(df):
 #cols = ['meter', 'total_area', 'air_temperature', 'cloud_coverage', 'dew_temperature',  'precip_depth_1_hr',  'sea_level_pressure', 'wind_direction', 'wind_speed', 'weekday',  'hour', 'month',  'building_age', 'primary_use_Education',  'primary_use_Entertainment/public assembly',  'primary_use_Food sales and service', 'primary_use_Healthcare', 'primary_use_Lodging/residential',  'primary_use_Manufacturing/industrial', 'primary_use_Office', 'primary_use_Other',  'primary_use_Parking',  'primary_use_Public services',  'primary_use_Religious worship',  'primary_use_Retail', 'primary_use_Services', 'primary_use_Technology/science', 'primary_use_Utility',  'primary_use_Warehouse/storage']
 cols = ['meter', 'total_area', 'air_temperature', 'cloud_coverage', 'sea_level_pressure', 'wind_speed', 'weekday',  'hour', 'month',  'building_age', 'primary_use']
 #df_copy = df.copy()
 #df_copy = df_copy[cols]
 return df[cols]

In [0]:
X_train_model = df_clean(X_train)
X_test_model = df_clean(X_test)

In [0]:
# def rmsle(expected, predicted):
#    return np.sqrt(np.mean(np.square(np.log(predicted + 1) - np.log(expected + 1))))

In [0]:
#Training - Random Forest
from sklearn.ensemble import RandomForestRegressor
lr = RandomForestRegressor(max_features=4, min_samples_split=4, n_estimators=10, min_samples_leaf=2,n_jobs = -1)
lr.fit(X_train_model, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=4, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=4,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [0]:
# train_pred = lr.predict(X_train_model)
# X_train['meter_reading_predicted'] = train_pred

In [0]:
# X_train['meter_reading_predicted'] = mr_pt.inverse_transform(X_train['meter_reading_predicted'].values.reshape([-1,1]))

In [0]:
# X_train.head()

In [0]:
# X_train.loc[(X_train['building_id'].isin(site_zero_buildings)) & (X_train['meter'] == 0),'meter_reading_predicted'] = X_train.loc[(X_train['building_id'].isin(site_zero_buildings)) & (X_train['meter'] == 0),'meter_reading_predicted']/kbtu_conversion_factor

In [0]:
# X_train['meter_reading'] = mr_pt.inverse_transform(y_train.values.reshape([-1,1]))

In [0]:
# X_train.loc[(X_train['building_id'].isin(site_zero_buildings)) & (X_train['meter'] == 0),'meter_reading'] = X_train.loc[(X_train['building_id'].isin(site_zero_buildings)) & (X_train['meter'] == 0),'meter_reading']/kbtu_conversion_factor

In [0]:
# import numpy as np
# print('RMSLE:', rmsle(X_train['meter_reading'], X_train['meter_reading_predicted']))

In [0]:
# train_pred = lr.predict(X_test_model)
# X_test['meter_reading_predicted'] = train_pred

In [0]:
# X_test['meter_reading_predicted'] = mr_pt.inverse_transform(X_test['meter_reading_predicted'].values.reshape([-1,1]))

In [0]:
# X_test.loc[(X_test['building_id'].isin(site_zero_buildings)) & (X_test['meter'] == 0),'meter_reading_predicted'] = X_test.loc[(X_test['building_id'].isin(site_zero_buildings)) & (X_test['meter'] == 0),'meter_reading_predicted']/kbtu_conversion_factor

In [0]:
# X_test['meter_reading'] = mr_pt.inverse_transform(y_test.values.reshape([-1,1]))

In [0]:
# X_test.loc[(X_test['building_id'].isin(site_zero_buildings)) & (X_test['meter'] == 0),'meter_reading'] = X_test.loc[(X_test['building_id'].isin(site_zero_buildings)) & (X_test['meter'] == 0),'meter_reading']/kbtu_conversion_factor

In [0]:
# print('Test RMSLE:', rmsle(X_test['meter_reading'], X_test['meter_reading_predicted']))

In [0]:
# import joblib
# joblib.dump([lr,at_pt,cc_pt,dew_pt,pre_pt,ws_pt,ta_pt,ba_pt,mr_pt],'lr_model.pkl')

In [0]:
# test_data.head()

In [0]:
# weather_test.head()

In [0]:
weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp'])
test_data['timestamp'] = pd.to_datetime(test_data['timestamp'])

In [0]:
for site in weather_test['site_id'].unique():
  weather_test[weather_test['site_id'] == site] = weather_test[weather_test['site_id'] == site].interpolate(method="linear")

In [0]:
# weather_test.isnull().sum()

In [0]:
#Filling missing values for sea_level_pressure with mean and precip_depth_1_hr, cloud_coverage with mode
weather_test.loc[(weather_test['sea_level_pressure'].isnull()), 'sea_level_pressure'] = weather_test['sea_level_pressure'].mean()
#weather_test.loc[(weather_test['precip_depth_1_hr'].isnull()), 'precip_depth_1_hr'] = 0.0
weather_test.loc[(weather_test['cloud_coverage'].isnull()), 'cloud_coverage'] = 0.0

In [0]:
# test_data.shape

In [0]:
merged_test_building = pd.merge(test_data, building_meta, on='building_id')

In [0]:
# merged_test_building.shape

In [0]:
test_final = pd.merge(merged_test_building, weather_test, on=['site_id','timestamp'],how="left")

In [0]:
# test_final.shape

In [0]:
# test_final.isnull().sum()

In [0]:
test_final.loc[(test_final['air_temperature'].isnull()), 'air_temperature'] = test_final['air_temperature'].mean()
test_final.loc[(test_final['cloud_coverage'].isnull()), 'cloud_coverage'] = test_final['cloud_coverage'].mean()
#test_final.loc[(test_final['dew_temperature'].isnull()), 'dew_temperature'] = test_final['dew_temperature'].mean()
#test_final.loc[(test_final['precip_depth_1_hr'].isnull()), 'precip_depth_1_hr'] = 0
test_final.loc[(test_final['sea_level_pressure'].isnull()), 'sea_level_pressure'] = test_final['sea_level_pressure'].mean()
#test_final.loc[(test_final['wind_direction'].isnull()), 'wind_direction'] = test_final['wind_direction'].mean()
test_final.loc[(test_final['wind_speed'].isnull()), 'wind_speed'] = test_final['wind_speed'].mean()

In [0]:
test_final['weekday'] = test_final['timestamp'].dt.weekday

In [0]:
test_final['weekday'] = test_final['weekday'].isin([0,1,2,3,4])

In [0]:
test_final['hour'] = test_final['timestamp'].dt.hour

In [0]:
test_final['month'] = test_final['timestamp'].dt.month

In [0]:
test_final['year'] = test_final['timestamp'].dt.year

In [0]:
test_final['building_age'] = test_final['year'] - test_final['year_built']

In [0]:
del test_final['year_built']
del test_final['year']

In [0]:
test_final['air_temperature'] = at_pt.transform(test_final[['air_temperature']])

In [0]:
test_final['cloud_coverage'] = cc_pt.transform(test_final[['cloud_coverage']])

In [0]:
#['dew_temperature'] = dew_pt.transform(test_final[['dew_temperature']])

In [0]:
#test_final['precip_depth_1_hr'] = pre_pt.transform(test_final[['precip_depth_1_hr']])

In [0]:
test_final['wind_speed'] = ws_pt.transform(test_final[['wind_speed']])

In [0]:
test_final['total_area'] = ta_pt.transform(test_final[['total_area']])

In [0]:
test_final['building_age'] = ba_pt.transform(test_final[['building_age']])

In [0]:
x_test_final_model = df_clean(test_final)

In [0]:
#test_final_pred = lr.predict(x_test_final_model)
#test_final['meter_reading'] = test_final_pred

In [0]:
small_count = int(len(x_test_final_model)/10)
test_final_latest = pd.DataFrame([])
for i in range(10):
  #test_final_pred = lr.predict(x_test_final_model[small_count*i:small_count*(i+1)])
  test_final_latest = test_final_latest.append(pd.DataFrame(lr.predict(x_test_final_model[small_count*i:small_count*(i+1)])))

In [0]:
test_final['meter_reading'] = test_final_latest.values.flatten()

In [0]:
test_final['meter_reading'] = mr_pt.inverse_transform(test_final['meter_reading'].values.reshape([-1,1]))

In [0]:
#test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading'] = test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading']/kbtu_conversion_factor

In [0]:
del test_final['total_area']
del test_final['air_temperature']
del test_final['cloud_coverage']
#del test_final['dew_temperature']
#del test_final['precip_depth_1_hr']
del test_final['sea_level_pressure']
#del test_final['wind_direction']
del test_final['wind_speed']
del test_final['weekday']
del test_final['hour']
del test_final['month']
del test_final['building_age']
del test_final['primary_use']
# del test_final['primary_use_Entertainment/public assembly']
# del test_final['primary_use_Food sales and service']
# del test_final['primary_use_Healthcare']
# del test_final['primary_use_Lodging/residential']
# del test_final['primary_use_Manufacturing/industrial']
# del test_final['primary_use_Office']
# del test_final['primary_use_Other']
# del test_final['primary_use_Parking']
# del test_final['primary_use_Public']
# del test_final['primary_use_Religious worship']
# del test_final['primary_use_Retail']
# #del test_final['primary_use_Services']
# del test_final['primary_use_Technology/science']
# del test_final['primary_use_Utility']
# del test_final['primary_use_Warehouse/storage']


In [0]:
#del test_final['primary_use_Public services']

In [0]:
#test_final.head()

In [0]:
test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading'] = test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading']/kbtu_conversion_factor

In [0]:
#test_final.to_csv("train_final.csv",index=False)

In [0]:
submission_df = test_final[['row_id','meter_reading']]
submission_df = submission_df.sort_values(by=["row_id"])
#test_final.to_csv("submission.csv",index=False)

In [0]:
submission_df.to_csv("submission_rf.csv",index=False)

In [0]:
#test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading'] = test_final.loc[(test_final['building_id'].isin(site_zero_buildings)) & (test_final['meter'] == 0),'meter_reading']/kbtu_conversion_factor

In [0]:
#store = pd.read_csv('/content/submission.csv')

In [0]:
#store.shape

In [0]:
#Finally got the submission data with exact rows