In [1]:
import os
os.chdir(os.path.dirname(os.getcwd()))

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
# min max scaling
from sklearn.preprocessing import MinMaxScaler
# standard scaler
from sklearn.preprocessing import StandardScaler

In [3]:
# list paths in 'data/citylearn_challenge_2022_phase_1/' that start with Build
# and end with .csv
files = [f for f in os.listdir('data/citylearn_challenge_2022_phase_1/') if f.startswith('Build') and f.endswith('.csv')]
files

['Building_4.csv',
 'Building_5.csv',
 'Building_1.csv',
 'Building_2.csv',
 'Building_3.csv']

In [4]:
weather = pd.read_csv('data/citylearn_challenge_2022_phase_1/weather.csv')
weather

Unnamed: 0,Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],6h Prediction Outdoor Drybulb Temperature [C],12h Prediction Outdoor Drybulb Temperature [C],24h Prediction Outdoor Drybulb Temperature [C],6h Prediction Relative Humidity [%],12h Prediction Relative Humidity [%],24h Prediction Relative Humidity [%],6h Prediction Diffuse Solar Radiation [W/m2],12h Prediction Diffuse Solar Radiation [W/m2],24h Prediction Diffuse Solar Radiation [W/m2],6h Prediction Direct Solar Radiation [W/m2],12h Prediction Direct Solar Radiation [W/m2],24h Prediction Direct Solar Radiation [W/m2]
0,20.0,84.0,0.0,0.0,18.3,22.8,20.0,81.0,68.0,81.0,25.0,964.0,0.0,100.0,815.0,0.0
1,20.1,79.0,0.0,0.0,19.4,22.8,19.4,79.0,71.0,87.0,201.0,966.0,0.0,444.0,747.0,0.0
2,19.7,78.0,0.0,0.0,21.1,22.2,19.4,73.0,73.0,87.0,420.0,683.0,0.0,592.0,291.0,0.0
3,19.3,78.0,0.0,0.0,22.2,22.8,19.4,71.0,71.0,90.0,554.0,522.0,0.0,491.0,153.0,0.0
4,18.9,78.0,0.0,0.0,21.7,22.2,18.9,73.0,71.0,90.0,778.0,444.0,0.0,734.0,174.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,20.6,84.0,26.0,130.0,20.1,19.4,20.6,79.0,79.0,73.0,0.0,201.0,27.0,0.0,444.0,143.0
8756,21.1,81.0,0.0,0.0,19.7,21.1,20.0,78.0,73.0,76.0,0.0,420.0,0.0,0.0,592.0,0.0
8757,21.7,79.0,0.0,0.0,19.3,22.2,20.6,78.0,71.0,70.0,0.0,554.0,0.0,0.0,491.0,0.0
8758,21.3,76.0,0.0,0.0,18.9,21.7,20.6,78.0,73.0,73.0,0.0,778.0,0.0,0.0,734.0,0.0


In [9]:
# loop through csv files and extract consumption and generation data
all_buildings = pd.DataFrame()
for file in files:
    df = pd.read_csv('data/citylearn_challenge_2022_phase_1/' + file)
    filename = file.split('.')[0].split('_')[1]
    df = pd.concat([df, weather], axis=1)
    df['day_year'] = df.index
    df['number'] = filename
    #df['weekday_hour'] = df['Day Type'].astype(int).astype(str) + "-" +  df['Hour'].astype(int).astype(str)
    df['Hour'] = df['Hour'] % 24
    df['Hour'] = df['Hour'].shift(-1)
    df["Month"] = df['Month'].shift(-1)

    # df["hour_x"] = np.cos(2*np.pi* df["Hour"] / 24)
    # df["hour_y"] = np.sin(2*np.pi* df["Hour"] / 24)
    
    # df["month_x"] = np.cos(2*np.pi* df["Month"] / (12))
    # df["month_y"] = np.sin(2*np.pi*df["Month"] / (12))

    # df["weekday_x"] = np.cos(2 * np.pi*df["Day Type"] / (7))
    # df["weekday_y"] = np.sin(2 * np.pi*df["Day Type"] / (7))
    
    # rename the equipment electric power column to cons_target
    df.drop(['Equipment Electric Power [kWh]', 'Daylight Savings Status','DHW Heating [kWh]',
       'Cooling Load [kWh]', 'Heating Load [kWh]'], axis=1, inplace=True)

    df['solar_target+1'] = df['Solar Generation [W/kW]'].shift(-1)
    #if the column name contains 'Radiation' or 'radiation', log transform
    for col in df.columns:
        if 'Radiation' in col or 'radiation' in col:
            df[col] = np.log1p(df[col])
    # same for cons_target
    df['diffuse_solar_radiation+1'] = df['12h Prediction Diffuse Solar Radiation [W/m2]'].shift(11)
    df['direct_solar_radiation+1'] = df['12h Prediction Direct Solar Radiation [W/m2]'].shift(11)
    df['relative_humidity+1'] = df['12h Prediction Relative Humidity [%]'].shift(11)
    df['drybulb_temp+1'] = df['12h Prediction Outdoor Drybulb Temperature [C]'].shift(11)

    # Remove column names containing the word 'Prediction'
    df = df[[col for col in df.columns if 'Prediction' not in col]]
        
    all_buildings = pd.concat([all_buildings, df], axis=0)
    print(len(all_buildings))
# drop columns with more than 1000 missing values
all_buildings = all_buildings[all_buildings.columns[all_buildings.isnull().mean() < 0.8]]
all_buildings.dropna(inplace=True)
all_buildings.sort_values(['number', 'day_year'], inplace=True)
all_buildings.reset_index(drop=True, inplace=True)
# convert hour and month to integers
all_buildings['Hour'] = all_buildings['Hour'].astype(int)
all_buildings['Month'] = all_buildings['Month'].astype(int)
all_buildings['Day Type'] = all_buildings['Day Type'].astype(int)
# rename columns ['Hour', 'Month', 'Day Type'] without capital letters
all_buildings.rename(columns={'Hour': 'hour', 'Month': 'month', 'Day Type': 'day_type'}, inplace=True)
all_buildings

8760
17520
26280
35040
43800


Unnamed: 0,month,hour,day_type,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],day_year,number,solar_target+1,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,8,12,1,712.750000,22.2,71.0,6.806829,6.699500,11,1,759.825000,6.872128,6.704414,68.0,22.8
1,8,13,1,759.825000,22.8,68.0,6.872128,6.704414,12,1,751.262500,6.874198,6.617403,71.0,22.8
2,8,14,1,751.262500,22.8,71.0,6.874198,6.617403,13,1,689.675000,6.527958,5.676754,73.0,22.2
3,8,15,1,689.675000,22.2,73.0,6.527958,5.676754,14,1,581.275000,6.259581,5.036953,71.0,22.8
4,8,16,1,581.275000,22.8,71.0,6.259581,5.036953,15,1,422.425000,6.098074,5.164786,71.0,22.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43735,7,19,1,31.179167,21.1,81.0,5.389072,5.669881,8754,5,4.070833,3.295837,4.875197,84.0,20.6
43736,7,20,1,4.070833,20.6,84.0,3.295837,4.875197,8755,5,0.000000,0.000000,0.000000,81.0,21.1
43737,7,21,1,0.000000,21.1,81.0,0.000000,0.000000,8756,5,0.000000,0.000000,0.000000,79.0,21.7
43738,7,22,1,0.000000,21.7,79.0,0.000000,0.000000,8757,5,0.000000,0.000000,0.000000,76.0,21.3


In [10]:
# remove rows with dayinyear 1 and 365
#all_buildings = all_buildings[all_buildings['dayinyear'] != 1]
#all_buildings = all_buildings[all_buildings['dayinyear'] != 365]
# save building 4 to test and 0,1,2,3 to train
all_buildings.to_csv('data/extra_solar_train.csv')

In [9]:
all_buildings[all_buildings['building'] != 4][['Month', 'Hour', 'Day Type', 'day_year','dayinyear','timestamp', 'number']]

Unnamed: 0,Month,Hour,Day Type,day_year,dayinyear,timestamp,number
2,8.0,0.0,2.0,48,2,2021-08-02 00:00:00,1
3,8.0,1.0,2.0,49,2,2021-08-02 01:00:00,1
4,8.0,2.0,2.0,50,2,2021-08-02 02:00:00,1
5,8.0,3.0,2.0,51,2,2021-08-02 03:00:00,1
6,8.0,4.0,2.0,52,2,2021-08-02 04:00:00,1
...,...,...,...,...,...,...,...
43653,7.0,19.0,7.0,8755,364,2022-07-30 19:00:00,5
43654,7.0,20.0,7.0,8756,364,2022-07-30 20:00:00,5
43655,7.0,21.0,7.0,8757,364,2022-07-30 21:00:00,5
43656,7.0,22.0,7.0,8758,364,2022-07-30 22:00:00,5


In [14]:
all_buildings.loc[all_buildings['number'] == '4']

Unnamed: 0,Month,Hour,cons_target,Solar Generation [W/kW],Outdoor Drybulb Temperature [C],Relative Humidity [%],Diffuse Solar Radiation [W/m2],Direct Solar Radiation [W/m2],day_year,number,...,net,net_target,net+1,net-23,net_target+1,net_target-23,diffuse_solar_radiation+1,direct_solar_radiation+1,relative_humidity+1,drybulb_temp+1
0,,,0.417676,0.000000,20.0,84.0,0.000000,0.000000,0,4,...,2.819150,0.743323,1.928167,,0.631288,,,,,
1,7.0,24.0,0.285671,0.000000,20.1,79.0,0.000000,0.000000,1,4,...,1.928167,0.631288,0.475817,,0.448667,,,,,
2,8.0,1.0,0.070495,0.000000,19.7,78.0,0.000000,0.000000,2,4,...,0.475817,0.448667,0.458233,,0.446456,,,,,
3,8.0,2.0,0.067890,0.000000,19.3,78.0,0.000000,0.000000,3,4,...,0.458233,0.446456,0.446017,,0.444920,,,,,
4,8.0,3.0,0.066080,0.000000,18.9,78.0,0.000000,0.000000,4,4,...,0.446017,0.444920,0.465883,,0.447418,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,7.0,18.0,0.249649,18.613333,20.6,84.0,3.295837,4.875197,8755,4,...,1.591967,0.589014,0.389433,0.565333,0.437805,0.459923,0.0,0.0,81.0,21.1
8756,7.0,19.0,0.057697,0.000000,21.1,81.0,0.000000,0.000000,8756,4,...,0.389433,0.437805,1.186483,0.630883,0.538028,0.468165,0.0,0.0,79.0,21.7
8757,7.0,20.0,0.175785,0.000000,21.7,79.0,0.000000,0.000000,8757,4,...,1.186483,0.538028,3.155883,2.179383,0.785664,0.662877,0.0,0.0,76.0,21.3
8758,7.0,21.0,0.467565,0.000000,21.3,76.0,0.000000,0.000000,8758,4,...,3.155883,0.785664,2.054717,2.966050,0.647201,0.761794,0.0,0.0,76.0,20.9
