# Submission 0

### Imports

In [3]:
%matplotlib inline

import gc
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# import ray.dataframe as pd

In [4]:
# Default plot settings
sns.set(rc={'figure.figsize': (16, 6), 
            'font.size': 12})

### Train data

In [9]:
path = '../submissions/sub0/'

In [10]:
dtypes = {
    'building_id': 'uint16',
    'use_encoded': 'uint8',
    'year_built': 'uint16',
    'square_feet': 'uint32',
    'day': 'uint8',
    'hour': 'uint8',
    'weekday': 'uint8',
    'day_of_year': 'uint16',
    'meter': 'uint8',
    'air_temperature': 'float32',
    'sea_level_pressure': 'float32',
    'cloud_coverage': 'uint8',
    'precip_depth_1_hr': 'float32',
    'wind_direction': 'uint16',
    'wind_speed': 'float32'
}

In [11]:
df = pd.read_csv(f'{path}train.csv', dtype=dtypes).iloc[:, 1:]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20137746 entries, 0 to 20137745
Data columns (total 15 columns):
building_id           uint16
use_encoded           uint8
year_built            uint16
square_feet           uint32
day                   uint8
hour                  uint8
weekday               uint8
meter                 uint8
air_temperature       float32
sea_level_pressure    float32
cloud_coverage        uint8
precip_depth_1_hr     float32
wind_direction        uint16
wind_speed            float32
day_of_year           uint16
dtypes: float32(4), uint16(4), uint32(1), uint8(6)
memory usage: 653.0 MB


In [20]:
lb = pd.read_csv(f'{path}label.csv', dtype='float32', names=['i','reading']).iloc[:, 1:]
lb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20137746 entries, 0 to 20137745
Data columns (total 1 columns):
reading    float32
dtypes: float32(1)
memory usage: 76.8 MB


In [13]:
df.head()

Unnamed: 0,building_id,use_encoded,year_built,square_feet,day,hour,weekday,meter,air_temperature,sea_level_pressure,cloud_coverage,precip_depth_1_hr,wind_direction,wind_speed,day_of_year
0,0,0,2008,7432,1,0,4,0,19.4,1019.762939,4,0.0,360,0.0,1
1,1,0,2004,2720,1,0,4,0,19.4,1019.762939,4,0.0,360,0.0,1
2,2,0,1991,5376,1,0,4,0,19.4,1019.762939,4,0.0,360,0.0,1
3,3,0,2002,23685,1,0,4,0,19.4,1019.762939,4,0.0,360,0.0,1
4,4,0,1975,116607,1,0,4,0,19.4,1019.762939,4,0.0,360,0.0,1


In [21]:
lb.tail()

Unnamed: 0,reading
20137741,8.75
20137742,4.825
20137743,0.0
20137744,159.574997
20137745,2.85


### Test data

In [22]:
data_path='../data/raw/test/'

In [28]:
meter = pd.read_csv(f'{data_path}test.csv', parse_dates=['timestamp'], dtype={'building_id': 'uint16', 'meter': 'uint8'})[1:]
meter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697599 entries, 1 to 41697599
Data columns (total 4 columns):
row_id         int64
building_id    uint16
meter          uint8
timestamp      datetime64[ns]
dtypes: datetime64[ns](1), int64(1), uint16(1), uint8(1)
memory usage: 755.6 MB


In [None]:
dtypes = {
    'site_id': 'uint16',
    'air_temperature': 'float32',
    'dew_temperature': 'float32',
    'sea_level_pressure': 'float32',
    'cloud_coverage': 'uint8',
    'precip_depth_1_hr': 'float32',
    'wind_direction': 'uint16',
    'wind_speed': 'float32'
}

In [30]:
weather = pd.read_csv(f'{data_path}weather_test.csv', parse_dates=['timestamp'])
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.8,4.0,11.7,,1021.4,100.0,3.6
1,0,2017-01-01 01:00:00,17.8,2.0,12.8,0.0,1022.0,130.0,3.1
2,0,2017-01-01 02:00:00,16.1,0.0,12.8,0.0,1021.9,140.0,3.1
3,0,2017-01-01 03:00:00,17.2,0.0,13.3,0.0,1022.2,140.0,3.1
4,0,2017-01-01 04:00:00,16.7,2.0,13.3,0.0,1022.3,130.0,2.6


In [31]:
building = pd.read_csv('../data/output/eda_building.csv')[1:]
building.head()

Unnamed: 0.1,Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count,use_encoded
0,0,0,0,Education,7432,2008,1,0
1,1,0,1,Education,2720,2004,1,0
2,2,0,2,Education,5376,1991,1,0
3,3,0,3,Education,23685,2002,1,0
4,4,0,4,Education,116607,1975,4,0
