In [None]:
import sys
import pandas as pd

## Building

In [None]:
# Load the raw building metadata CSV into a DataFrame.
building = pd.read_csv("../../data/raw/building_metadata.csv")

In [None]:
# Display the frame to eyeball the loaded metadata.
building

In [None]:
# Check the dtypes pandas inferred from the CSV before casting them below.
building.dtypes

In [None]:
# Non-null count per column; a lower count marks a column with missing values.
building.count()

In [None]:
# Cast both identifier columns to 32-bit integers.
building = building.astype(dict.fromkeys(["building_id", "site_id"], "int32"))

In [None]:
# Downcast every float64 column to float32; other dtypes are left as they are.
for column_name, column_dtype in building.dtypes.items():
    if column_dtype == "float":
        building[column_name] = building[column_name].astype("float32")

In [None]:
# Persist the typed frame as Feather in the same folder as the source CSV.
building.to_feather("../../data/raw/building_metadata.feather")

## Weather

In [None]:
# Load the raw weather CSV into a DataFrame.
weather = pd.read_csv("../../data/raw/weather.csv")

In [None]:
# Check the dtypes pandas inferred from the CSV before converting them below.
weather.dtypes

In [None]:
# Parse the timestamp column into datetimes using an explicit format string.
weather = weather.assign(
    timestamp=pd.to_datetime(weather["timestamp"], format="%Y-%m-%d %H:%M:%S")
)

In [None]:
def to_float32(df):
    """Downcast the float64 columns of ``df`` to float32.

    The columns are reassigned on the passed-in frame itself, and that same
    frame is returned so the call can be used in an assignment. Columns of
    any other dtype are left untouched.
    """
    float64_names = [name for name in df.columns if df[name].dtype == "float"]
    for name in float64_names:
        df[name] = df[name].astype("float32")
    return df

In [None]:
# Downcast the float64 weather columns to float32 with the helper defined above.
weather = to_float32(weather)

In [None]:
# Cast site_id to int32, the same dtype given to site_id in the building frame.
weather = weather.astype({'site_id': 'int32'})

In [None]:
# Persist the converted frame as Feather in the same folder as the source CSV.
weather.to_feather("../../data/raw/weather.feather")

In [None]:
# Confirm the dtypes after the timestamp, float32 and int32 conversions.
weather.dtypes

## Meters

In [None]:
# Load the raw meter readings CSV into a DataFrame.
meters = pd.read_csv("../../data/raw/meters.csv")

In [None]:
# Row and column count of the full file, before sampling.
meters.shape

In [None]:
# Keep a reproducible 10% random sample of the readings (fixed random_state).
# NOTE(review): every later step, including the meters.feather export, works on
# this sample rather than the full file — confirm that is intended.
meters = meters.sample(frac=0.1, random_state=0)

In [None]:
# Parse the timestamp column into datetimes using an explicit format string.
meters = meters.assign(
    timestamp=pd.to_datetime(meters["timestamp"], format="%Y-%m-%d %H:%M:%S")
)

In [None]:
# Drop every reading that belongs to building 778.
# NOTE(review): the reason for excluding this building is not recorded in the
# notebook — document it or move the id into a named constant.
meters = meters.loc[meters["building_id"].ne(778)]

In [None]:
# Row and column count after sampling and dropping building 778.
meters.shape

In [None]:
# Dtypes before the downcast in the next cell.
meters.dtypes

In [None]:
# Downcast the identifier columns to int32 and the readings to float32.
meters = meters.astype(
    dict(building_id="int32", meter="int32", meter_reading="float32")
)

In [None]:
# Sampling and filtering left gaps in the index; rebuild a default RangeIndex
# (pandas documents to_feather as requiring a default index).
meters = meters.reset_index(drop=True)

In [None]:
# Confirm the dtypes after the downcast.
meters.dtypes

In [None]:
# Persist the sampled, filtered and downcast readings as Feather.
meters.to_feather("../../data/raw/meters.feather")

## Merge

In [None]:
# Root data directory, given relative to the notebook like the "../../data/raw/..."
# paths used in the sections above, so the merge step reads the Feather files
# written there and the notebook is not tied to one machine's home directory.
data_dir = "../../data"

In [None]:
from tqdm import notebook
import os

In [None]:
# Feather files registered for each folder key, as paths relative to the data
# directory. The order inside each list is the order in which load_data
# returns the corresponding frames.
FILES_FOLDER = {
    'raw': ['raw/building_metadata.feather', 'raw/meters.feather', 'raw/weather.feather'],
    'clean': ['clean/building_metadata.feather', 'clean/meters.feather', 'clean/weather.feather'],
    'merged': ['merged/data.feather'],
    'model': ['model/train.feather', 'model/val.feather', 'model/test.feather']
}


def load_data(folder, data_dir, dict_files=FILES_FOLDER):
    """Read every Feather file registered for ``folder`` and return the frames.

    Parameters
    ----------
    folder
        Key of ``dict_files`` selecting which group of files to load.
    data_dir
        Directory that the relative paths in ``dict_files`` are joined onto.
    dict_files
        Mapping from folder key to a list of relative file paths; defaults
        to ``FILES_FOLDER``.

    Returns
    -------
    list
        One DataFrame per listed file, in the same order as the list.

    Raises
    ------
    KeyError
        If ``folder`` is not a key of ``dict_files``.
    """
    # Resolve the file list first so an unknown folder fails before any output.
    relative_paths = dict_files[folder]

    print(":: Start loading data")
    return [
        pd.read_feather(os.path.join(data_dir, relative_path))
        for relative_path in notebook.tqdm(relative_paths)
    ]

In [None]:
# Unpack in the order listed under FILES_FOLDER['raw']: building metadata,
# meters, weather. This rebinds the three names used in the sections above.
building, meters, weather = load_data("raw", data_dir)

In [None]:
# Display the reloaded building metadata.
building

In [None]:
# Dtypes of the reloaded meters frame.
meters.dtypes

In [None]:
# Display the reloaded weather frame.
weather

In [None]:
# Attach the building metadata to every meter reading (left join keeps all
# readings). `validate` makes pandas raise a MergeError if building_id is not
# unique in `building`, instead of silently duplicating meter rows.
dataset = meters.merge(
    building,
    how="left",
    on="building_id",
    validate="many_to_one",
)

In [None]:
# Attach the weather row matching each reading's site and timestamp (left join
# keeps all readings). `validate` makes pandas raise a MergeError if `weather`
# holds more than one row per (site_id, timestamp), instead of silently
# duplicating readings.
dataset = dataset.merge(
    weather,
    how="left",
    on=["site_id", "timestamp"],
    validate="many_to_one",
)

In [None]:
# Display the merged frame.
dataset

In [None]:
# Non-null count per column of the merged frame; lower counts mark columns
# where the source value was missing or a left join found no match.
dataset.count()