# Convert to HDF5

This notebook shows how to prepare the final HDF5 dataset. It assumes you have already followed the earlier instructions and have the consolidated data in the `final/` folder. Each JSON file there holds the entries for one WSID, keyed by a global unified index.

In [None]:
import re
import h5py # main dumping method
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import orjson

import vaex
import json
from glob import glob
from ftfy import ftfy
from io import StringIO
from dateparser import parse
from collections import Counter
# from tqdm import trange
from tqdm.notebook import trange

import math
from datetime import timedelta, date

from functools import reduce

In [None]:
def daterange(start_date, end_date):
    """Yield consecutive dates from ``start_date`` up to, but excluding, ``end_date``.

    The number of items is the whole-day difference ``(end_date - start_date).days``;
    nothing is yielded when that difference is zero or negative.
    """
    span_days = int((end_date - start_date).days)
    yield from (start_date + timedelta(offset) for offset in range(span_days))

In [None]:
# (column name, unit) pairs for the 17 measurement columns, in the order they
# are assigned to the CSV columns that follow "date" and "hour".
COLUMNS = [
    ("total_precipitation", "mm"),
    ("pressure", "mB"),
    ("max_pressure", "mB"),
    ("min_pressure", "mB"),
    ("radiation", "KJ/m^2"),
    ("temp", "C"),
    ("dew_point_temp", "C"),
    ("max_temp", "C"),
    ("min_temp", "C"),
    ("max_dew", "C"),
    ("min_dew", "C"),
    ("max_humidity", "percentage"),
    ("min_humidity", "percentage"),
    ("humidity", "percentage"),
    ("wind_direction", "deg"),
    ("wind_gust", "m/s"),
    ("wind_speed", "m/s")
]
COL_NAMES = [x[0] for x in COLUMNS]
UNITS = [x[1] for x in COLUMNS]
BAD_ROW_SUM = 0.0

def open_csv(f, wsid):
    """Load one station CSV into a DataFrame with float measurement columns.

    Parameters
    ----------
    f : str
        Path of a ``;``-separated, ISO-8859-1 encoded CSV file. The first
        8 lines are skipped; the next line is used as the header row.
    wsid : str or None
        When truthy, every measurement column is named ``"{wsid}_{name}"``;
        otherwise the plain names from ``COL_NAMES`` are used.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``hour`` followed by the 17 measurement columns,
        each measurement converted to ``float``.

    Raises
    ------
    KeyError
        If the parsed table has no ``"Unnamed: 19"`` column to drop.
    ValueError
        If the column count does not match, or a measurement cannot be
        parsed as a float.
    """
    with open(f, "r", encoding="iso8859_1") as d:
        # Every textual "-9999" is rewritten to "0" before parsing
        # (presumably the missing-value sentinel -- TODO confirm).
        body = "".join(d.readlines()[8:]).replace("-9999", "0")

    df = pd.read_csv(StringIO(body), sep=";")

    # Column 20 is dropped by name; a trailing ";" on each line produces it.
    df = df.drop("Unnamed: 19", axis=1)

    if wsid:
        local_col_names = [f"{wsid}_{c}" for c in COL_NAMES]
    else:
        local_col_names = [f"{c}" for c in COL_NAMES]

    df.columns = ["date", "hour"] + local_col_names

    def to_float(value):
        # Values use a decimal comma ("1,5"); normalise before float().
        return float(str(value).replace(",", "."))

    # Convert ALL measurement columns. local_col_names does not contain
    # "date"/"hour", so slicing it with [2:] would leave the first two
    # measurements (total_precipitation, pressure) unconverted.
    for col in local_col_names:
        df[col] = df[col].apply(to_float)

    return df

In [None]:
# Gather the JSON files under final/, skipping any path that contains
# "index" or "wsids_ordered".
final_files = []
for path in glob("final/*.json"):
    if "index" in path or "wsids_ordered" in path:
        continue
    final_files.append(path)
print(f"final_files: {len(final_files)}")

# Load the unified index from final/index.json and report its size.
with open("final/index.json", "r") as index_fh:
    unified_idx = json.load(index_fh)
print(len(unified_idx))

In [None]:
# Map every file path to the list of top-level keys found in that file.
# Files are visited in sorted order so the insertion order of the mapping
# (and anything derived from it) is deterministic.
gids_by_file = {}
ordered_files = sorted(final_files)
for step, path in zip(trange(len(ordered_files)), ordered_files):
    with open(path, "r") as fh:
        entries = json.load(fh)
    gids_by_file[path] = list(entries.keys())

In [None]:
# Sweep the minimum-history threshold (1.0 to 5.0 years in 0.5 steps) and
# record how many files would survive each choice.
year_wise_sample_count = {}
for min_year in np.linspace(1, 5, 9):
    samples = [len(gids) for gids in gids_by_file.values()]
    useful_files_gid = {}

    # A file is kept when it has at least `min_year` years' worth of
    # entries, counting 24 * 365 entries per year.
    required = 24 * 365 * min_year
    useful_files = [path for path in final_files if len(gids_by_file[path]) >= required]

    print(f"min: {min_year} Before: {len(gids_by_file)} After: {len(useful_files)}")

    year_wise_sample_count[min_year] = len(useful_files)

In [None]:
# Chosen threshold: keep only files with at least 4.5 years' worth of
# entries (24 * 365 entries per year).
min_year = 4.5

samples = [len(gids) for gids in gids_by_file.values()]
useful_files_gid = {}

min_entries = 24 * 365 * min_year
useful_files = sorted(
    path for path in final_files if len(gids_by_file[path]) >= min_entries
)

# The WSID is taken as characters 6..9 of each path, i.e. the four
# characters right after the 6-character "final/" prefix.
useful_wsids = [path[6:10] for path in useful_files]

#### Positions Meta

```python
wsmeta = pd.read_csv("../INMET/wsid_meta.csv")
wsmeta = wsmeta.T
headers = wsmeta.iloc[0].values.tolist()
wsmeta = wsmeta[1:]
wsmeta.columns = headers
wsmeta = wsmeta[wsmeta.elev != "F"] # corrupt data

wsmeta.lat = wsmeta.lat.values.astype(float)
wsmeta.long = wsmeta.long.values.astype(float)
wsmeta.elev = wsmeta.elev.values.astype(float)

wsm = json.loads(wsmeta.to_json(orient="index"))

with open("wsid_meta.json", "w") as f:
    f.write(json.dumps(wsm))

```

In [None]:
# Read the per-station metadata written out as wsid_meta.json.
with open("wsid_meta.json", "r") as meta_fh:
    wsmeta = json.load(meta_fh)

# Flatten to one 1-D array: lat, long, elev of the first WSID, then of the
# second, and so on, following the order of useful_wsids.
wsmeta_ordered = np.array([
    wsmeta[wsid][field]
    for wsid in useful_wsids
    for field in ("lat", "long", "elev")
])
print(f"wsmeta_ordered: {wsmeta_ordered.shape}")

In [None]:
# (Re)load the unified index from final/index.json so this part of the
# notebook has it in scope, then show how many entries it holds.
with open("final/index.json", "r") as index_fh:
    unified_idx = json.loads(index_fh.read())
print(len(unified_idx))

In [None]:
# Build one (month, day, hour) integer triple per entry of unified_idx.
# The values are sliced at fixed offsets, so they are assumed to look like
# "YYYY-MM-DD?HH..." (zero-padded) -- TODO confirm against final/index.json.
#
# NOTE(review): unified_idx comes from json.load, so its keys are strings and
# sorted() orders them lexicographically ("10" sorts before "2"). If the keys
# are unpadded integers, row j of datetime_data is NOT the entry for index j;
# confirm the key format before relying on positional lookup.
datetime_data = []
for k in sorted(unified_idx):
    stamp = unified_idx[k]
    mon = int(stamp[5:7])
    day = int(stamp[8:10])
    # Two characters are needed for the hour; a one-character slice would
    # turn 10..19 into 1 and 20..23 into 2.
    hrs = int(stamp[11:13])
    datetime_data.append((mon, day, hrs))
datetime_data = np.array(datetime_data)

In [None]:
# Define the schema in HDF5.
#
# Layout written to weatherGiga2.hdf5 (the file is truncated if it exists):
#   /wsid_meta      float32 copy of wsmeta_ordered
#   /datetime       int32 copy of datetime_data
#   /<idx>/data     float32 zeros, shape (len(useful_files), 17)
#   /<idx>/mask     float32 zeros, shape (len(useful_files),)
# with one group per key of unified_idx. 17 matches len(COLUMNS).

# The zero templates are identical for every group, so build them once.
zero_block = np.zeros(shape=(len(useful_files), 17), dtype=np.float32)
zero_mask = zero_block[:, 0]

# The context manager closes the file even if creating a dataset fails.
with h5py.File("weatherGiga2.hdf5", "w") as hdf:
    hdf.create_dataset("wsid_meta", shape=wsmeta_ordered.shape, dtype='f', data=wsmeta_ordered)
    hdf.create_dataset("datetime", shape=datetime_data.shape, dtype='i', data=datetime_data)

    for _, i in zip(trange(len(unified_idx)), unified_idx):
        grp = hdf.create_group(f"{i}")
        grp.create_dataset("data", shape=zero_block.shape, dtype='f', data=zero_block)
        grp.create_dataset("mask", shape=[zero_block.shape[0]], dtype='f', data=zero_mask)

In [None]:
# Fill the per-index "data" / "mask" datasets of weatherGiga2.hdf5.
#
# Row i of every group's "data" (shape (len(useful_files), 17)) and entry i of
# its "mask" belong to the i-th file of sorted(useful_files); the sort is what
# guarantees a stable file -> row mapping.

# Index of the first file to process. Keep 0 for a full run; raise it only to
# resume an interrupted run whose earlier files were already written.
resume_from = 0

failed_writes = 0

# The context manager closes the file even if the loop raises.
with h5py.File("weatherGiga2.hdf5", "r+") as hdf:
    uf = sorted(useful_files)
    pb1 = trange(len(uf))
    for i, f in zip(pb1, uf):

        if i < resume_from:
            continue

        pb1.set_description(f"{f}")
        with open(f, "r") as f2:
            data = json.load(f2)

        for _, gid in zip(trange(len(gids_by_file[f])), gids_by_file[f]):
            values = data[gid]
            total = sum(values[2:])
            # Skip entries whose values (from position 2 on) sum to NaN or 0.
            if math.isnan(total) or total == 0:
                continue

            this_idx_data = hdf[f"{gid}"]
            try:
                # "data" is 2-D, so this file's values go into row i. A flat
                # slice [i*17:(i+1)*17] would select 17 rows (or none).
                this_idx_data["data"][i] = values
                this_idx_data["mask"][i] = 1
            except Exception:
                # Best effort: keep going, but count what could not be written
                # instead of hiding it (and let Ctrl-C through).
                failed_writes += 1

if failed_writes:
    print(f"failed writes: {failed_writes}")