# Generate parquet for ingestion

Data ingested into the S2S (Space2Stats) database must follow a specific schema. This notebook explores that schema and combines the various CSV files into a single output parquet file.

In [1]:
import sys
import os
import urllib3
import boto3
import ssl
import warnings

import geopandas as gpd
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from shapely.geometry import Point
from operator import itemgetter

# Make a local GOSTrocks checkout importable. This absolute path is
# machine-specific; adjust it (or install GOSTrocks) when running elsewhere.
sys.path.append(r"C:\WBG\Work\Code\GOSTrocks\src")

from GOSTrocks.misc import tPrint

# SECURITY NOTE: swaps the default HTTPS context factory for the unverified one,
# so connections that rely on the ssl default context skip certificate checks.
# NOTE(review): presumably a workaround for a TLS-intercepting proxy — confirm.
ssl._create_default_https_context = ssl._create_unverified_context

# Silence urllib3's insecure-request warnings and all FutureWarnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# S3 client created with certificate verification turned off (verify=False).
s3 = boto3.client('s3', verify=False)


In [7]:
# Load an existing parquet file from S3 to use as the reference schema; its
# hex_id column is reused below as the index of the output table.
good_schema_example = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_counts_and_pop.parquet"
good_schema = pd.read_parquet(good_schema_example)
# Preview the first rows of the reference table.
good_schema.head()

Unnamed: 0,hex_id,ghs_11_count,ghs_12_count,ghs_13_count,ghs_21_count,ghs_22_count,ghs_23_count,ghs_30_count,ghs_total_count,ghs_11_pop,ghs_12_pop,ghs_13_pop,ghs_21_pop,ghs_22_pop,ghs_23_pop,ghs_30_pop,ghs_total_pop
0,860000007ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,86000000fffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,860000017ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,86000001fffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,860000027ffffff,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The schema for tables imported into Space2Stats is simple; only two kinds of column are required:
- hex_id: the h3 grid id of the polygon used in the zonal calculation; the default is h3 level 6
- one or more attribute columns (e.g. sum_pop_f_0_2020): the attribute of interest to be ingested, named in the style __f'{aggregation_method}_{attribute_name}'__

# Combine output CSV files into single parquet

In [6]:
# Gather the path of every CSV file found anywhere beneath the local folder.
# The string literal below is disabled S3 configuration kept for reference; it
# is evaluated as a bare expression and has no effect.
'''bucket = 'wbg-geography01' 
prefix = 'Space2Stats/h3_stats_data/GLOBAL/VIIRS_Monthly_LEN/'
region = 'us-east-1'
s3client = boto3.client('s3', region_name=region, verify=False)
'''

csv_folder = "C:/WBG/Work/S2S/data/GHSL"
# Walk the tree once, keeping files in the order os.walk yields them.
csv_files = [
    os.path.join(parent_dir, file_name)
    for parent_dir, _sub_dirs, file_names in os.walk(csv_folder)
    for file_name in file_names
    if file_name.endswith(".csv")
]

In [26]:
# Create the empty output table: one row per hex_id of the reference schema and
# one column per five-year epoch from 1975 to 2030 (inclusive).
# Columns start as NaN (float64) instead of None. None produces object-dtype
# columns, and a column that never receives data may then be written to parquet
# as an untyped all-null column rather than a numeric one.
year_columns = [f"sum_built_area_m_{c_year}" for c_year in range(1975, 2031, 5)]
out_res = pd.DataFrame(np.nan, index=good_schema['hex_id'], columns=year_columns)

out_res.head()

Unnamed: 0_level_0,sum_built_area_m_1975,sum_built_area_m_1980,sum_built_area_m_1985,sum_built_area_m_1990,sum_built_area_m_1995,sum_built_area_m_2000,sum_built_area_m_2005,sum_built_area_m_2010,sum_built_area_m_2015,sum_built_area_m_2020,sum_built_area_m_2025,sum_built_area_m_2030
hex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
860000007ffffff,,,,,,,,,,,,
86000000fffffff,,,,,,,,,,,,
860000017ffffff,,,,,,,,,,,,
86000001fffffff,,,,,,,,,,,,
860000027ffffff,,,,,,,,,,,,


In [27]:
# Local directory that receives the combined parquet file (machine-specific path).
out_folder = r"C:\WBG\Work\S2S\ingest"
# Show the first discovered CSV path as a quick check of the file-name pattern.
print(csv_files[0])

C:/WBG/Work/S2S/data/GHSL\8001fffffffffff\ghsl_built_m_E1975.csv


In [28]:
# Copy the per-hex SUM of every CSV into the out_res column matching its year.
for res_file in tqdm(csv_files):
    # File names end in a four-character year followed by ".csv"
    # (e.g. "ghsl_built_m_E1975.csv"), so this slice isolates the year.
    file_name = os.path.basename(res_file)
    year = file_name[-8:-4]
    out_col = "sum_built_area_m_" + year

    # Index on the 'id' column so rows are matched to out_res by label.
    curD = pd.read_csv(res_file).set_index('id')
    out_res.loc[curD.index, out_col] = curD['SUM']

  0%|          | 0/1133 [00:00<?, ?it/s]

In [29]:
# Promote the hex_id index to a regular column so it is stored as data.
# Guarded so that re-running this cell is a no-op: an unconditional second
# reset_index would insert a spurious "index" column into the output.
if 'hex_id' not in out_res.columns:
    out_res = out_res.reset_index()
out_res.head()

Unnamed: 0,hex_id,sum_built_area_m_1975,sum_built_area_m_1980,sum_built_area_m_1985,sum_built_area_m_1990,sum_built_area_m_1995,sum_built_area_m_2000,sum_built_area_m_2005,sum_built_area_m_2010,sum_built_area_m_2015,sum_built_area_m_2020,sum_built_area_m_2025,sum_built_area_m_2030
0,860000007ffffff,,,,,,,,,,,,
1,86000000fffffff,,,,,,,,,,,,
2,860000017ffffff,,,,,,,,,,,,
3,86000001fffffff,,,,,,,,,,,,
4,860000027ffffff,,,,,,,,,,,,


In [30]:
# Write the combined table to a single parquet file inside out_folder.
parquet_path = os.path.join(out_folder, "GHSL_built_area_m.parquet")
out_res.to_parquet(parquet_path)


# Generate Metadata

In [31]:
# List the column names of the output table (the fields the metadata must describe).
out_res.columns

Index(['hex_id', 'sum_built_area_m_1975', 'sum_built_area_m_1980',
       'sum_built_area_m_1985', 'sum_built_area_m_1990',
       'sum_built_area_m_1995', 'sum_built_area_m_2000',
       'sum_built_area_m_2005', 'sum_built_area_m_2010',
       'sum_built_area_m_2015', 'sum_built_area_m_2020',
       'sum_built_area_m_2025', 'sum_built_area_m_2030'],
      dtype='object')

In [32]:
# Display the full path of the parquet file produced by this notebook.
os.path.join(out_folder, "GHSL_built_area_m.parquet")

'C:\\WBG\\Work\\S2S\\ingest\\GHSL_built_area_m.parquet'

# Debugging
