# Generate parquet for ingestion

To be ingested into the Space2Stats (S2S) database, data must follow a specific schema. This notebook explores that schema and combines the various CSV files into a single output parquet file.

In [1]:
import sys
import os
import itertools
import boto3

import geopandas as gpd
import pandas as pd
import numpy as np

from shapely.geometry import Point
from operator import itemgetter

# Add a local GOSTrocks source directory to the module search path so that the
# `from GOSTrocks.misc import tPrint` import below can be resolved.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere unless GOSTrocks is installed or this path is adjusted.
sys.path.append(r"C:\WBG\Work\Code\GOSTrocks\src")

from GOSTrocks.misc import tPrint

In [2]:
# Load a table that is already in the expected ingestion format and preview
# its first rows to see what the schema looks like.
good_schema_example = (
    "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_Counts.parquet"
)
good_schema = pd.read_parquet(path=good_schema_example)
good_schema.head(n=5)

FileNotFoundError: wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_Counts.parquet

The schema for tables imported into Space2Stats is simple; only two columns are required:
- hex_id: the h3 grid id of the polygon used in the zonal calculation; the default is h3 level 6
- an attribute column (for example, sum_pop_f_0_2020): the attribute of interest to be ingested; its name should follow the pattern __f'{aggregation_method}_{attribute_name}'__

# Combine output CSV files into single parquet

In [None]:
# Search for all relevant CSV files
bucket = 'wbg-geography01'
region = 'us-east-1'
verbose = True

# NOTE(review): verify=False turns off TLS certificate verification for every
# request made through this client. It is kept exactly as before; confirm it is
# still needed (pointing `verify` at a CA bundle would be the safer option).
s3client = boto3.client('s3', region_name=region, verify=False)

POP_PREFIX = 'Space2Stats/h3_stats_data/GLOBAL/Urbanization_Pop/'
URBAN_PREFIX = 'Space2Stats/h3_stats_data/GLOBAL/Urbanization/'


def list_csv_keys(client, bucket, prefix, verbose=True):
    """Return the keys of every ``.csv`` object stored under ``prefix``.

    Parameters
    ----------
    client : boto3 S3 client
        Client used to list the bucket.
    bucket : str
        Name of the S3 bucket to search.
    prefix : str
        Key prefix that restricts the listing.
    verbose : bool, default True
        When True, print one progress line per page of results.

    Returns
    -------
    list of str
        Object keys ending in ``.csv``, in listing order. The list is empty
        when nothing under ``prefix`` matches.
    """
    csv_keys = []
    # The paginator follows continuation tokens for us, one page per request.
    paginator = client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    for loops, page in enumerate(pages):
        if verbose:
            print(f"Completed loop: {loops}")
        # 'Contents' is missing from the response when a page lists no objects,
        # so fall back to an empty list instead of raising KeyError.
        for res in page.get('Contents', []):
            if res['Key'].endswith('.csv'):
                csv_keys.append(res['Key'])
    return csv_keys


# Keys of the population CSVs and of the urbanization count CSVs
good_res = list_csv_keys(s3client, bucket, POP_PREFIX, verbose=verbose)
good_res_u = list_csv_keys(s3client, bucket, URBAN_PREFIX, verbose=verbose)

In [None]:
# Read every urbanization CSV, index it by 'id', rename the columns to the
# 'ghs_<name>_count' style and add a per-row total.
all_res_u = []
for i, in_file in enumerate(good_res_u, start=1):
    full_path = f's3://{bucket}/{in_file}'
    # The first CSV column is consumed as the index and then replaced by 'id'.
    curD = pd.read_csv(full_path, index_col=0).set_index('id')
    curD.columns = [x.replace("c_", "ghs_") + "_count" for x in curD.columns]
    # Vectorized row total; gives the same values as a row-wise apply of
    # sum() without calling a Python function for every row.
    curD['ghs_total_count'] = curD.sum(axis=1)
    all_res_u.append(curD)
    tPrint(f"Completed {i} of {len(good_res_u)}")

In [None]:
# Read every population CSV, keep only the columns whose name contains "SUM",
# rename them to the 'ghs_<name>_pop' style and add a per-row total.
all_res = []
for i, in_file in enumerate(good_res, start=1):
    full_path = f's3://{bucket}/{in_file}'
    curD = pd.read_csv(full_path)
    # NOTE(review): this filter drops every column without "SUM" in its name,
    # which would include an identifier column if the CSV has one — confirm
    # whether the h3 id needs to be kept for the later ingestion step.
    curD = curD.loc[:, [x for x in curD.columns if "SUM" in x]]
    curD.columns = ["ghs_" + x.replace("_SUM", "_pop") for x in curD.columns]
    # Vectorized row total; gives the same values as a row-wise apply of
    # sum() without calling a Python function for every row.
    curD['ghs_total_pop'] = curD.sum(axis=1)
    all_res.append(curD)
    tPrint(f"Completed {i} of {len(good_res)}")

In [None]:
# Stack the per-file frames row-wise into one table per dataset:
# population sums first, then urbanization counts.
final_res = pd.concat(objs=all_res, axis=0)
final_res_u = pd.concat(objs=all_res_u, axis=0)

In [5]:
# Lower-case every column name of the combined parquet and write the table
# back to the same S3 location, replacing the existing file.
s3_path = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_counts_and_pop.parquet"
inD = pd.read_parquet(path=s3_path)
inD_columns = list(map(str.lower, inD.columns))
inD.columns = inD_columns
inD.to_parquet(path=s3_path)
