# Generate parquet for ingestion

Data ingested into the Space2Stats (S2S) database must follow a specific schema. This notebook explores that schema and combines the various CSV files into a single output parquet file.

In [3]:
import sys
import os
import itertools
import boto3

import geopandas as gpd
import pandas as pd
import numpy as np

from scipy.spatial import cKDTree
from shapely.geometry import Point
from operator import itemgetter

sys.path.append("../../../gostrocks/src")

from GOSTrocks.misc import tPrint

In [4]:
# Load an existing parquet file whose layout serves as the reference schema,
# then preview its first rows.
good_schema_example = (
    "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/"
    "GHS_SMOD_Counts_and_Pop.parquet"
)
good_schema = pd.read_parquet(path=good_schema_example)
good_schema.head(5)

Unnamed: 0_level_0,11_POP,12_POP,13_POP,21_POP,22_POP,23_POP,30_POP,TOTAL_POP,COUNT_11,COUNT_12,COUNT_13,COUNT_21,COUNT_22,COUNT_23,COUNT_30
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
86001118fffffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
8600066d7ffffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
860006c0fffffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
860010adfffffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0
8600244cfffffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0


In [5]:
# Print one column name per line. A plain loop is used rather than a list
# comprehension so the cell does not also build and display a list of None.
for column_name in good_schema.columns:
    print(column_name)

Index(['11_POP', '12_POP', '13_POP', '21_POP', '22_POP', '23_POP', '30_POP',
       'TOTAL_POP', 'COUNT_11', 'COUNT_12', 'COUNT_13', 'COUNT_21', 'COUNT_22',
       'COUNT_23', 'COUNT_30'],
      dtype='object')

The schema for tables imported into Space2Stats is simple; only two columns are required:
- hex_id: the h3 grid id of the polygon used in the zonal calculation; the default is h3 level 6
- sum_pop_f_0_2020 (example): the attribute of interest to be ingested; its name should follow the style `f'{aggregation_method}_{attribute_name}'`

# Combine output CSV files into single parquet

In [8]:
# Search for all relevant CSV files
bucket = 'wbg-geography01'
prefix = 'Space2Stats/h3_stats_data/GLOBAL/Urbanization_Pop/'
region = 'us-east-1'
# NOTE(review): verify=False turns off TLS certificate verification for every
# request made by this client. It is kept here to preserve existing behaviour;
# prefer pointing `verify` at the appropriate CA bundle instead.
s3client = boto3.client('s3', region_name=region, verify=False)

# Page through the bucket listing and collect the key of every .csv object.
# The paginator follows continuation tokens itself, so no manual token
# bookkeeping is needed.
verbose = True
good_res = []
paginator = s3client.get_paginator('list_objects_v2')
for loops, page in enumerate(paginator.paginate(Bucket=bucket, Prefix=prefix)):
    if verbose:
        print(f"Completed loop: {loops}")
    # 'Contents' is omitted from the response when nothing matches the prefix,
    # so fall back to an empty list instead of raising KeyError.
    good_res.extend(
        obj['Key'] for obj in page.get('Contents', []) if obj['Key'].endswith('.csv')
    )

Completed loop: 0




In [29]:
# Read every CSV found in S3, keep only its SUM columns, add a per-row total,
# and stack the results into a single table indexed by `id`.
all_res = []
for i, in_file in enumerate(good_res, start=1):
    full_path = f's3://{bucket}/{in_file}'
    curD = pd.read_csv(full_path).set_index('id')
    # Keep only the columns whose name contains "SUM".
    curD = curD.loc[:, [x for x in curD.columns if "SUM" in x]]
    # Vectorised row total; equivalent to applying x.sum() to each row but
    # without the per-row Python call.
    curD['TOTAL_POP'] = curD.sum(axis=1)
    all_res.append(curD)
    tPrint(f"Completed {i} of {len(good_res)}")

if not all_res:
    # pd.concat raises an opaque ValueError on an empty list; fail with a
    # message that points at the actual cause.
    raise ValueError(f"No CSV files found under s3://{bucket}/{prefix}")

final_res = pd.concat(all_res)
#final_res.drop(['Unnamed: 0'], axis=1).to_parquet(f's3://{bucket}/Space2Stats/parquet/GLOBAL/GHS_SMOD_Counts.parquet')

10:39:22	Completed 1 of 842
10:39:23	Completed 2 of 842
10:39:23	Completed 3 of 842
10:39:24	Completed 4 of 842
10:39:24	Completed 5 of 842
10:39:25	Completed 6 of 842
10:39:25	Completed 7 of 842
10:39:26	Completed 8 of 842
10:39:27	Completed 9 of 842
10:39:27	Completed 10 of 842
10:39:28	Completed 11 of 842
10:39:28	Completed 12 of 842
10:39:29	Completed 13 of 842
10:39:29	Completed 14 of 842
10:39:30	Completed 15 of 842
10:39:30	Completed 16 of 842
10:39:31	Completed 17 of 842
10:39:31	Completed 18 of 842
10:39:32	Completed 19 of 842
10:39:33	Completed 20 of 842
10:39:33	Completed 21 of 842
10:39:34	Completed 22 of 842
10:39:34	Completed 23 of 842
10:39:35	Completed 24 of 842
10:39:35	Completed 25 of 842
10:39:36	Completed 26 of 842
10:39:36	Completed 27 of 842
10:39:37	Completed 28 of 842
10:39:38	Completed 29 of 842
10:39:38	Completed 30 of 842
10:39:39	Completed 31 of 842
10:39:39	Completed 32 of 842
10:39:40	Completed 33 of 842
10:39:41	Completed 34 of 842
10:39:41	Completed 35 o

In [36]:
# Align both tables before merging: index the counts table by `id` and
# normalise the column names.
# Only move `id` into the index when it is still a regular column, so the cell
# can be re-run (or fed a table already indexed by `id`) without a KeyError.
if 'id' in good_schema.columns:
    good_schema = good_schema.set_index('id')
# NOTE(review): this replaces every lowercase "c" in a column name, not just a
# leading prefix -- confirm no other column names contain a "c".
good_schema.columns = [x.replace("c", "COUNT") for x in good_schema.columns]
# Rename the per-class SUM columns to POP.
final_res.columns = [x.replace("SUM", "POP") for x in final_res.columns]

In [2]:
# Attach the GHS SMOD count columns to the population table, matching rows on
# the index of each frame and keeping every row of the population table.
all_data = pd.merge(
    final_res,
    good_schema,
    how='left',
    left_index=True,
    right_index=True,
)
all_data.head()

NameError: name 'final_res' is not defined

In [38]:
# Write the merged table to its parquet destination on S3.
output_parquet = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/GHS_SMOD_Counts_and_Pop.parquet"
all_data.to_parquet(output_parquet)

In [1]:
# Display the column labels of the merged table.
all_data.columns

NameError: name 'all_data' is not defined