# Exploring S2S input schema

Data must follow a specific schema before it can be ingested into the S2S (Space2Stats) database. This notebook explores and documents that schema.

In [1]:
import sys
import os
import itertools
import rasterio
import boto3

import geopandas as gpd
import pandas as pd
import numpy as np

from scipy.spatial import cKDTree
from shapely.geometry import Point
from operator import itemgetter

sys.path.append("/home/wb411133/Code/GOSTrocks/src")

import GOSTrocks.ntlMisc as ntlMisc
import GOSTrocks.rasterMisc as rMisc
from GOSTrocks.misc import tPrint



In [7]:
# Load the sample table used as the reference example of the input schema
# and preview its first rows.
good_schema_example = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/combined_population_sample.parquet"
good_schema = pd.read_parquet(good_schema_example)
good_schema.head()

Unnamed: 0,hex_id,sum_pop_f_0_2020,sum_pop_f_10_2020
0,860000007ffffff,-1.0,-1.0
1,86000000fffffff,-1.0,-1.0
2,860000017ffffff,-1.0,-1.0
3,86000001fffffff,-1.0,-1.0
4,860000027ffffff,-1.0,-1.0


The schema for tables imported into Space2Stats is simple; only two columns are required:
- hex_id: the H3 grid ID of the polygon used in the zonal calculation; the default is H3 level 6
- sum_pop_f_0_2020: the attribute of interest to be ingested; its name should follow the pattern __f'{aggregation_method}_{attribute_name}'__

# Combine output CSV files into single parquet

In [3]:
# Collect the S3 keys of every CSV file stored under the Urbanization prefix.
# Define S3 parameters
bucket = 'wbg-geography01'
prefix = 'Space2Stats/h3_stats_data/GLOBAL/Urbanization/'
region = 'us-east-1'
s3client = boto3.client('s3', region_name=region)

# Page through the bucket listing and keep the keys that end in .csv.
# The paginator follows the continuation tokens for us, one response per page.
verbose = True
good_res = []
paginator = s3client.get_paginator('list_objects_v2')
for loops, objects in enumerate(paginator.paginate(Bucket=bucket, Prefix=prefix)):
    if verbose:
        print(f"Completed loop: {loops}")
    # 'Contents' is missing from the response when no key matches the prefix,
    # so fall back to an empty list instead of raising KeyError.
    for res in objects.get('Contents', []):
        if res['Key'].endswith('.csv'):
            good_res.append(res['Key'])
            

Completed loop: 0


In [5]:
# Read every CSV listed in good_res from S3 and combine them into one table.
all_res = []
for i, in_file in enumerate(good_res, start=1):
    full_path = f's3://{bucket}/{in_file}'
    curD = pd.read_csv(full_path)
    all_res.append(curD)
    tPrint(f"Completed {i} of {len(good_res)}")

# ignore_index=True gives the combined table a fresh 0..n-1 index. Without it
# every file contributes its own 0..k row labels, so the result carries
# duplicate index labels that to_parquet then stores as an extra column.
final_res = pd.concat(all_res, ignore_index=True)
# 'Unnamed: 0' is dropped here before the combined table is written to S3.
final_res.drop(['Unnamed: 0'], axis=1).to_parquet(f's3://{bucket}/Space2Stats/parquet/GLOBAL/GHS_SMOD_Counts.parquet')

11:31:31	Completed 1 of 842
11:31:32	Completed 2 of 842
11:31:32	Completed 3 of 842
11:31:32	Completed 4 of 842
11:31:32	Completed 5 of 842
11:31:32	Completed 6 of 842
11:31:33	Completed 7 of 842
11:31:33	Completed 8 of 842
11:31:33	Completed 9 of 842
11:31:33	Completed 10 of 842
11:31:33	Completed 11 of 842
11:31:33	Completed 12 of 842
11:31:34	Completed 13 of 842
11:31:34	Completed 14 of 842
11:31:34	Completed 15 of 842
11:31:34	Completed 16 of 842
11:31:35	Completed 17 of 842
11:31:35	Completed 18 of 842
11:31:35	Completed 19 of 842
11:31:35	Completed 20 of 842
11:31:35	Completed 21 of 842
11:31:36	Completed 22 of 842
11:31:36	Completed 23 of 842
11:31:36	Completed 24 of 842
11:31:36	Completed 25 of 842
11:31:36	Completed 26 of 842
11:31:37	Completed 27 of 842
11:31:37	Completed 28 of 842
11:31:37	Completed 29 of 842
11:31:37	Completed 30 of 842
11:31:37	Completed 31 of 842
11:31:38	Completed 32 of 842
11:31:38	Completed 33 of 842
11:31:38	Completed 34 of 842
11:31:38	Completed 35 o

## Create table of nighttime lights values

In [None]:
# Collect the S3 keys of every parquet file stored under the prefix below.
# NOTE(review): this section is about nighttime lights and the next cell only
# keeps files whose name marks them as 'npp', yet the prefix points at the
# Urbanization folder - confirm this is the intended location.
# Define S3 parameters
bucket = 'wbg-geography01'
prefix = 'Space2Stats/parquet/GLOBAL/Urbanization/'
region = 'us-east-1'
s3client = boto3.client('s3', region_name=region)

# Page through the bucket listing and keep the keys that end in 'parquet'.
# The paginator follows the continuation tokens for us, one response per page.
verbose = True
good_res = []
paginator = s3client.get_paginator('list_objects_v2')
for loops, objects in enumerate(paginator.paginate(Bucket=bucket, Prefix=prefix)):
    if verbose:
        print(f"Completed loop: {loops}")
    # 'Contents' is missing from the response when no key matches the prefix,
    # so fall back to an empty list instead of raising KeyError.
    for res in objects.get('Contents', []):
        if res['Key'].endswith('parquet'):
            good_res.append(res['Key'])
            

In [None]:
# Read each parquet file whose image-type token is 'npp', rename its columns
# to the naming standard described above, and collect the tables in all_res.
all_res = []
for in_file in good_res:
    full_path = f's3://{bucket}/{in_file}'
    # File names are "_"-separated: token 2 starts with the 6-character date,
    # token 1 is the image type.
    name_parts = os.path.basename(in_file).split("_")
    date = name_parts[2][:6]
    img_type = name_parts[1]
    if img_type != 'npp':
        continue
    tPrint(date)
    curD = pd.read_parquet(full_path)
    # Suffix every column with the dataset tag and date, then restore the
    # name of the last column, which is treated as the hex id.
    columns = [f'{x}_VIIRS_NTL_{date}' for x in curD.columns]
    columns[-1] = 'hex_id'
    curD.columns = columns
    curD = curD.set_index('hex_id')
    # Negative values are the previous nodata marker (-1); blank them out
    curD[curD<0] = None
    all_res.append(curD.copy())

In [None]:
# Preview the S3 path of the combined output; prefix[:-1] drops the last
# character of prefix (its trailing "/") before the suffix is appended.
f's3://{bucket}/{prefix[:-1]}_combined.parquet'

In [None]:
# Loop through the features of the datasets and write iteratively to a parquet file
out_parquet = f's3://{bucket}/{prefix[:-1]}_combined.parquet'

# TODO: the write loop was never implemented. The cell used to end in a bare
# `for c_res in all_res` with no colon or body, which is a SyntaxError and
# stops Restart & Run All. The loop header is kept as a comment until the
# body is written.
# for c_res in all_res:
#     ...


In [None]:
# Take the first table and show characters [-6:-2] of its first column name.
# Column names were built with a 6-character date suffix, so this is
# presumably the year (assumes the suffix is YYYYMM - confirm).
c_res = all_res[0]
c_res.columns[0][-6:-2]

In [None]:
### This causes memory errors :(
# Group the monthly tables by year so each year can be combined separately.
# An earlier attempt, previously left here as commented-out code, split every
# table into its SUM / MIN / MAX / MEAN columns; the four lists below are what
# remains of it and are not filled.
sum_res = []
min_res = []
max_res = []
mean_res = []
annual_sums = {}
for c_res in all_res:
    # Characters [-6:-2] of the first column name are used as the year key.
    year = c_res.columns[0][-6:-2]
    # setdefault creates the list on first sight of a year; this replaces a
    # bare `except:` that would also have hidden unrelated errors.
    annual_sums.setdefault(year, []).append(c_res)

    


In [None]:
# Show the year keys collected in annual_sums
annual_sums.keys()

In [None]:
# For every year, join that year's monthly tables side by side and write the
# result to its own parquet file on S3, logging the year once it is written.
tPrint("*** Start")
for year in annual_sums:
    cur_data = annual_sums[year]
    out_parquet = f's3://{bucket}/{prefix[:-1]}_{year}_combined.parquet'
    # Column-wise concat aligns the tables on their shared index
    yearly_res = pd.concat(cur_data, axis="columns")
    yearly_res.to_parquet(out_parquet)
    tPrint(year)
