# Exploring S2S input schema

Data must follow a specific schema before it can be ingested into the Space2Stats (S2S) database. This notebook explores and documents that schema.

In [1]:
import sys
import os
import itertools
import rasterio
import boto3

import geopandas as gpd
import pandas as pd
import numpy as np

from scipy.spatial import cKDTree
from shapely.geometry import Point
from operator import itemgetter

sys.path.append("/home/wb411133/Code/GOSTrocks/src")

import GOSTrocks.ntlMisc as ntlMisc
import GOSTrocks.rasterMisc as rMisc
from GOSTrocks.misc import tPrint



In [2]:
# Load an existing Space2Stats table that already follows the expected schema
# and preview its first rows as a reference for the tables built below.
good_schema_example = "s3://wbg-geography01/Space2Stats/parquet/GLOBAL/combined_population.parquet"
good_schema = pd.read_parquet(path=good_schema_example)
good_schema.head(n=5)

Unnamed: 0,hex_id,sum_pop_f_0_2020,sum_pop_f_10_2020,sum_pop_f_15_2020,sum_pop_f_1_2020,sum_pop_f_20_2020,sum_pop_f_25_2020,sum_pop_f_30_2020,sum_pop_f_35_2020,sum_pop_f_40_2020,...,sum_pop_m_40_2020,sum_pop_m_45_2020,sum_pop_m_50_2020,sum_pop_m_55_2020,sum_pop_m_5_2020,sum_pop_m_60_2020,sum_pop_m_65_2020,sum_pop_m_70_2020,sum_pop_m_75_2020,sum_pop_m_80_2020
0,860000007ffffff,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,86000000fffffff,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,860000017ffffff,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,86000001fffffff,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,860000027ffffff,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


The schema for tables to be imported into Space2Stats is simple; only two kinds of column are required:
- hex_id: the h3 grid id of the polygon used in the zonal calculation; the default is h3 level 6
- sum_pop_f_0_2020: an attribute of interest to be ingested; each attribute column should be named in the style `f'{aggregation_method}_{attribute_name}'`

## Create table of nighttime lights values

In [3]:
# This contains nighttime light zonal stats for all individual months
# Define S3 parameters
bucket = 'wbg-geography01'
prefix = 'Space2Stats/parquet/GLOBAL/NTL_VIIRS_LEN/'
region = 'us-east-1'
s3client = boto3.client('s3', region_name=region)

# Page through the S3 listing under `prefix` and collect the keys of all
# parquet files. list_objects_v2 returns the listing in pages; each truncated
# page carries a continuation token that must be passed to fetch the next one.
more_results = True
loops = 0
verbose = True
good_res = []
token = None
while more_results:
    if verbose:
        print(f"Completed loop: {loops}")
    list_kwargs = {'Bucket': bucket, 'Prefix': prefix}
    if token is not None:
        # Only follow-up requests carry a continuation token
        list_kwargs['ContinuationToken'] = token
    objects = s3client.list_objects_v2(**list_kwargs)
    more_results = objects['IsTruncated']
    token = objects['NextContinuationToken'] if more_results else None
    loops += 1
    # 'Contents' is absent from the response when no key matches the prefix,
    # so fall back to an empty list instead of raising a KeyError.
    for res in objects.get('Contents', []):
        if res['Key'].endswith('parquet'):
            # Third-from-last path component of the key; not read anywhere in
            # the cells visible here, kept in case a later cell relies on it.
            cur_variable = res['Key'].split("/")[-3]
            good_res.append(res['Key'])
            

Completed loop: 0


In [None]:
# Read every monthly 'npp' parquet file listed in good_res, rename its columns
# to the Space2Stats naming standard, index it by hex_id and collect the
# resulting frames in all_res (one frame per month).
all_res = []
for in_file in good_res:
    # Parse the file name once. The code relies on the second "_"-separated
    # token being the image type and the third starting with the date
    # (first six characters, YYYYMM judging by the printed values).
    name_parts = os.path.basename(in_file).split("_")
    if len(name_parts) < 3:
        # Unexpected file name under the prefix: skip it instead of failing
        continue
    img_type = name_parts[1]
    if img_type != 'npp':
        continue
    date = name_parts[2][:6]
    full_path = f's3://{bucket}/{in_file}'
    tPrint(date)
    curD = pd.read_parquet(full_path)
    # Name the columns according to the standards above
    columns = [f'{x}_VIIRS_NTL_{date}' for x in curD.columns]
    # NOTE(review): assumes the hex id is the last column of every file - confirm
    columns[-1] = 'hex_id'
    curD.columns = columns
    curD = curD.set_index('hex_id')
    # Convert the previous nodata values (-1) to NoData
    curD[curD < 0] = None
    # curD is rebound on the next iteration, so nothing else references this
    # frame; appending it directly avoids duplicating every month in memory.
    all_res.append(curD)

11:28:23	201201
11:28:52	201202
11:29:23	201203
11:29:53	201204
11:30:18	201205
11:30:47	201206
11:31:18	201207
11:31:48	201208
11:32:16	201209
11:32:43	201210
11:33:11	201211
11:33:40	201212
11:34:11	201301
11:34:41	201302
11:35:08	201303
11:35:35	201304
11:36:04	201305
11:36:31	201306
11:36:59	201307
11:37:27	201308
11:37:53	201309
11:38:20	201310
11:38:49	201311
11:39:16	201312
11:39:45	201401
11:40:12	201402
11:40:41	201404
11:41:09	201405
11:41:38	201406
11:42:04	201407
11:42:30	201408
11:42:57	201409
11:43:27	201410
11:43:54	201411
11:44:21	201412
11:44:48	201501
11:45:15	201502
11:45:41	201503
11:46:08	201504
11:46:35	201505
11:47:02	201506
11:47:28	201507
11:47:54	201508
11:48:21	201509
11:48:46	201510
11:49:11	201511
11:49:37	201512
11:50:03	201601
11:50:27	201602
11:50:53	201603
11:51:19	201604
11:51:46	201605
11:52:14	201606
11:52:39	201607
11:53:05	201608
11:53:33	201609
11:54:00	201610
11:54:27	201611
11:54:55	201612
11:55:20	201701
11:55:46	201702
11:56:10	201703
11:56:38

In [None]:
### This causes memory errors :(
# Split every monthly frame into one frame per aggregation method, selected by
# the substring found in the column names. Selecting with a list of labels
# already returns a new frame, so no extra .copy() is made (saves memory).
sum_res = [c_res.loc[:, [x for x in c_res.columns if "SUM" in x]] for c_res in all_res]
min_res = [c_res.loc[:, [x for x in c_res.columns if "MIN" in x]] for c_res in all_res]
max_res = [c_res.loc[:, [x for x in c_res.columns if "MAX" in x]] for c_res in all_res]
mean_res = [c_res.loc[:, [x for x in c_res.columns if "MEAN" in x]] for c_res in all_res]

# Join the monthly SUM frames column-wise on their index, one month at a time.
# `final` is reset explicitly so re-running the cell never builds on a stale
# value from a previous run, and errors raised by join() (including
# MemoryError) propagate instead of silently restarting the result.
final = None
for i, c_res in enumerate(sum_res):
    tPrint(i)
    if final is None:
        final = c_res
    else:
        final = final.join(c_res)