# Calculating ADM results based on hexagons

While most of our work has focused on the consistent, global, hexagon grid, the ultimate goal of the S2S work program is to generate a database of geospatial aggregates at the administrative level 2. **We are still in the process of acquiring the admin bounds we are going to publish.**

This notebook takes the results of the __ADM_map_adm2_h3xid.ipynb__ establishing the connection between adm2 features and hexids and uses the overlap to calculate adm2 summaries.

In [1]:
import sys, os, time

import pandas as pd

from GOSTrocks.misc import tPrint
from space2stats_client import Space2StatsClient
from tqdm.notebook import tqdm

import requests
requests.packages.urllib3.disable_warnings()

In [2]:
# a different script (ADM_map_adm2_h3xid.ipynb) generates country-level breakdowns of the overlap between hex ids and adm2 boundaries
base_folder = r"C:\WBG\Work\S2S"
unq_id = 'ADM2CD_c'
admin_folder = os.path.join(base_folder, "data", 'admin')
hex_folder = os.path.join(base_folder, 'ADM_RESULTS')
out_folder = os.path.join(base_folder, "ADM_ZONAL_RESULTS")
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

hex_files = [os.path.join(hex_folder, f) for f in os.listdir(hex_folder) if f.endswith('.csv')]
tempD = pd.read_csv(hex_files[0])
tempD.head()

Unnamed: 0,ADM2CD_c,cell,overlap,cntry
0,ABW001001,866773607ffffff,0.967263,ABW
1,ABW001001,86677360fffffff,0.493143,ABW
2,ABW001001,866773617ffffff,0.265589,ABW
3,ABW001001,86677361fffffff,0.003464,ABW
4,ABW001001,866773627ffffff,0.392555,ABW


In [3]:
# set up the S2S client
client = Space2StatsClient(verify_ssl=False)
topics = client.get_topics()
cur_fields = client.get_fields()

In [4]:
call_size = 10000

for cur_topic in list(topics.index):
    cur_info = client.get_properties(cur_topic)
    cur_fields = list(cur_info['name'].values[1:])
    country_res = []
    for hex_file in tqdm(hex_files, desc=cur_topic):
        lbl = os.path.basename(hex_file).split('_')[0]
        curD = pd.read_csv(hex_file)
            
        #Loop through the unique hex ids in groups of call_size
        unq_hex = list(set(curD['cell']))
        all_hexes = []
        for i in range(0, len(unq_hex), call_size):
            #tPrint(f"Hexes {i} to {i + call_size} of {len(unq_hex)}: {cur_topic}")
            cur_hex = client.get_summary_by_hexids(unq_hex[i:i + call_size], cur_fields)
            all_hexes.append(cur_hex)
            #time.sleep(1)
        # Merge the results
        cur_hex = pd.concat(all_hexes)    
        country_hex = pd.merge(curD, cur_hex, left_on='cell', right_on='hex_id', how='left')
        results = country_hex.apply(lambda x: x[cur_fields] * x['overlap'], axis=1)
        results[unq_id] = curD[unq_id]
        adm_results = results.groupby(unq_id).sum().reset_index()    
        country_res.append(adm_results)   

    final_data = pd.concat(country_res)
    final_data['ISO3'] = final_data[unq_id].apply(lambda x: x[:3])
    final_data.to_csv(os.path.join(out_folder, f'S2S_adm2_{cur_topic}.csv'), index=False)
    

space2stats_population_2020:   0%|          | 0/231 [00:00<?, ?it/s]

ChunkedEncodingError: ('Connection broken: IncompleteRead(1920829 bytes read, 1547751 more expected)', IncompleteRead(1920829 bytes read, 1547751 more expected))

In [None]:
def get_aggregates(cur_data):
    num_data = cur_data.loc[:,cur_fields] * cur_data['overlap']
    num_data = num_data.sum(axis=0)
    result = pd.Series(num_data)
    result['gID'] = cur_data['gID'].iloc[0]
    return(result)

curD = pd.read_csv(hex_files[2])

country_res = []
for lbl, cur_data in curD.groupby('gID'):
    cur_hex = client.get_summary_by_hexids(list(set(cur_data['cell'])), cur_fields)
    country_hex = pd.merge(curD, cur_hex, left_on='cell', right_on='hex_id', how='left')
    results = country_hex.apply(lambda x: x[cur_fields] * x['overlap'], axis=1).sum(axis=0)
    results['gID'] = lbl
    country_res.append(results)
    tPrint(lbl)


In [None]:
def get_aggregates(cur_data):
    num_data = cur_data.loc[:,cur_fields] * cur_data['overlap']
    num_data = num_data.sum(axis=0)
    result = pd.Series(num_data)
    result['gID'] = cur_data['gID'].iloc[0]
    return(result)

country_hex = pd.merge(curD, cur_hex, left_on='cell', right_on='hex_id', how='left')
country_hex.groupby('gID').apply(get_aggregates)

## The request issue with the S2S database

In [None]:
country_file = hex_files[3]
print(country_file)
curD = pd.read_csv(country_file)
# When trying to fetch all the fields (84) for all the unique hex IDs in Angola (31351)
#### The server request times out after ~ 25 seconds
print(len(set(curD['cell'])))
print(len(cur_fields))
cur_hex = client.get_summary_by_hexids(list(set(curD['cell'])), cur_fields)

In [None]:
# However, when fetching the fields in batches of 10, it works
all_hexes = []
for i in range(0, len(cur_fields), 10):
    tPrint(f"Fetching fields {i} to {i + 10}")
    cur_fields_batch = cur_fields[i:i + 10]
    cur_hex = client.get_summary_by_hexids(list(set(curD['cell'])), cur_fields_batch)
    time.sleep(1)  # to avoid overwhelming the server
