In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `povertymapping` package) are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [1]:

import sys
sys.path.append("../../")
from pathlib import Path
import uuid
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely import wkt

from povertymapping import settings, osm, ookla, nightlights
from povertymapping.osm import OsmDataManager
from povertymapping.ookla import OoklaDataManager
import getpass


import pickle
import os
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

### Set global parameters


In [18]:
# Set country-specific variables
# REGION is the filename prefix for both the populated-grids input file and
# the model-rollout output file.
REGION = 'timor-leste'
# Country name passed to the OSM feature functions; the Geofabrik download in
# the run log uses this same "east-timor" slug.
country_osm = "east-timor"
# Year passed to the Ookla feature functions (fixed and mobile).
ookla_year = 2019
# Year passed (as a string) to the nightlights feature function.
nightlights_year = 2017

In [4]:
# Model to use for prediction
# Pickled model file that is loaded in the "Load Model" section and used via
# `model.predict`. The path is relative, so it is resolved against the
# notebook's current working directory.
MODEL_SAVE_PATH = Path('../notebooks/2023-01-17-initial-model-ph-mm-tl-kh/model_tl.pkl')

### Load Per Country Populated Grids 

In [5]:
%%time
# Read the populated grid cells for REGION from a GeoJSON file in the current
# working directory.
# NOTE(review): presumably this file is produced by an earlier grid-generation
# step — confirm where it comes from before running on a fresh checkout.
admin_grids_gdf = gpd.read_file(f'{REGION}_populated_grids.geojson')

### Set up Data Access

In [6]:
# Instantiate data managers for Ookla and OSM.
# They are passed to the osm.* / ookla.* feature functions further down.
# Per the original author's note, the managers auto-cache requested data in RAM
# so that repeated fetches are faster (NOTE(review): not verifiable from this
# notebook alone).
# Constructed without arguments here; the run logs show downloads landing under
# ~/.geowrangler. The commented-out lines show how to pass a project-local
# cache directory instead.
# osm_data_manager = OsmDataManager(cache_dir=settings.ROOT_DIR/"data/data_cache")
osm_data_manager = OsmDataManager()
# ookla_data_manager = OoklaDataManager(cache_dir=settings.ROOT_DIR/"data/data_cache")
ookla_data_manager = OoklaDataManager()

In [15]:
# Log-in using EOG credentials
# Credentials are taken from the EOG_USER / EOG_PASSWORD environment variables
# when they are set; otherwise the missing value is requested interactively.
username = os.environ.get('EOG_USER')
if username is None:
    username = input('Username?')

password = os.environ.get('EOG_PASSWORD')
if password is None:
    # getpass is used instead of input() for the password prompt
    password = getpass.getpass('Password?')

# set save_token to True so that access token gets stored in ~/.eog_creds/eog_access_token
access_token = nightlights.get_eog_access_token(
    username,
    password,
    save_token=True,
)

Username? butch@thinkingmachin.es
Password? ········


2023-02-20 11:34:03.805 | INFO     | povertymapping.nightlights:get_eog_access_token:48 - Saving access_token to ~/.eog_creds/eog_access_token
2023-02-20 11:34:03.808 | INFO     | povertymapping.nightlights:get_eog_access_token:56 - Adding access token to environmentt var EOG_ACCESS_TOKEN


## Generate Base Features

If this is your first time running this notebook for this specific area, expect a long runtime for the following cells, as they will download and cache the following datasets from the internet. 

- OpenStreetMap Data from Geofabrik
- Ookla Internet Speed Data
- VIIRS nighttime lights data from the Earth Observation Group (EOG)

On subsequent runs, the runtime will be much faster as the data is already stored in your filesystem. 

In [8]:
# Work on a copy so that `admin_grids_gdf` itself is left unchanged; its column
# list is used later to separate the input grid columns from the features.
country_data = admin_grids_gdf.copy()

In [9]:
%%time
# Add in OSM features
# Point-of-interest features for the `country_osm` region, fetched through the
# shared OSM data manager. The result replaces `country_data`; the new columns
# (e.g. `poi_count`, `atm_count`, `atm_nearest`) show up in `country_data.info()`.
country_data = osm.add_osm_poi_features(country_data, country_osm, osm_data_manager)

2023-02-20 11:25:10.603 | INFO     | povertymapping.osm:download_osm_country_data:187 - OSM Data: Cached data available for east-timor at /home/butchtm/.geowrangler/osm/east-timor? False
2023-02-20 11:25:10.605 | INFO     | povertymapping.osm:download_osm_country_data:193 - OSM Data: Re-initializing OSM country cache dir at /home/butchtm/.geowrangler/osm/east-timor...
2023-02-20 11:25:10.608 | INFO     | povertymapping.osm:download_osm_country_data:201 - OSM Data: Downloading Geofabrik zip file...
2023-02-20 11:25:14.181 | INFO     | geowrangler.datasets.utils:urlretrieve:25 - Retrieving https://download.geofabrik.de/asia/east-timor-latest-free.shp.zip into /home/butchtm/.geowrangler/osm/east-timor/east-timor-latest-free.shp.zip


2023-02-20 11:25:19.111 | INFO     | povertymapping.osm:download_osm_country_data:205 - OSM Data: Unzipping the zip file...
2023-02-20 11:25:19.504 | INFO     | povertymapping.osm:download_osm_country_data:212 - OSM Data: Successfully downloaded and cached OSM data for east-timor at /home/butchtm/.geowrangler/osm/east-timor!
2023-02-20 11:25:19.505 | DEBUG    | povertymapping.osm:load_pois:149 - OSM POIs for east-timor being loaded from /home/butchtm/.geowrangler/osm/east-timor/gis_osm_pois_free_1.shp


CPU times: user 2.28 s, sys: 271 ms, total: 2.56 s
Wall time: 10.4 s


In [10]:

%%time
# Add in OSM road features for the same region. The run log shows the OSM data
# already cached by the POI step being reused here.
country_data = osm.add_osm_road_features(country_data, country_osm, osm_data_manager)

2023-02-20 11:25:31.208 | INFO     | povertymapping.osm:download_osm_country_data:187 - OSM Data: Cached data available for east-timor at /home/butchtm/.geowrangler/osm/east-timor? True
2023-02-20 11:25:31.216 | DEBUG    | povertymapping.osm:load_roads:168 - OSM Roads for east-timor being loaded from /home/butchtm/.geowrangler/osm/east-timor/gis_osm_roads_free_1.shp


CPU times: user 814 ms, sys: 38.6 ms, total: 853 ms
Wall time: 854 ms


In [11]:

%%time
# Add in Ookla features
# 'fixed' selects the fixed-network dataset for `ookla_year`; the resulting
# columns are prefixed `fixed_<year>_...`.
country_data = ookla.add_ookla_features(country_data, 'fixed', ookla_year, ookla_data_manager)

2023-02-20 11:25:47.856 | DEBUG    | povertymapping.ookla:load_type_year_data:68 - Contents of data cache: []
2023-02-20 11:25:47.857 | INFO     | povertymapping.ookla:load_type_year_data:83 - Cached data available at /home/butchtm/.geowrangler/ookla/processed/9ffb4fa270d3223649715458afdaedc5.csv? False
2023-02-20 11:25:47.858 | DEBUG    | povertymapping.ookla:load_type_year_data:100 - No cached data found. Processing Ookla data from scratch.
2023-02-20 11:25:48.831 | INFO     | povertymapping.ookla:download_ookla_year_data:173 - Ookla Data: Number of available files for fixed and 2019: 4
2023-02-20 11:25:48.833 | INFO     | povertymapping.ookla:download_ookla_year_data:186 - Ookla Data: Cached data available for fixed and 2019 at /home/butchtm/.geowrangler/ookla/fixed/2019? False
2023-02-20 11:25:48.834 | INFO     | povertymapping.ookla:download_ookla_year_data:192 - Ookla Data: Re-initializing Ookla type/year cache dir at /home/butchtm/.geowrangler/ookla/fixed/2019...
2023-02-20 11:2

CPU times: user 1min 3s, sys: 27 s, total: 1min 30s
Wall time: 3min 34s


In [12]:

%%time
# Same as the previous cell, but for the 'mobile' network dataset; the
# resulting columns are prefixed `mobile_<year>_...`.
country_data = ookla.add_ookla_features(country_data, 'mobile', ookla_year, ookla_data_manager)

2023-02-20 11:29:34.921 | DEBUG    | povertymapping.ookla:load_type_year_data:68 - Contents of data cache: ['9ffb4fa270d3223649715458afdaedc5']
2023-02-20 11:29:34.923 | INFO     | povertymapping.ookla:load_type_year_data:83 - Cached data available at /home/butchtm/.geowrangler/ookla/processed/839f98fa5bce8308017ede7966cc46f8.csv? False
2023-02-20 11:29:34.924 | DEBUG    | povertymapping.ookla:load_type_year_data:100 - No cached data found. Processing Ookla data from scratch.
2023-02-20 11:29:34.926 | INFO     | povertymapping.ookla:download_ookla_year_data:173 - Ookla Data: Number of available files for mobile and 2019: 4
2023-02-20 11:29:34.928 | INFO     | povertymapping.ookla:download_ookla_year_data:186 - Ookla Data: Cached data available for mobile and 2019 at /home/butchtm/.geowrangler/ookla/mobile/2019? False
2023-02-20 11:29:34.930 | INFO     | povertymapping.ookla:download_ookla_year_data:192 - Ookla Data: Re-initializing Ookla type/year cache dir at /home/butchtm/.geowrangle

CPU times: user 51.4 s, sys: 8.63 s, total: 1min
Wall time: 2min 24s


In [20]:

%%time
# Add in the nighttime lights features
# The year is passed as a string. The resulting columns are the `avg_rad_*`
# aggregates seen in `country_data.head()`.
# NOTE(review): presumably relies on the EOG access token obtained in the
# log-in cell — confirm.
country_data = nightlights.generate_nightlights_feature(country_data, str(nightlights_year)) 

2023-02-20 12:12:04.537 | INFO     | povertymapping.nightlights:generate_clipped_raster:365 - Using viirs global file as source raster: /home/butchtm/.geowrangler/nightlights/global/VNL_v21_npp_2017_global_vcmslcfg_c202205302300.average.dat.tif
2023-02-20 12:12:04.540 | INFO     | povertymapping.nightlights:clip_raster:232 - Generating clipped raster file from /home/butchtm/.geowrangler/nightlights/global/VNL_v21_npp_2017_global_vcmslcfg_c202205302300.average.dat.tif to /home/butchtm/.geowrangler/nightlights/clip/48e4902840dc8b91dc0de929a34fe6fd.tif with bounds [124.0356445   -9.51407931 127.30957028  -8.12449133] and buffer 0.1
2023-02-20 12:12:05.358 | INFO     | povertymapping.nightlights:generate_clipped_metadata:393 - Adding metadata.json file /home/butchtm/.geowrangler/nightlights/clip/48e4902840dc8b91dc0de929a34fe6fd.metadata.json


CPU times: user 6.03 s, sys: 370 ms, total: 6.4 s
Wall time: 6.53 s


### Inspect the combined target country data

In [21]:
# Column inventory after feature generation: names, non-null counts and dtypes.
country_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2024 entries, 0 to 2023
Data columns (total 69 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   quadkey                            2024 non-null   object  
 1   shapeName                          2024 non-null   object  
 2   shapeISO                           2024 non-null   object  
 3   shapeID                            2024 non-null   object  
 4   shapeGroup                         2024 non-null   object  
 5   shapeType                          2024 non-null   object  
 6   pop_count                          2024 non-null   float64 
 7   geometry                           2024 non-null   geometry
 8   poi_count                          2024 non-null   float64 
 9   atm_count                          2024 non-null   float64 
 10  atm_nearest                        2024 non-null   float64 
 11  bank_count                         

In [22]:
# Preview the first rows of the combined grid + feature table.
country_data.head()

Unnamed: 0,quadkey,shapeName,shapeISO,shapeID,shapeGroup,shapeType,pop_count,geometry,poi_count,atm_count,...,mobile_2019_mean_avg_d_kbps_mean,mobile_2019_mean_avg_u_kbps_mean,mobile_2019_mean_avg_lat_ms_mean,mobile_2019_mean_num_tests_mean,mobile_2019_mean_num_devices_mean,avg_rad_min,avg_rad_max,avg_rad_mean,avg_rad_std,avg_rad_median
0,31011220203121,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,102.251936,"POLYGON ((124.03564 -9.34067, 124.03564 -9.318...",0.0,0.0,...,,,,,,0.112404,0.179112,0.151455,0.018153,0.149703
1,31011220203123,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,992.492772,"POLYGON ((124.03564 -9.36235, 124.03564 -9.340...",0.0,0.0,...,,,,,,0.145907,0.35022,0.223909,0.051034,0.207525
2,31011220203130,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,118.8976,"POLYGON ((124.05762 -9.34067, 124.05762 -9.318...",0.0,0.0,...,,,,,,0.129712,0.187825,0.152679,0.013343,0.151096
3,31011220203132,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,513.637632,"POLYGON ((124.05762 -9.36235, 124.05762 -9.340...",0.0,0.0,...,,,,,,0.151849,0.301968,0.195619,0.033749,0.190747
4,31011220203310,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,319.14094,"POLYGON ((124.05762 -9.38403, 124.05762 -9.362...",0.0,0.0,...,,,,,,0.131346,0.249318,0.173748,0.027273,0.167033


## Data Preparation

### Split into labels and features

In [31]:
# For features, drop all columns from the input country geometries
# If you need the per-grid input columns (quadkey, admin names, population,
# geometry), refer to country_data
input_grid_cols = admin_grids_gdf.columns

In [23]:
# Keep only the generated feature columns by removing every column that came
# from the input grid file.
features = country_data.drop(columns=input_grid_cols)

# (number of grid cells, number of feature columns)
features.shape

(2024, 61)

In [24]:
# Clean features
# For now, just impute nans with 0
# NOTE(review): this zero-fills every column, including any missing
# `*_nearest` distances and Ookla speed columns, which then read as
# "distance 0" / "0 kbps" — confirm this matches how the model's training
# features were prepared.
# TODO: Implement other cleaning steps
features = features.fillna(0)


### Base Features List

The features can be subdivided by the source dataset

#### OSM
- `<poi_type>_count`: number of points of interest (POI) of a specified type in that area
    - ex. atm_count: number of atms in cluster
    - poi_count: number of all POIs *of all types* in cluster 
- `<poi_type>_nearest`: distance of nearest POI of the specified type
    - ex. atm_nearest: distance of nearest ATM from that cluster
- OSM POI types included: `atm`, `bank`, `bus_station`, `cafe`, `charging_station`, `courthouse`, `dentist` (clinic), `fast_food`, `fire_station`, `food_court`, `fuel` (gas station), `hospital`, `library`, `marketplace`, `pharmacy`, `police`, `post_box`, `post_office`, `restaurant`, `social_facility`, `supermarket`, `townhall`, `road`

#### Ookla 
The network metrics features use the following naming convention:

> `<type>_<year>_<yearly aggregate>_<network variable>_<cluster aggregate>`

- type: kind of network connection measured
    - fixed: connection from fixed sources (landline, fiber, etc.)
    - mobile: connection from mobile devices
- year: Year of source data
- yearly aggregate: How data was aggregated into yearly data
    - Note: Ookla provides data per quarter, so a yearly mean takes the average across 4 quarters
    - For this model, we only aggregate by yearly mean
- network variable: network characteristic described
    - avg_d_kbps: average download speed in kbps
    - avg_u_kbps: average upload speed in kbps
    - avg_lat_ms: average latency in ms
    - num_tests: number of speed tests taken
    - num_devices: number of devices measured
- cluster aggregate: how the data was aggregated per cluster
    - Types: min, mean, max, median, std.
        - For this model: only mean is used 
    - This is calculated using area zonal stats, which weights the average by the intersection of the Ookla tile with the cluster geometry.
    
    
Ex. `fixed_2019_mean_avg_d_kbps_median` takes the cluster median of 2019 yearly average download speed.


#### Nightlights (VIIRS)
All nightlights features are taken as the zonal aggregate of the raster data per cluster

- ex. `avg_rad_mean`: cluster mean of the average radiance 
- aggregations used: min, max, mean, std, median


In [25]:
# Check the cleaned feature table: after the fill step above, every column
# should report a full non-null count.
features.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 2024 entries, 0 to 2023
Data columns (total 61 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   poi_count                          2024 non-null   float64
 1   atm_count                          2024 non-null   float64
 2   atm_nearest                        2024 non-null   float64
 3   bank_count                         2024 non-null   float64
 4   bank_nearest                       2024 non-null   float64
 5   bus_station_count                  2024 non-null   float64
 6   bus_station_nearest                2024 non-null   float64
 7   cafe_count                         2024 non-null   float64
 8   cafe_nearest                       2024 non-null   float64
 9   charging_station_count             2024 non-null   float64
 10  charging_station_nearest           2024 non-null   float64
 11  courthouse_count                   2024 non-null

## Model Predictions

### Load Model

In [26]:
# SECURITY: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
with MODEL_SAVE_PATH.open('rb') as model_file:
    model = pickle.load(model_file)

### Make predictions

In [27]:
%%time
# Score every grid cell and store the result alongside the grid attributes.
# NOTE(review): assumes `features` has the same columns, in the same order, as
# the data the pickled model was trained on — confirm against the training
# notebook.
country_data['Predicted Wealth Index'] = model.predict(features)

CPU times: user 27.1 ms, sys: 8.37 ms, total: 35.4 ms
Wall time: 34.2 ms




### Save predictions

In [28]:
%%time
# Write the grids, all generated features and the predictions to a GeoJSON
# file named after REGION in the current working directory.
country_data.to_file(f'{REGION}_model_rollout.geojson', driver='GeoJSON')

  pd.Int64Index,


CPU times: user 2.77 s, sys: 4.81 ms, total: 2.78 s
Wall time: 2.78 s


### Explore Predictions

In [29]:
# Preview the table again; the last column is now `Predicted Wealth Index`.
country_data.head()

Unnamed: 0,quadkey,shapeName,shapeISO,shapeID,shapeGroup,shapeType,pop_count,geometry,poi_count,atm_count,...,mobile_2019_mean_avg_u_kbps_mean,mobile_2019_mean_avg_lat_ms_mean,mobile_2019_mean_num_tests_mean,mobile_2019_mean_num_devices_mean,avg_rad_min,avg_rad_max,avg_rad_mean,avg_rad_std,avg_rad_median,Predicted Wealth Index
0,31011220203121,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,102.251936,"POLYGON ((124.03564 -9.34067, 124.03564 -9.318...",0.0,0.0,...,,,,,0.112404,0.179112,0.151455,0.018153,0.149703,-62247.926385
1,31011220203123,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,992.492772,"POLYGON ((124.03564 -9.36235, 124.03564 -9.340...",0.0,0.0,...,,,,,0.145907,0.35022,0.223909,0.051034,0.207525,-62448.930269
2,31011220203130,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,118.8976,"POLYGON ((124.05762 -9.34067, 124.05762 -9.318...",0.0,0.0,...,,,,,0.129712,0.187825,0.152679,0.013343,0.151096,-60281.611062
3,31011220203132,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,513.637632,"POLYGON ((124.05762 -9.36235, 124.05762 -9.340...",0.0,0.0,...,,,,,0.151849,0.301968,0.195619,0.033749,0.190747,-60576.990254
4,31011220203310,Nitibe,,TLS-ADM2-3_0_0-B58,TLS,ADM2,319.14094,"POLYGON ((124.05762 -9.38403, 124.05762 -9.362...",0.0,0.0,...,,,,,0.131346,0.249318,0.173748,0.027273,0.167033,-67008.037446


In [32]:
# Interactive map of the predictions: keep only the input grid columns (which
# include the geometry) plus the prediction, and use the prediction as the
# map's value column.
country_data[[*input_grid_cols,'Predicted Wealth Index']].explore(column='Predicted Wealth Index')

In [None]:
# Correlation between grid population and the predicted wealth index.
# The population column in this table is `pop_count` (see the
# `country_data.info()` output); there is no `total_pop` column, so looking it
# up would raise a KeyError.
country_data['pop_count'].corr(country_data['Predicted Wealth Index'])

In [None]:
# Scatter plot of grid population against the predicted wealth index.
# Uses `pop_count`, the population column that actually exists in
# `country_data` (there is no `total_pop` column), and labels the axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(country_data['pop_count'], country_data['Predicted Wealth Index'])
ax.set(
    xlabel='pop_count',
    ylabel='Predicted Wealth Index',
    title='Predicted Wealth Index vs. grid population',
);