<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prerequisites" data-toc-modified-id="Prerequisites-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prerequisites</a></span></li><li><span><a href="#Imports-and-Constants" data-toc-modified-id="Imports-and-Constants-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports and Constants</a></span><ul class="toc-item"><li><span><a href="#Prepare-the-dataset" data-toc-modified-id="Prepare-the-dataset-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Prepare the dataset</a></span></li><li><span><a href="#Mask-out-cloud,-snow,-and-cloud-shadow" data-toc-modified-id="Mask-out-cloud,-snow,-and-cloud-shadow-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Mask out cloud, snow, and cloud shadow</a></span></li><li><span><a href="#Multi-yr-composite" data-toc-modified-id="Multi-yr-composite-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Multi-yr composite</a></span></li><li><span><a href="#Add-nightlight" data-toc-modified-id="Add-nightlight-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Add nightlight</a></span></li><li><span><a href="#Add-topography" data-toc-modified-id="Add-topography-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Add topography</a></span></li><li><span><a href="#Export-TF-Records" data-toc-modified-id="Export-TF-Records-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Export TF Records</a></span></li></ul></li></ul></div>

# Prerequisites

1. Register a Gmail email address at [https://code.earthengine.google.com](https://code.earthengine.google.com). This process may take a couple of days. Without registration, the `ee.Initialize()` command below will throw an error message.
2. Within your conda environment, run `earthengine authenticate` and follow the prompt. For more instructions, see [https://developers.google.com/earth-engine/python_install-conda.html](https://developers.google.com/earth-engine/python_install-conda.html).

## Instructions

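This file must be run twice: once for DHS and once for LSMS. Adjust the parameters below based on your personal Google Cloud Platform account. Keep in mind that CSV_INPUT_PATH, DHS_ASSET_ID, GCS_FILE_PREFIX, and IS_DHS will require different values when exporting DHS data than when exporting LSMS data. In particular, if DHS_ASSET_ID is left unchanged for the LSMS run, the existing DHS asset is found, the upload is skipped, and the DHS locations are exported again.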

## Adjust Parameters

In [1]:
# Path to the survey CSV, located in this repo.
CSV_INPUT_PATH = "../data/download_locations/dhs_clusters.csv"
# ID of the survey asset in your GEE account.
DHS_ASSET_ID = "users/AnthonyP/sustain_lab/dhs_survey_data"
# A GCS bucket you own that will house the exported data.
GCS_BUCKET = "sustainlab-common"
# Prefix for exported file names; separate folders for DHS and LSMS are recommended.
GCS_FILE_PREFIX = "Poverty_tfrecords/"
# True when exporting DHS data, False when exporting LSMS data.
IS_DHS = True

In [2]:
# Suffix appended to exported file names, chosen by survey type.
file_suffix = '_dhslocs_' if IS_DHS else '_lsmslocs_'

# Imports and Constants

In [3]:
import ee

import optical_datasources as optx
import imgtools
import ee_tf_exports as tf
from ee_assets import upload_geojson_to_gee, asset_exists

In [4]:
# Connect to Earth Engine. As noted in the prerequisites, this raises an error
# if the account has not been registered.
ee.Initialize()

## Prepare the dataset

In [5]:
# Upload the survey CSV to your Google Earth Engine account under DHS_ASSET_ID.
# The upload is skipped when an asset with that ID already exists.
if not asset_exists(DHS_ASSET_ID):
    upload_geojson_to_gee(CSV_INPUT_PATH, DHS_ASSET_ID)

In [6]:
# Load the survey asset as an Earth Engine feature collection.
dhs = ee.FeatureCollection(DHS_ASSET_ID)

In [7]:
# Split the survey locations into three groups by the 'year' property.

# [2009-2011] inclusive
dhs_oldest = dhs.filter(ee.Filter.gt('year', 2008)).filter(ee.Filter.lte('year', 2011))

# [2012-2014] inclusive
dhs_middle = dhs.filter(ee.Filter.gt('year', 2011)).filter(ee.Filter.lte('year', 2014))

# [2015-onwards]: there is no upper bound here, although later cells label
# this group "2015-17" and pair it with 2015-2017 imagery.
dhs_recent = dhs.filter(ee.Filter.gt('year', 2014))

In [8]:
# Report how many survey locations fall into each year group.
for label, group in [('Oldest size:', dhs_oldest),
                     ('Middle size:', dhs_middle),
                     ('Recent size:', dhs_recent)]:
    print(label, group.size().getInfo())

('Oldest size:', 7129)
('Middle size:', 8499)
('Recent size:', 4041)


In [9]:
# Fetch the distinct values of the 'country' property and display them.
countries = (
    dhs.distinct('country')
    .aggregate_array('country')
    .getInfo()
)
display(countries)

[u'mozambique',
 u'lesotho',
 u'uganda',
 u'ethiopia',
 u'senegal',
 u'malawi',
 u'tanzania',
 u'nigeria',
 u'zimbabwe',
 u'burkina_faso',
 u'rwanda',
 u'cameroon',
 u'angola',
 u'cote_d_ivoire',
 u'mali',
 u'benin',
 u'guinea',
 u'zambia',
 u'sierra_leone',
 u'togo',
 u'democratic_republic_of_congo',
 u'kenya',
 u'ghana']

In [10]:
# Preview the 2009-11 group: print a file-name stem and the number of survey
# locations for each country. The '_dhslocs' suffix is hard-coded here.
countries = dhs_oldest.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    in_country = ee.Filter.eq('country', country)
    fname = 'lx_median_2009-11_' + country + '_dhslocs'
    print(fname)
    print(dhs_oldest.filter(in_country).size().getInfo())

lx_median_2009-11_mozambique_dhslocs
879
lx_median_2009-11_lesotho_dhslocs
395
lx_median_2009-11_uganda_dhslocs
570
lx_median_2009-11_ethiopia_dhslocs
571
lx_median_2009-11_senegal_dhslocs
385
lx_median_2009-11_malawi_dhslocs
827
lx_median_2009-11_tanzania_dhslocs
1031
lx_median_2009-11_nigeria_dhslocs
239
lx_median_2009-11_zimbabwe_dhslocs
393
lx_median_2009-11_burkina_faso_dhslocs
541
lx_median_2009-11_rwanda_dhslocs
492
lx_median_2009-11_cameroon_dhslocs
576
lx_median_2009-11_angola_dhslocs
230


In [11]:
# Preview the 2012-14 group: print a file-name stem and the number of survey
# locations for each country. The '_dhslocs' suffix is hard-coded here.
countries = dhs_middle.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    in_country = ee.Filter.eq('country', country)
    fname = 'lx_median_2012-14_' + country + '_dhslocs'
    print(fname)
    print(dhs_middle.filter(in_country).size().getInfo())

lx_median_2012-14_senegal_dhslocs
200
lx_median_2012-14_cote_d_ivoire_dhslocs
341
lx_median_2012-14_mali_dhslocs
413
lx_median_2012-14_benin_dhslocs
746
lx_median_2012-14_guinea_dhslocs
300
lx_median_2012-14_malawi_dhslocs
280
lx_median_2012-14_zambia_dhslocs
719
lx_median_2012-14_sierra_leone_dhslocs
435
lx_median_2012-14_nigeria_dhslocs
889
lx_median_2012-14_togo_dhslocs
330
lx_median_2012-14_democratic_republic_of_congo_dhslocs
492
lx_median_2012-14_kenya_dhslocs
1585
lx_median_2012-14_uganda_dhslocs
208
lx_median_2012-14_lesotho_dhslocs
399
lx_median_2012-14_ghana_dhslocs
422
lx_median_2012-14_burkina_faso_dhslocs
248
lx_median_2012-14_rwanda_dhslocs
492


In [12]:
# Preview the 2015-17 group: print a file-name stem and the number of survey
# locations for each country. The '_dhslocs' suffix is hard-coded here.
countries = dhs_recent.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    in_country = ee.Filter.eq('country', country)
    fname = 'lx_median_2015-17_' + country + '_dhslocs'
    print(fname)
    print(dhs_recent.filter(in_country).size().getInfo())

lx_median_2015-17_kenya_dhslocs
245
lx_median_2015-17_tanzania_dhslocs
608
lx_median_2015-17_nigeria_dhslocs
322
lx_median_2015-17_angola_dhslocs
625
lx_median_2015-17_zimbabwe_dhslocs
400
lx_median_2015-17_malawi_dhslocs
850
lx_median_2015-17_mali_dhslocs
177
lx_median_2015-17_ethiopia_dhslocs
622
lx_median_2015-17_ghana_dhslocs
192


## Mask out cloud, snow, and cloud shadow

In [13]:
def decode_qamask(scene):
    '''
    Split the 'pixel_qa' band of a scene into one self-masked band per flag.

    Pixel QA Bit Flags
    Bit  Attribute
    0    Fill
    1    Clear
    2    Water
    3    Cloud Shadow
    4    Snow
    5    Cloud

    Returns an image with these bands, in order:
    - pxqa_clear:       nonzero where the Clear bit is set
    - pxqa_water:       nonzero where the Water bit is set
    - pxqa_cloudshadow: nonzero where the Cloud Shadow bit is NOT set
    - pxqa_snow:        nonzero where the Snow bit is NOT set
    - pxqa_cloud:       nonzero where the Cloud bit is NOT set

    Each band is additionally masked by its own values.
    '''
    pixel_qa = scene.select('pixel_qa')

    def flag_band(bit, keep_when_set, band_name):
        # Isolate a single bit, test whether it is set (or unset), then mask
        # the result with itself and give it its output name.
        isolated = pixel_qa.bitwiseAnd(1 << bit)
        keep = isolated.neq(0) if keep_when_set else isolated.eq(0)
        return keep.updateMask(keep).rename([band_name])

    return ee.Image.cat([
        flag_band(1, True, 'pxqa_clear'),
        flag_band(2, True, 'pxqa_water'),
        flag_band(3, False, 'pxqa_cloudshadow'),
        flag_band(4, False, 'pxqa_snow'),
        flag_band(5, False, 'pxqa_cloud'),
    ])

def mask_qaclear(img):
    '''
    Apply the cloud-shadow, snow, and cloud masks from the QA band to an image.

    Args
    - img: image containing a 'pixel_qa' band (it is passed to decode_qamask)

    Returns
    - the input image with the three masks applied via updateMask
    '''
    # Decode the QA band once and reuse the result for every mask.
    qa_masks = decode_qamask(img)
    cloudshadow_mask = qa_masks.select('pxqa_cloudshadow')
    snow_mask = qa_masks.select('pxqa_snow')
    cloud_mask = qa_masks.select('pxqa_cloud')

    # Only these three masks are applied; the 'pxqa_clear' and 'pxqa_water'
    # bands produced by decode_qamask are not used here.
    return img.updateMask(cloudshadow_mask).updateMask(snow_mask).updateMask(cloud_mask)

## Multi-yr composite

In [14]:
# Band names selected from the Landsat collections before taking the median
# composite; also the first entries of the exported band list.
selbands = ['BLUE', 'GREEN', 'RED', 'NIR', 'SWIR1', 'SWIR2', 'TEMP1'] 

In [15]:
# 2015-17 composite: masked Landsat scenes over the recent survey locations,
# reduced to a per-pixel median and reprojected to EPSG:3857 at scale 30.
roi_recent = dhs_recent.geometry()
srcoll_recent = (
    optx.LandsatSR(roi_recent, '2015-1-1', '2017-12-31')
    .merged
    .map(mask_qaclear)
)
srmedian_recent = imgtools.add_latlon(
    srcoll_recent.select(selbands).median().reproject('EPSG:3857', None, 30)
)

In [16]:
# 2012-14 composite: masked Landsat scenes over the middle survey locations,
# reduced to a per-pixel median and reprojected to EPSG:3857 at scale 30.
roi_middle = dhs_middle.geometry()
srcoll_middle = (
    optx.LandsatSR(roi_middle, '2012-1-1', '2014-12-31')
    .merged
    .map(mask_qaclear)
)
srmedian_middle = imgtools.add_latlon(
    srcoll_middle.select(selbands).median().reproject('EPSG:3857', None, 30)
)

In [17]:
# 2009-11 composite: masked Landsat scenes over the oldest survey locations,
# reduced to a per-pixel median and reprojected to EPSG:3857 at scale 30.
roi_oldest = dhs_oldest.geometry()
srcoll_oldest = (
    optx.LandsatSR(roi_oldest, '2009-1-1', '2011-12-31')
    .merged
    .map(mask_qaclear)
)
srmedian_oldest = imgtools.add_latlon(
    srcoll_oldest.select(selbands).median().reproject('EPSG:3857', None, 30)
)

## Add nightlight

In [18]:
# Nightlight image collections: VIIRS is used below for the 2012-14 and
# 2015-17 groups, DMSP for the 2009-11 group.
viirs = ee.ImageCollection("NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG")
dmsp = ee.ImageCollection("NOAA/DMSP-OLS/CALIBRATED_LIGHTS_V4")

In [19]:
# Output name for the single nightlights band.
nlband = ['NIGHTLIGHTS']

# For each period: filter the collection by date, take the per-pixel median,
# then keep the first band and rename it to NIGHTLIGHTS.
viirs_recent = (
    viirs.filterDate('2015-1-1', '2017-12-31')
    .median()
    .select([0], nlband)
)
viirs_mid = (
    viirs.filterDate('2012-1-1', '2014-12-31')
    .median()
    .select([0], nlband)
)
dmsp_oldest = (
    dmsp.filterDate('2009-1-1', '2011-12-31')
    .median()
    .select([0], nlband)
)

In [20]:
# Append the nightlights band to each composite, reprojected to EPSG:3857 at
# scale 30. overwrite=True keeps this cell idempotent: re-running it replaces
# the existing NIGHTLIGHTS band instead of adding a renamed duplicate.
srmedian_recent = srmedian_recent.addBands(
    viirs_recent.reproject('EPSG:3857', None, 30), overwrite=True)
srmedian_middle = srmedian_middle.addBands(
    viirs_mid.reproject('EPSG:3857', None, 30), overwrite=True)
srmedian_oldest = srmedian_oldest.addBands(
    dmsp_oldest.reproject('EPSG:3857', None, 30), overwrite=True)

## Add topography

In [21]:
# Elevation image from which the terrain bands below are derived.
dem = ee.Image("USGS/SRTMGL1_003")

In [22]:
# Output names for the elevation, slope, and aspect bands.
tbands = ['ELEV', 'SLO', 'ASP']

# Run the terrain algorithm on the DEM, then select and rename its
# elevation/slope/aspect bands.
topogr = (
    ee.Algorithms.Terrain(dem)
    .select(['elevation', 'slope', 'aspect'], tbands)
)

In [23]:
# Append the topography bands to each composite, reprojected to EPSG:3857 at
# scale 30. overwrite=True keeps this cell idempotent: re-running it replaces
# the existing ELEV/SLO/ASP bands instead of adding renamed duplicates.
topogr_reprojected = topogr.reproject('EPSG:3857', None, 30)
srmedian_recent = srmedian_recent.addBands(topogr_reprojected, overwrite=True)
srmedian_middle = srmedian_middle.addBands(topogr_reprojected, overwrite=True)
srmedian_oldest = srmedian_oldest.addBands(topogr_reprojected, overwrite=True)

## Export TF Records

In [24]:
# Property names of the first feature in the recent group; they are appended
# to the exported band list in the export cells below.
dhsinfo = dhs_recent.first().propertyNames().getInfo()

In [25]:
# Names to export; the list is the same for every country, so build it once.
# 'LAT'/'LON' are presumably the bands added by imgtools.add_latlon -- TODO confirm.
bands = selbands + dhsinfo + ['LAT', 'LON'] + tbands + nlband

# Export one set of patches per country for the 2015-17 group.
countries = dhs_recent.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    seldhs = dhs_recent.filter(ee.Filter.eq('country', country))
    fname = 'lx_median_2015-17_' + country + file_suffix
    print(fname)

    # 30 and 127 are presumably the pixel scale and the patch radius -- TODO
    # confirm against ee_tf_exports.get_array_patches, along with the meaning
    # of the positional True/True/None arguments.
    test = tf.get_array_patches(
        srmedian_recent, 30, 127, seldhs,
        True, True, bands, None,
        GCS_BUCKET,
        GCS_FILE_PREFIX,
        fname)

lx_median_2015-17_kenya_dhslocs_
lx_median_2015-17_tanzania_dhslocs_
lx_median_2015-17_nigeria_dhslocs_
lx_median_2015-17_angola_dhslocs_
lx_median_2015-17_zimbabwe_dhslocs_
lx_median_2015-17_malawi_dhslocs_
lx_median_2015-17_mali_dhslocs_
lx_median_2015-17_ethiopia_dhslocs_
lx_median_2015-17_ghana_dhslocs_


In [26]:
# Names to export; the list is the same for every country, so build it once.
# 'LAT'/'LON' are presumably the bands added by imgtools.add_latlon -- TODO confirm.
bands = selbands + dhsinfo + ['LAT', 'LON'] + tbands + nlband

# Export one set of patches per country for the 2012-14 group.
countries = dhs_middle.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    seldhs = dhs_middle.filter(ee.Filter.eq('country', country))
    fname = 'lx_median_2012-14_' + country + file_suffix
    print(fname)

    # 30 and 127 are presumably the pixel scale and the patch radius -- TODO
    # confirm against ee_tf_exports.get_array_patches, along with the meaning
    # of the positional True/True/None arguments.
    test = tf.get_array_patches(
        srmedian_middle, 30, 127, seldhs,
        True, True, bands, None,
        GCS_BUCKET,
        GCS_FILE_PREFIX,
        fname)

lx_median_2012-14_senegal_dhslocs_
lx_median_2012-14_cote_d_ivoire_dhslocs_
lx_median_2012-14_mali_dhslocs_
lx_median_2012-14_benin_dhslocs_
lx_median_2012-14_guinea_dhslocs_
lx_median_2012-14_malawi_dhslocs_
lx_median_2012-14_zambia_dhslocs_
lx_median_2012-14_sierra_leone_dhslocs_
lx_median_2012-14_nigeria_dhslocs_
lx_median_2012-14_togo_dhslocs_
lx_median_2012-14_democratic_republic_of_congo_dhslocs_
lx_median_2012-14_kenya_dhslocs_
lx_median_2012-14_uganda_dhslocs_
lx_median_2012-14_lesotho_dhslocs_
lx_median_2012-14_ghana_dhslocs_
lx_median_2012-14_burkina_faso_dhslocs_
lx_median_2012-14_rwanda_dhslocs_


In [27]:
# Names to export; the list is the same for every country, so build it once.
# 'LAT'/'LON' are presumably the bands added by imgtools.add_latlon -- TODO confirm.
bands = selbands + dhsinfo + ['LAT', 'LON'] + tbands + nlband

# Export one set of patches per country for the 2009-11 group.
countries = dhs_oldest.distinct('country').aggregate_array('country').getInfo()
for country in countries:
    seldhs = dhs_oldest.filter(ee.Filter.eq('country', country))
    fname = 'lx_median_2009-11_' + country + file_suffix
    print(fname)

    # 30 and 127 are presumably the pixel scale and the patch radius -- TODO
    # confirm against ee_tf_exports.get_array_patches, along with the meaning
    # of the positional True/True/None arguments.
    test = tf.get_array_patches(
        srmedian_oldest, 30, 127, seldhs,
        True, True, bands, None,
        GCS_BUCKET,
        GCS_FILE_PREFIX,
        fname)

lx_median_2009-11_mozambique_dhslocs_
lx_median_2009-11_lesotho_dhslocs_
lx_median_2009-11_uganda_dhslocs_
lx_median_2009-11_ethiopia_dhslocs_
lx_median_2009-11_senegal_dhslocs_
lx_median_2009-11_malawi_dhslocs_
lx_median_2009-11_tanzania_dhslocs_
lx_median_2009-11_nigeria_dhslocs_
lx_median_2009-11_zimbabwe_dhslocs_
lx_median_2009-11_burkina_faso_dhslocs_
lx_median_2009-11_rwanda_dhslocs_
lx_median_2009-11_cameroon_dhslocs_
lx_median_2009-11_angola_dhslocs_
