Script to download the "2016" Malawi images (the imagery is really from 2018-2019, but the image locations were determined by the script that was run on the 2016 data).

In [1]:
import os 
import pandas as pd
import numpy as np
import glob

In [None]:
# List the contents of the Malawi 2016 LSMS output directory.
!ls output/LSMS/malawi_2016/

In [None]:
# Locate every run that has a candidate-location file. Each match looks like
# 'output/LSMS/<run folder>/candidate_download_locs.txt'.
locs = sorted(glob.glob('output/LSMS/*/candidate_download_locs.txt'))
# The run name is the directory that directly contains the file. Deriving it
# from the path (instead of slicing at a hard-coded character offset and
# searching for '/') stays correct if the base path changes or the platform
# uses a different path separator.
foldernames = [os.path.basename(os.path.dirname(loc)) for loc in locs]
# (run name, path to its candidate-location file) pairs used by later cells.
fls = list(zip(foldernames, locs))
fls

# Create Datasets

Each country in this folder should have:
1. 'nightlights.npy'
2. 'consumptions.npy'
These are aggregated at a cluster level.

This function will add in these values at the cluster level for each image. That is, we now get a dataframe that has cluster nightlight and consumption values for each image.

In [None]:
def create_df(country, df_orig):
    """Attach cluster-level nightlight and consumption values to every image row.

    Parameters
    ----------
    country : str
        Run folder name; 'output/LSMS/<country>/nightlights.npy' and
        'output/LSMS/<country>/consumptions.npy' are loaded from it.
    df_orig : pandas.DataFrame
        One row per candidate image with the columns 'im_lat', 'im_lon',
        'clust_lat' and 'clust_lon'.

    Returns
    -------
    pandas.DataFrame
        `df_orig` merged with one 'nightlight' and one 'consumption' value per
        (clust_lat, clust_lon) pair.

    Notes
    -----
    The loaded arrays are assigned positionally to the grouped clusters, so
    they must contain exactly one entry per unique cluster, in the order the
    groupby yields them (sorted by key by default).
    NOTE(review): that the saved .npy files follow this order cannot be
    verified from this notebook -- confirm against the script that wrote them.
    """
    cluster_keys = ['clust_lat', 'clust_lon']
    nightlights = np.load('output/LSMS/{}/nightlights.npy'.format(country))
    consumptions = np.load('output/LSMS/{}/consumptions.npy'.format(country))

    # count() is only used to get one row per cluster; the resulting
    # per-cluster image counts are discarded below.
    per_cluster = df_orig.groupby(cluster_keys).count()
    per_cluster['nightlight'] = nightlights
    per_cluster['consumption'] = consumptions
    per_cluster = per_cluster.reset_index().drop(columns=['im_lat', 'im_lon'])

    return pd.merge(left=df_orig, right=per_cluster, on=cluster_keys)
# Read each run's space-separated candidate-location file (it has no header
# row) and attach the cluster-level values to it.
col_names = ['im_lat', 'im_lon', 'clust_lat', 'clust_lon']
dfs = [
    create_df(folder, pd.read_csv(path, sep=' ', header=None, names=col_names))
    for folder, path in fls
]
[frame.shape for frame in dfs]

In [None]:
# Most nightlight values are 0. We want to download images with nonzero
# nightlights too, so that the model sees some variety.
# For each run: print the share of rows with nightlight == 0, then the shape.
for df in dfs:
    zero_share = df['nightlight'].eq(0).mean()
    print(zero_share)
    print(df.shape)

In [None]:
# Repeatedly drop 10% of the 0-nightlight images until they make up at most
# 10% of the rows.
def drop_0s(df, frac=0.1, max_zero_frac=0.1, random_state=None):
    """Randomly thin out zero-nightlight rows until they are rare enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'nightlight' column.
    frac : float
        Fraction of the remaining zero-nightlight rows removed per round.
    max_zero_frac : float
        Stop once the share of zero-nightlight rows is at most this value.
    random_state : int or None
        Seed for a private random generator. With None (the default) the
        global numpy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        `df` itself when nothing had to be dropped, otherwise a filtered frame
        that keeps the original index labels. Rows with a nonzero nightlight
        are never removed.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    while len(df) > 0:
        is_zero = df['nightlight'].values == 0
        if is_zero.mean() <= max_zero_frac:
            break
        z_inds = np.flatnonzero(is_zero)
        # Always drop at least one row: int(frac * n) is 0 once fewer than
        # 1/frac zero rows remain, which previously made this loop spin forever.
        n_drop = min(len(z_inds), max(1, int(frac * len(z_inds))))
        # Sample without replacement so exactly n_drop distinct rows go away.
        drop = rng.choice(z_inds, n_drop, replace=False)
        # Filter by position so duplicate index labels cannot drop extra rows.
        keep = np.ones(len(df), dtype=bool)
        keep[drop] = False
        df = df[keep]
    return df
# Thin out the zero-nightlight rows of every run.
dfs = list(map(drop_0s, dfs))

In [None]:
# Share of rows with nightlight below 1, per run: most of the data is still small.
[df['nightlight'].lt(1).mean() for df in dfs]

In [None]:
# The difference this made was only a few images, so I didn't do this. It is
# quite arbitrary, since the number of gridcells with low nightlights in
# non-Malawi runs is quite low anyway.
def drop_under(df, cutoff=1, frac=0.1, random_state=None):
    """Randomly drop a fraction of the rows whose nightlight is <= `cutoff`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'nightlight' column.
    cutoff : float
        Rows with nightlight less than or equal to this value are candidates.
    frac : float
        Fraction of the candidate rows to remove.
    random_state : int or None
        Seed for a private random generator. With None (the default) the
        global numpy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped rows; `df` itself is not modified.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    low_inds = np.flatnonzero(df['nightlight'].values <= cutoff)
    n_drop = min(len(low_inds), int(frac * len(low_inds)))
    keep = np.ones(len(df), dtype=bool)
    if n_drop > 0:
        # Without replacement, so exactly n_drop distinct rows are removed
        # (sampling with replacement could pick the same row several times).
        keep[rng.choice(low_inds, n_drop, replace=False)] = False
    return df[keep]

In [None]:
from sklearn.mixture import GaussianMixture as GMM

# Explore the nightlight distribution of one run (position 4 in `dfs`) by
# fitting a 3-component Gaussian mixture to its nightlight values.
df = dfs[4]
X = df['nightlight'].values.reshape(-1, 1)
# Fix the seed so the fit, and therefore which component gets which label,
# is the same on every run; the cells below read the labels by number.
gmm = GMM(n_components=3, random_state=0).fit(X)
labels = gmm.predict(X)

In [None]:
# Share of rows assigned to each of the three mixture components.
tuple((labels == k).mean() for k in range(3))

In [None]:
# Largest nightlight value that falls into each mixture component.
tuple(df['nightlight'][labels == k].max() for k in range(3))

In [None]:
# Nudge the cutoffs a little to raise the share of rows in class 2; the
# distribution across the classes needs to be somewhat even.
# Mather did this for all, cutoffs seem to work pretty well for all countries.
# Shares of rows below 1.5, in [1.5, 11), and at or above 11:
nl = df['nightlight']
tuple(mask.mean() for mask in (nl < 1.5, (nl >= 1.5) & (nl < 11), nl >= 11))

In [None]:
def _nightlight_bin(values):
    """Map nightlight values to 1 (< 1.5), 2 (1.5 up to < 11) or 3 (>= 11)."""
    low = values < 1.5
    mid = (values >= 1.5) & (values < 11)
    high = values >= 11
    # Each value satisfies at most one condition, so the sum is its bin number.
    return low * 1 + mid * 2 + high * 3


# Add the 'nightlight_bin' column to every run's frame (in place) and rebuild
# the list of frames.
d = []
for df in dfs:
    df['nightlight_bin'] = _nightlight_bin(df['nightlight'])
    d.append(df)
dfs = d

In [None]:
# Write each run's full per-image table to 'output/<run>_guide.csv'.
for idx, frame in enumerate(dfs):
    run_name = fls[idx][0]
    frame.to_csv('output/{}_guide.csv'.format(run_name), index = False)

In [None]:
# An image location can belong to two or more clusters. Each image only needs
# to be downloaded once, which reduces the download size significantly.
# Shape after keeping one row per (im_lat, im_lon) for the frame currently
# bound to `df`:
print(df.drop_duplicates(subset=['im_lat', 'im_lon']).shape)

In [None]:
# One row per unique image location for every run, written to
# 'output/<run>_download.csv'.
ll_download = [frame.drop_duplicates(subset=['im_lat', 'im_lon']) for frame in dfs]
for idx, unique_ims in enumerate(ll_download):
    unique_ims.to_csv('output/{}_download.csv'.format(fls[idx][0]), index=False)

In [2]:
# if the script fails halfway through, you can read this and see what is already downloaded
# remove the images already downloaded from this dataframe and then continue downloading

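############## THIS DOESN'T WORK AND NOT SURE WHY IT IS MISCOUNTING
# (possible cause, unverified: isin() compares floats exactly, and coordinates
# parsed from file names may not equal the ones parsed from the CSV bit-for-bit)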
# run = 'nigeria_2013'
# download = pd.read_csv('output/{}_download.csv'.format(run))
# downloaded = os.listdir('ims_{}/'.format(run))
# lats = []
# longs = []
# for im in downloaded:
# #     print(im)
#     im = im[:-4].split('_')
#     lats.append(float(im[0]))
#     longs.append(float(im[1]))

# downloaded = pd.DataFrame.from_dict({'im_lat': lats, 'im_lon': longs})

# a = download.set_index(['im_lat', 'im_lon']).index
# b = downloaded.set_index(['im_lat', 'im_lon']).index

# mask = ~a.isin(b)
# download = download.loc[mask].reset_index(drop=True)
# df = download; c = run
# df

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,nightlight_bin
0,8.616667,2.924999,8.654959,2.969085,0.0,2.036260,1
1,8.675000,2.933333,8.654959,2.969085,0.0,2.036260,1
2,8.666667,2.941666,8.654959,2.969085,0.0,2.036260,1
3,8.650000,2.941666,8.654959,2.969085,0.0,2.036260,1
4,8.633334,2.941666,8.654959,2.969085,0.0,2.036260,1
...,...,...,...,...,...,...,...
16898,13.058334,13.833333,13.093234,13.814570,0.0,2.792612,1
16899,13.083334,13.841666,13.093234,13.814570,0.0,2.792612,1
16900,13.141667,13.849999,13.093234,13.814570,0.0,2.792612,1
16901,13.108334,13.849999,13.093234,13.814570,0.0,2.792612,1


# Download Images

Now we actually download images

In [3]:
"""Interface for downloading aerial imagery from Google Static Maps API.
- Get an API key at https://developers.google.com/maps/documentation/maps-static/intro
"""

import requests
from PIL import Image
from io import BytesIO

class ImageryDownloader:
    """Fetches satellite images from the Google Static Maps API."""

    def __init__(self, access_token, timeout=30):
        """Initializes the object with an access token.

        Parameters
        ----------
        access_token : str
            API key that is placed in the `key` query parameter.
        timeout : float
            Seconds to wait for the server before giving up on a request.
        """
        self.access_token = access_token
        self.timeout = timeout
        # Placeholders, in order: latitude, longitude, zoom, API key.
        self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}'

    def download(self, lat, long, zoom):
        """Downloads the image centered on (lat, long) at the given zoom level.

        Returns
        -------
        PIL.Image.Image
            The decoded image.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status (for example a rejected
            key); this surfaces the real cause instead of an image-decoding
            error on the error body.
        requests.Timeout
            If the server does not answer within `timeout` seconds.
        """
        res = requests.get(
            self.url.format(lat, long, zoom, self.access_token),
            # Without a timeout a single stalled request blocks the caller forever.
            timeout=self.timeout,
        )
        res.raise_for_status()
        image = Image.open(BytesIO(res.content))

        return image

In [4]:
# Read the API key from the first line of api_key.txt. strip() removes the
# trailing newline that readlines()[0] would otherwise leave in the key (and
# therefore in every request URL).
with open('api_key.txt', 'r') as key_file:
    access = key_file.readline().strip()
if not access:
    raise ValueError('api_key.txt must contain the API key on its first line')

im_downloader = ImageryDownloader(access)

In [None]:
# Make sure an image output directory 'ims_<run>' exists for every run.
# NOTE: `f` and `l` stay bound to the last entry of `fls` after this loop;
# use `c` (set below) when you mean the selected run.
for f,l in fls:
    os.makedirs('ims_{}'.format(f), exist_ok=True)
#  Change selection to download
# `selection` is the position in `fls` / `ll_download` of the run to download.
selection =4
# Folder name of the selected run.
c = fls[selection][0]
# De-duplicated image table of the selected run; the download loop iterates it.
df = ll_download[selection]
# Display the shapes of the full and the de-duplicated tables for every run.
[df.shape for df in dfs],[ll.shape for ll in ll_download]

In [None]:
print(c)
print('gathering images for im_lats and im_lons in this selection')

zoom = 16
# Save into the directory of the *selected* run `c`. The previous code used a
# leftover loop variable here, which only pointed at the right directory when
# the selected run happened to be the last one.
out_dir = 'ims_{}'.format(c)
os.makedirs(out_dir, exist_ok=True)

im_names = []
# Count rows with enumerate: the frame's index labels are not contiguous
# (rows were dropped earlier), so they are useless as a progress counter.
for n_done, (_idx, r) in enumerate(df.iterrows()):
    lat = r.im_lat
    long = r.im_lon
    name = str(lat) + '_' + str(long)
    try:
        im = im_downloader.download(lat, long, zoom)
        im.save('{}/{}.png'.format(out_dir, name))
        im_names.append(name + '.png')
    except Exception as err:
        # Best effort: record the failure and keep going. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
        print('\nfailed to download {}: {!r}'.format(name, err))
        im_names.append(np.nan)
    if n_done % 100 == 0:
        print(n_done, end=', ')

# File name of each saved image, or NaN where the download failed.
df['images'] = im_names
#     df.to_csv('output/{}_download_info.csv'.format(c), index=False)

nigeria_2013
gathering images for im_lats and im_lons in this selection
0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 