# Modelling within-cell heterogeneity with satellite imagery

#### Modules

In [None]:
import pandas as pd  # for handling dataframes
import geopandas as gpd  # for handling geospatial dataframes
from geopandas import GeoDataFrame
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt  # for saving plots
import rasterio
import rasterio.mask
from rasterio.windows import Window
import folium
import math
import numpy as np
import itertools
import gdal
import multiprocessing as mp
from rasterstats import point_query
import time
import shapely
import scipy.spatial as spatial
from geovoronoi import voronoi_regions_from_coords
from tqdm import tqdm
import gc
from math import radians, cos, sin, asin, sqrt
import weightedstats as ws
import robustats as rs

#### Paths & Files

In [None]:
# Directory holding intermediate results produced by earlier notebooks.
path_data = './midsave/'

# Raw inputs.
file_antenna = './input/SITE_ARR_LONLAT_EXACT.csv'
file_guf = './input/senegal.tif'
file_wpg = './input/sen_ppp_2013.tif'

# Intermediate files, all located under `path_data`.
file_commune_map = f'{path_data}shape_com.shp'
file_cdr = f'{path_data}NUTS5_tower.csv'
file_bandicoot = f'{path_data}bandicoot_tower.csv'
file_map_pixel = f'{path_data}map_pixel.shp'
file_map_grid = f'{path_data}map_grid.shp'
file_map_pixel_knn = f'{path_data}w_knn.csv'
file_map_grid_knn = f'{path_data}w_grid.csv'
file_map_pixel_adm = f'{path_data}w_knn_adm.csv'
file_map_grid_adm = f'{path_data}w_grid_adm.csv'
file_map_voronoi = f'{path_data}map_voronoi.csv'
file_xwalk_points = f'{path_data}xwalk_points.csv'

#### Notebook options

In [None]:
# IPython magics: (re)load the autoreload extension, switch it to mode 2,
# and render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Data

#### Map

In [None]:
# Load the commune shapefile and harmonise its column names.
map_commune = gpd.read_file(file_commune_map).rename(
    columns={"SP_ID": "MAP_ID", "Shape_Area": "SUPERFICIE", 'CCOD_CRCA': 'COD_ENTITE'})

# Reproject to EPSG:4326. `to_crs` returns a new GeoDataFrame, so the result must
# replace the frame itself; assigning it to `.crs` does not reproject anything.
map_commune = map_commune.to_crs({'init': 'epsg:4326'})

# Manual corrections of two COD_ENTITE codes, selected via a second code column.
map_commune.loc[(map_commune.COD_ENTITE == '02120201') & (map_commune.COD_CRCA == '05'), 'COD_ENTITE'] = '02120205'
map_commune.loc[(map_commune.COD_ENTITE == '02220101') & (map_commune.CCOD_CAV == '022202'), 'COD_ENTITE'] = '02220201'

In [None]:
# Store the commune identifier as an unsigned 16-bit integer.
map_commune['MAP_ID'] = map_commune['MAP_ID'].astype('uint16')

# Aggregate Mobile Phone Data to commune-level

Load Bandicoots

In [None]:
# Read both tower-level tables and rename their `tower` key to `site_id`.
bandicoot_raw = pd.read_csv(file_bandicoot)
bandicoot_raw = bandicoot_raw.rename(columns={"tower": "site_id"})
cdr_raw = pd.read_csv(file_cdr)
cdr_raw = cdr_raw.rename(columns={"tower": "site_id"})

In [None]:
# Join key followed by the CDR columns that are aggregated with the mean.
sum_columns = [
    'site_id',
    'calls_ratio',
    'sms_ratio',
    'vol_ratio',
]

In [None]:
# Features aggregated with the mean: the ratio columns of the CDR table
# left-joined with all bandicoot columns on `site_id`.
cdr_mean = pd.merge(cdr_raw[sum_columns], bandicoot_raw, on='site_id', how='left')

In [None]:
# Preview the first three rows.
cdr_mean.head(n=3)

In [None]:
# Features aggregated with the median: every CDR column except the ratio
# columns (the `site_id` key is kept).
cdr_median = cdr_raw.drop(sum_columns[1:], axis=1)

In [None]:
# Preview the first three rows.
cdr_median.head(n=3)

# Point to Polygon

In [None]:
# Load the pre-computed point crosswalk; both names refer to the same frame.
xwalk_points = pd.read_csv(file_xwalk_points)
sim_point_sum = xwalk_points

Take the mean of some features (as done in the paper)

In [None]:
# Attach the mean-type features to the crosswalk, then average them per MAP_ID.
sim_point_mean = pd.merge(sim_point_sum, cdr_mean, on='site_id', how='left')
sim_point_mean_adm = (
    sim_point_mean
    .groupby('MAP_ID')
    .mean()
    .reset_index()
    .drop(columns=['site_id'])
)

Take the median of the other features (as done in the paper)

In [None]:
# Attach the median-type features to the crosswalk, then take the median per MAP_ID.
sim_point_median = pd.merge(sim_point_sum, cdr_median, on='site_id', how='left')
sim_point_median_adm = (
    sim_point_median
    .groupby('MAP_ID')
    .median()
    .reset_index()
    .drop(columns=['site_id'])
)

Merge to one df

In [None]:
# Combine mean- and median-aggregated features into one frame per MAP_ID.
sim_point_adm = pd.merge(sim_point_mean_adm, sim_point_median_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the point-to-polygon result without the index column.
sim_point_adm.to_csv(path_or_buf='./midsave/sim_point_adm.csv', index=False)

# Voronoi (currently we do not assume site-level 'activity' weights)

Load required pre-computed datasets

In [None]:
# Load the pre-computed Voronoi intersection table and fix the dtypes of its
# id and weight columns.
map_intersection_voronoi = pd.read_csv(file_map_voronoi)
map_intersection_voronoi = map_intersection_voronoi.astype({
    'intersection_id': 'uint16',
    'site_id': 'uint16',
    'MAP_ID': 'uint16',
    'w_geo_adm': 'float64',
    'w_geo_ant': 'float64',
    'w_guf_adm': 'float64',
    'w_guf_ant': 'float64',
})

Mean

In [None]:
# Join the mean-type features onto the Voronoi intersections and drop incomplete rows.
sim_voronoi_mean = (
    map_intersection_voronoi[['intersection_id', 'site_id', 'MAP_ID', 'w_geo_ant']]
    .merge(cdr_mean, on='site_id', how='left')
    .dropna()
)
# Weighted average of every column per MAP_ID, using `w_geo_ant` as weights;
# the helper columns are removed afterwards.
sim_voronoi_mean_adm = (
    sim_voronoi_mean
    .groupby('MAP_ID')
    .agg(lambda col: np.average(col, weights=sim_voronoi_mean.loc[col.index, "w_geo_ant"]))
    .reset_index()
    .drop(columns=['intersection_id', 'site_id', 'w_geo_ant'])
)

Median

In [None]:
# Join the median-type features onto the Voronoi intersections and drop incomplete rows.
sim_voronoi_median = (
    map_intersection_voronoi[['intersection_id', 'site_id', 'MAP_ID', 'w_geo_ant']]
    .merge(cdr_median, on='site_id', how='left')
    .dropna()
)
# Weighted median of every column per MAP_ID, using `w_geo_ant` as weights;
# the helper columns are removed afterwards.
sim_voronoi_median_adm = (
    sim_voronoi_median
    .groupby('MAP_ID')
    .agg(lambda col: rs.weighted_median(col, weights=sim_voronoi_median.loc[col.index, "w_geo_ant"]))
    .reset_index()
    .drop(columns=['intersection_id', 'site_id', 'w_geo_ant'])
)

Merge

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_voronoi_adm = pd.merge(sim_voronoi_mean_adm, sim_voronoi_median_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the Voronoi result without the index column.
sim_voronoi_adm.to_csv(path_or_buf='./midsave/sim_voronoi_adm.csv', index=False)

# Augmented Voronoi (GUF)

Mean

In [None]:
# Join the mean-type features onto the intersections (with `w_guf_ant` weights),
# discard the site key and drop incomplete rows.
sim_augvoronoi_mean = (
    map_intersection_voronoi[['site_id', 'MAP_ID', 'w_guf_ant']]
    .merge(cdr_mean, on='site_id', how='left')
    .drop(columns=['site_id'])
    .dropna()
)

Filter out those communes without any settlement

In [None]:
# Total `w_guf_ant` weight per MAP_ID.
zero_weight = (
    sim_augvoronoi_mean
    .groupby('MAP_ID')['w_guf_ant']
    .sum()
    .reset_index()
)

In [None]:
# Keep only rows whose MAP_ID has a non-zero total `w_guf_ant` weight.
sim_augvoronoi_mean = sim_augvoronoi_mean.loc[
    ~sim_augvoronoi_mean['MAP_ID'].isin(zero_weight.loc[zero_weight['w_guf_ant'] == 0, 'MAP_ID'])
]

In [None]:
# Weighted average of every column per MAP_ID, using `w_guf_ant` as weights;
# the weight column itself is removed afterwards.
sim_augvoronoi_mean_adm = (
    sim_augvoronoi_mean
    .dropna()
    .groupby('MAP_ID')
    .agg(lambda col: np.average(col, weights=sim_augvoronoi_mean.loc[col.index, "w_guf_ant"]))
    .reset_index()
    .drop(columns=['w_guf_ant'])
)

Median

In [None]:
# Join the median-type features onto the intersections and drop incomplete rows.
sim_augvoronoi_median = (
    map_intersection_voronoi[['intersection_id', 'site_id', 'MAP_ID', 'w_guf_ant']]
    .merge(cdr_median, on='site_id', how='left')
    .dropna()
)
# Weighted median of every column per MAP_ID, using `w_guf_ant` as weights;
# the helper columns are removed afterwards.
sim_augvoronoi_median_adm = (
    sim_augvoronoi_median
    .groupby('MAP_ID')
    .agg(lambda col: rs.weighted_median(col, weights=sim_augvoronoi_median.loc[col.index, "w_guf_ant"]))
    .reset_index()
    .drop(columns=['intersection_id', 'site_id', 'w_guf_ant'])
)

Merge

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_augvoronoi_adm = pd.merge(sim_augvoronoi_mean_adm, sim_augvoronoi_median_adm, on='MAP_ID', how='left')

Double-checked. Due to bad home allocation.

Save it

In [None]:
# Write the GUF-augmented Voronoi result without the index column.
sim_augvoronoi_adm.to_csv(path_or_buf='./midsave/sim_augvoronoi_adm.csv', index=False)

# Augmented Voronoi (WPG)

Mean

In [None]:
# Join the mean-type features onto the intersections (with `w_wpg_ant` weights),
# discard the site key and drop incomplete rows.
sim_wpgvoronoi_mean = (
    map_intersection_voronoi[['site_id', 'MAP_ID', 'w_wpg_ant']]
    .merge(cdr_mean, on='site_id', how='left')
    .drop(columns=['site_id'])
    .dropna()
)

Filter out those communes without any settlement

In [None]:
# Total `w_wpg_ant` weight per MAP_ID.
zero_weight = (
    sim_wpgvoronoi_mean
    .groupby('MAP_ID')['w_wpg_ant']
    .sum()
    .reset_index()
)

In [None]:
# Keep only rows whose MAP_ID has a non-zero total `w_wpg_ant` weight.
sim_wpgvoronoi_mean = sim_wpgvoronoi_mean.loc[
    ~sim_wpgvoronoi_mean['MAP_ID'].isin(zero_weight.loc[zero_weight['w_wpg_ant'] == 0, 'MAP_ID'])
]

In [None]:
# Weighted average of every column per MAP_ID, using `w_wpg_ant` as weights;
# the weight column itself is removed afterwards.
sim_wpgvoronoi_mean_adm = (
    sim_wpgvoronoi_mean
    .dropna()
    .groupby('MAP_ID')
    .agg(lambda col: np.average(col, weights=sim_wpgvoronoi_mean.loc[col.index, "w_wpg_ant"]))
    .reset_index()
    .drop(columns=['w_wpg_ant'])
)

Median

In [None]:
# Join the median-type features onto the intersections and drop incomplete rows.
sim_wpgvoronoi_median = (
    map_intersection_voronoi[['intersection_id', 'site_id', 'MAP_ID', 'w_wpg_ant']]
    .merge(cdr_median, on='site_id', how='left')
    .dropna()
)
# Weighted median of every column per MAP_ID, using `w_wpg_ant` as weights;
# the helper columns are removed afterwards.
sim_wpgvoronoi_median_adm = (
    sim_wpgvoronoi_median
    .groupby('MAP_ID')
    .agg(lambda col: rs.weighted_median(col, weights=sim_wpgvoronoi_median.loc[col.index, "w_wpg_ant"]))
    .reset_index()
    .drop(columns=['intersection_id', 'site_id', 'w_wpg_ant'])
)

Merge

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_wpgvoronoi_adm = pd.merge(sim_wpgvoronoi_mean_adm, sim_wpgvoronoi_median_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the WPG-augmented Voronoi result without the index column.
sim_wpgvoronoi_adm.to_csv(path_or_buf='./midsave/sim_wpgvoronoi_adm.csv', index=False)

Remove to create memory space

In [None]:
# Rebind the name to None so the intersection table is no longer referenced from here.
map_intersection_voronoi = None

# GUF Overlap

In [None]:
# Load the pixel/site weight table and fix the dtypes of its id and weight columns.
map_pixel_knn = pd.read_csv(file_map_pixel_knn)
map_pixel_knn = map_pixel_knn.astype({
    'pixel_id': 'uint32',
    'site_id': 'uint16',
    'path_loss': 'float64',
    'w_best_site': 'float64',
    'w_knn_site': 'float64',
})

In [None]:
# Load the pixel-to-commune table (provides `pixel_id` and `MAP_ID`).
map_pixel_adm = pd.read_csv(filepath_or_buffer=file_map_pixel_adm)

In [None]:
# Number of distinct MAP_ID values in the pixel-to-commune table.
# Thietty / Tiety (257) in Kolda apparently has no settlements.
len(map_pixel_adm['MAP_ID'].unique())

Open question: should population per pixel be incorporated here to account for varying housing structures (e.g. varying numbers of inhabitants per m²)?

In [None]:
# Attach the commune id (MAP_ID) to every pixel/site weight row.
sim_overlap_sum = pd.merge(
    map_pixel_knn[['site_id', 'pixel_id', 'w_best_site', 'w_knn_site', 'w_uni_site']],
    map_pixel_adm[['pixel_id', 'MAP_ID']],
    on='pixel_id',
    how='left',
)

In [None]:
# Both inputs are merged into `sim_overlap_sum`; rebind the names to None.
map_pixel_knn = map_pixel_adm = None

### BSA

Mean

In [None]:
# Empty result frame for the per-commune `w_best_site`-weighted means computed by the following loop.
sim_overlap_mean_best_adm = pd.DataFrame()

In [None]:
# Weighted mean of the `cdr_mean` features per commune, weighted by `w_best_site`.
# Communes are processed one at a time; the per-commune results are collected in a
# list and concatenated once, because `DataFrame.append` was removed in pandas 2.0
# and copied the whole accumulator on every iteration.
commune_frames = []

for i in tqdm(sim_overlap_sum.MAP_ID.unique()):

    sim_overlap_mean = (
        sim_overlap_sum.loc[sim_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_best_site']]
        .merge(cdr_mean, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # np.average cannot normalise weights that sum to zero; skip such communes.
    if sim_overlap_mean.w_best_site.sum() == 0:
        continue

    df = (
        sim_overlap_mean
        .groupby('MAP_ID')
        .agg(lambda x: np.average(x, weights=sim_overlap_mean.loc[x.index, "w_best_site"]))
        .reset_index()
        .drop(columns=['w_best_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_overlap_mean_best_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

In [None]:
# Weight total of the commune frame left over from the loop's last iteration.
sim_overlap_mean['w_best_site'].sum()

Median

In [None]:
# Empty result frame for the per-commune `w_best_site`-weighted medians computed by the following loop.
sim_overlap_median_best_adm = pd.DataFrame()

In [None]:
# Weighted median of the `cdr_median` features per commune, weighted by `w_best_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_overlap_sum.MAP_ID.unique()):

    sim_overlap_median = (
        sim_overlap_sum.loc[sim_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_best_site']]
        .merge(cdr_median, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # Communes without any positive weight carry no information; skip them.
    if sim_overlap_median.w_best_site.sum() == 0:
        continue

    df = (
        sim_overlap_median
        .groupby('MAP_ID')
        .agg(lambda x: ws.numpy_weighted_median(x, weights=sim_overlap_median.loc[x.index, "w_best_site"]))
        .reset_index()
        .drop(columns=['w_best_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_overlap_median_best_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

Bring together

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_overlap_best_adm = pd.merge(sim_overlap_mean_best_adm, sim_overlap_median_best_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the GUF best-server result without the index column.
sim_overlap_best_adm.to_csv(path_or_buf='./midsave/sim_overlap_best_adm.csv', index=False)

The differences in counts arise because some antennas are neither the best-serving antenna for any settlement (the case for 26 antennas in the best-server approach) nor provide RSS values above -110 dBm for any settlement (the case for 3 antennas in the uniform and inverse approaches). However, home-located SIM cards are registered for each of these antennas; these SIM cards are consequently dropped, hence the mismatch. There are three possible reasons for this mismatch: 1) The settlement information extracted from satellite imagery is incomplete, i.e. some settlements are left out. 2) The home-location algorithm is inaccurate and assigns homes to antennas where no settlements are located. 3) The antenna specifications are wrong, i.e. coverage areas / RSS values are over- or underestimated at some locations.

### IDW

Mean

In [None]:
# Empty result frame for the per-commune `w_knn_site`-weighted means computed by the following loop.
sim_overlap_mean_knn_adm = pd.DataFrame()

In [None]:
# Weighted mean of the `cdr_mean` features per commune, weighted by `w_knn_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_overlap_sum.MAP_ID.unique()):

    sim_overlap_mean = (
        sim_overlap_sum.loc[sim_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_knn_site']]
        .merge(cdr_mean, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # np.average cannot normalise weights that sum to zero; skip such communes.
    if sim_overlap_mean.w_knn_site.sum() == 0:
        continue

    df = (
        sim_overlap_mean
        .groupby('MAP_ID')
        .agg(lambda x: np.average(x, weights=sim_overlap_mean.loc[x.index, "w_knn_site"]))
        .reset_index()
        .drop(columns=['w_knn_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_overlap_mean_knn_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

In [None]:
# Preview the first eleven rows of the commune frame left over from the loop.
sim_overlap_mean.head(n=11)

Median

In [None]:
# Empty result frame for the per-commune `w_knn_site`-weighted medians computed by the following loop.
sim_overlap_median_knn_adm = pd.DataFrame()

In [None]:
# Weighted median of the `cdr_median` features per commune, weighted by `w_knn_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_overlap_sum.MAP_ID.unique()):

    sim_overlap_median = (
        sim_overlap_sum.loc[sim_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_knn_site']]
        .merge(cdr_median, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # Communes without any positive weight carry no information; skip them.
    if sim_overlap_median.w_knn_site.sum() == 0:
        continue

    df = (
        sim_overlap_median
        .groupby('MAP_ID')
        .agg(lambda x: ws.numpy_weighted_median(x, weights=sim_overlap_median.loc[x.index, "w_knn_site"]))
        .reset_index()
        .drop(columns=['w_knn_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_overlap_median_knn_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

Bring together

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_overlap_knn_adm = pd.merge(sim_overlap_mean_knn_adm, sim_overlap_median_knn_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the GUF IDW result without the index column.
sim_overlap_knn_adm.to_csv(path_or_buf='./midsave/sim_overlap_knn_adm.csv', index=False)

# WPG Overlap

In [None]:
# Load the grid/site weight table and fix the dtypes of its id and weight columns.
map_grid_knn = pd.read_csv(file_map_grid_knn)
map_grid_knn = map_grid_knn.astype({
    'grid_id': 'uint32',
    'site_id': 'uint16',
    'path_loss': 'float64',
    'w_best_site': 'float64',
    'w_knn_site': 'float64',
})

In [None]:
# Load the grid-to-commune table.
map_grid_adm = pd.read_csv(filepath_or_buffer=file_map_grid_adm)

In [None]:
# Number of distinct MAP_ID values in the grid-to-commune table.
len(map_grid_adm['MAP_ID'].unique())

In [None]:
# Attach the commune id (MAP_ID) to every grid-cell/site weight row before the
# inputs are released. The WPG loops below read `sim_wpg_overlap_sum`, which was
# never built, so they failed with a NameError.
# NOTE(review): assumes map_grid_adm has a 'grid_id' column next to 'MAP_ID',
# mirroring the pixel tables used for `sim_overlap_sum` — confirm.
sim_wpg_overlap_sum = map_grid_knn[['site_id', 'grid_id', 'w_best_site', 'w_knn_site']].merge(
    map_grid_adm[['grid_id', 'MAP_ID']], on='grid_id', how='left')

# Both inputs are now merged; rebind the names to None.
map_grid_knn = None
map_grid_adm = None

### BSA

Mean

In [None]:
# Empty result frame for the per-commune `w_best_site`-weighted means computed by the following loop.
sim_wpg_overlap_mean_best_adm = pd.DataFrame()

In [None]:
# Weighted mean of the `cdr_mean` features per commune, weighted by `w_best_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_wpg_overlap_sum.MAP_ID.unique()):

    sim_wpg_overlap_mean = (
        sim_wpg_overlap_sum.loc[sim_wpg_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_best_site']]
        .merge(cdr_mean, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # np.average cannot normalise weights that sum to zero; skip such communes.
    if sim_wpg_overlap_mean.w_best_site.sum() == 0:
        continue

    df = (
        sim_wpg_overlap_mean
        .groupby('MAP_ID')
        .agg(lambda x: np.average(x, weights=sim_wpg_overlap_mean.loc[x.index, "w_best_site"]))
        .reset_index()
        .drop(columns=['w_best_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_wpg_overlap_mean_best_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

In [None]:
# Weight total of the commune frame left over from the loop's last iteration.
sim_wpg_overlap_mean['w_best_site'].sum()

Median

In [None]:
# Empty result frame for the per-commune `w_best_site`-weighted medians computed by the following loop.
sim_wpg_overlap_median_best_adm = pd.DataFrame()

In [None]:
# Weighted median of the `cdr_median` features per commune, weighted by `w_best_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_wpg_overlap_sum.MAP_ID.unique()):

    sim_wpg_overlap_median = (
        sim_wpg_overlap_sum.loc[sim_wpg_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_best_site']]
        .merge(cdr_median, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # Communes without any positive weight carry no information; skip them.
    if sim_wpg_overlap_median.w_best_site.sum() == 0:
        continue

    df = (
        sim_wpg_overlap_median
        .groupby('MAP_ID')
        .agg(lambda x: ws.numpy_weighted_median(x, weights=sim_wpg_overlap_median.loc[x.index, "w_best_site"]))
        .reset_index()
        .drop(columns=['w_best_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_wpg_overlap_median_best_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

Bring it together

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_wpg_overlap_best_adm = pd.merge(
    sim_wpg_overlap_mean_best_adm, sim_wpg_overlap_median_best_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the WPG best-server result without the index column.
sim_wpg_overlap_best_adm.to_csv(path_or_buf='./midsave/sim_wpg_overlap_best_adm.csv', index=False)

### IDW

Mean

In [None]:
# Empty result frame for the per-commune `w_knn_site`-weighted means computed by the following loop.
sim_wpg_overlap_mean_knn_adm = pd.DataFrame()

In [None]:
# Weighted mean of the `cdr_mean` features per commune, weighted by `w_knn_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_wpg_overlap_sum.MAP_ID.unique()):

    sim_wpg_overlap_mean = (
        sim_wpg_overlap_sum.loc[sim_wpg_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_knn_site']]
        .merge(cdr_mean, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # np.average cannot normalise weights that sum to zero; skip such communes.
    if sim_wpg_overlap_mean.w_knn_site.sum() == 0:
        continue

    df = (
        sim_wpg_overlap_mean
        .groupby('MAP_ID')
        .agg(lambda x: np.average(x, weights=sim_wpg_overlap_mean.loc[x.index, "w_knn_site"]))
        .reset_index()
        .drop(columns=['w_knn_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_wpg_overlap_mean_knn_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

Median

In [None]:
# Empty result frame for the per-commune `w_knn_site`-weighted medians computed by the following loop.
sim_wpg_overlap_median_knn_adm = pd.DataFrame()

In [None]:
# Weighted median of the `cdr_median` features per commune, weighted by `w_knn_site`.
# Per-commune results are collected in a list and concatenated once, because
# `DataFrame.append` was removed in pandas 2.0 and copied the whole accumulator
# on every iteration.
commune_frames = []

for i in tqdm(sim_wpg_overlap_sum.MAP_ID.unique()):

    sim_wpg_overlap_median = (
        sim_wpg_overlap_sum.loc[sim_wpg_overlap_sum.MAP_ID == i, ['site_id', 'MAP_ID', 'w_knn_site']]
        .merge(cdr_median, on='site_id', how='left')
        .drop(columns=['site_id'])
        .dropna()
    )

    # Communes without any positive weight carry no information; skip them.
    if sim_wpg_overlap_median.w_knn_site.sum() == 0:
        continue

    df = (
        sim_wpg_overlap_median
        .groupby('MAP_ID')
        .agg(lambda x: ws.numpy_weighted_median(x, weights=sim_wpg_overlap_median.loc[x.index, "w_knn_site"]))
        .reset_index()
        .drop(columns=['w_knn_site'])
    )
    commune_frames.append(df)

# Same row order and index labels as repeated appends; empty frame if nothing qualified.
sim_wpg_overlap_median_knn_adm = pd.concat(commune_frames) if commune_frames else pd.DataFrame()

Bring together

In [None]:
# Combine weighted means and weighted medians per MAP_ID.
sim_wpg_overlap_knn_adm = pd.merge(
    sim_wpg_overlap_mean_knn_adm, sim_wpg_overlap_median_knn_adm, on='MAP_ID', how='left')

Save it

In [None]:
# Write the WPG IDW result without the index column.
sim_wpg_overlap_knn_adm.to_csv(path_or_buf='./midsave/sim_wpg_overlap_knn_adm.csv', index=False)