In [1]:
import sys, os, importlib, json, multiprocessing
import rasterio, pycountry
import reverse_geocode

import geopandas as gpd
import pandas as pd

from urllib.request import urlopen
from shapely.geometry import Point
from shapely.ops import nearest_points
from shapely import wkt

# Import GOST libraries; sys.path.append will be unnecessary if libraries are already installed
sys.path.append("../../../../gostrocks/src")
sys.path.append("../../../../GOST_Urban")

import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint
import src.UrbanRaster as urban
%matplotlib inline  

  shapely_geos_version, geos_capi_version_string


In [2]:
# Folder that receives the per-country "<ISO3>_ICT_distance.csv" outputs
out_folder =  "/home/wb411133/temp/ICT_distance"


In [3]:
# Read the masked WBES survey coordinates
wbes_coords = "/home/wb411133/data/Projects/ICT_Distance/Data/Masked ES GPS April 15-2021_year.dta"
inD = pd.read_stata(wbes_coords)
print(inD.shape)
# Drop records with broken coordinates: keep only rows strictly inside the valid lat/lon ranges
valid_lat = (inD['lat_mask'] > -90) & (inD['lat_mask'] < 90)
valid_lon = (inD['lon_mask'] > -180) & (inD['lon_mask'] < 180)
inD = inD.loc[valid_lat & valid_lon]
print(inD.shape)
inD.head()

(121121, 9)
(121116, 9)


Unnamed: 0,idstd,year,fyear,d2_l1_year_perf_indicators,lat_mask,lon_mask,survey,WBCode,type
0,468901.0,2010.0,2009,2009,14.803293,49.687744,Yemen2010,YEM,Enterprise Survey
1,468902.0,2010.0,2009,2009,14.55662,49.129032,Yemen2010,YEM,Enterprise Survey
2,468903.0,2010.0,2009,2009,14.551517,49.141815,Yemen2010,YEM,Enterprise Survey
3,468904.0,2010.0,2009,2009,14.515405,49.125977,Yemen2010,YEM,Enterprise Survey
4,468905.0,2010.0,2009,2009,14.519185,49.119148,Yemen2010,YEM,Enterprise Survey


In [None]:
# Build the point geometries in one vectorised call; iterating the ~120k survey rows
# with iterrows() creates a Series per row and is needlessly slow.
geoms = gpd.points_from_xy(inD['lon_mask'], inD['lat_mask'])
inD = gpd.GeoDataFrame(inD, geometry=geoms, crs = 'epsg:4326')
# The reference year is compared against integer cable RFS years later on
inD['d2_l1_year_perf_indicators'] = inD['d2_l1_year_perf_indicators'].astype(int)

In [None]:
# Country boundaries, read and reprojected to WGS84 in a single step
global_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
inG = gpd.read_file(global_bounds).to_crs('epsg:4326')

# Open and load infrastructure data

In [None]:
# Vector inputs: cell towers, IXPs, colocation centres and cable landing points
openCellID = "/home/public/Data/GLOBAL/INFRA/OPENCELLID/cell_towers_2020-04-15-T000000.csv"
ixp_file = "/home/public/Data/GLOBAL/INFRA/IXPS/ixps_global_geocoded.csv"
colocation_file = "/home/public/Data/GLOBAL/INFRA/DATA_CENTERS/Data_Centers.geojson"
submarine_cable_file = "/home/public/Data/GLOBAL/INFRA/SUBMARINE_CABLES/landing-point-geo.json"

# Raster inputs: coverage rasters, listed in 2G, 3G, 4G order
cell_coverage_folder = "/home/public/Data/GLOBAL/INFRA/GSMA/2019/MCE/Data_MCE/Global"
cell_files = ["MCE_Global2G_2020.tif", "MCE_Global3G_2020.tif", "MCE_Global4G_2020.tif"]

In [None]:
# Load the OpenCellID towers and turn the lon/lat columns into point geometries
inCell = pd.read_csv(openCellID)
inD_geom = gpd.points_from_xy(inCell['lon'], inCell['lat'])
# Use the plain authority string (as for the survey data) instead of the deprecated {'init': ...} dict
inCell = gpd.GeoDataFrame(inCell, geometry = inD_geom, crs = 'epsg:4326')

In [None]:
# Spatial index of the cell towers; calculate_country queries it with each country's bounding box
cell_sindex = inCell.sindex

In [None]:
# Colocation centres (data centres) read from the GeoJSON named in colocation_file
inCol = gpd.read_file(colocation_file)

In [None]:
# Submarine cables: the final dataset needs landing points that carry an RFS date.
#     The landing points and the cable list are downloaded here; the per-cable JSON
#     (template below) supplies the RFS attribute for each cable's landing points.
landing_points_url = 'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/public/api/v2/landing-point/landing-point-geo.json'
cables_url = 'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/public/api/v2/cable/cable-geo.json'
landing_points = gpd.read_file(landing_points_url)
cables = gpd.read_file(cables_url)
cables_json_base = 'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/public/api/v2/cable/{cable_id}.json'

In [None]:
# Collect the landing points of every cable, tagging each one with the cable's RFS year
all_landing_points = []
for cur_id in cables['slug']:
    # The context manager closes the HTTP response as soon as the payload is read
    with urlopen(cables_json_base.format(cable_id = cur_id)) as response:
        d = json.loads(response.read().decode())
    try:
        # The RFS year is taken from the first four characters of the 'rfs' attribute
        rfs_year = int(d['rfs'][:4])
    except (KeyError, TypeError, ValueError):
        # Missing, null or non-numeric RFS: the cable cannot be dated, so its landing
        # points are skipped (only these expected failures are ignored)
        continue
    for item in d['landing_points']:
        item['RFS'] = rfs_year
        all_landing_points.append(item)

In [None]:
def _latlon_to_point(latlon):
    ''' Convert a "lat,lon" string into a shapely Point (x=lon, y=lat) '''
    parts = latlon.split(",")
    return Point(float(parts[1]), float(parts[0]))

# One row per (cable, landing point) pair, with the cable's RFS year attached
landing_point_rfs = pd.DataFrame(all_landing_points)
l_geom = landing_point_rfs['latlon'].apply(_latlon_to_point)
lp_data = gpd.GeoDataFrame(landing_point_rfs, geometry=l_geom, crs="epsg:4326")

In [None]:
def try_country(x):
    ''' Guess the ISO3 code of a landing point from its name

    The text after the last comma of the name is passed to pycountry's fuzzy search
    and the alpha_3 code of the first match is returned.

    Args:
        x: name of the landing point; non-string values are treated as "no match"
    Returns:
        string ISO3 code, or '' when no country could be matched
    '''
    try:
        country_name = x.split(",")[-1].strip()
        return(pycountry.countries.search_fuzzy(country_name)[0].alpha_3)
    except (LookupError, AttributeError):
        # LookupError: pycountry found no match; AttributeError: x is not a string.
        # Anything else is a real problem and is no longer silently swallowed.
        return('')

# Attach the guessed country code to every landing point
lp_data['ISO3'] = lp_data['name'].apply(try_country)

In [None]:
# Alias for the landing points (with RFS and ISO3 columns) used by the distance calculations
inCables = lp_data

In [None]:
# Load the geocoded IXPs and turn the Lon/Lat columns into point geometries
inIXP = pd.read_csv(ixp_file, index_col=0)
inD_geom = gpd.points_from_xy(inIXP['Lon'], inIXP['Lat'])
# Use the plain authority string (as for the survey data) instead of the deprecated {'init': ...} dict
inIXP = gpd.GeoDataFrame(inIXP, geometry = inD_geom, crs = 'epsg:4326')

In [None]:
# Open the first three coverage rasters (2G, 3G, 4G - the order of cell_files)
gsma2g_R, gsma3g_R, gsma4g_R = [
    rasterio.open(os.path.join(cell_coverage_folder, cell_file))
    for cell_file in cell_files[:3]
]

# Calculate distances

In [None]:
def get_nearest_date(x, selCables):
    ''' Get the RFS year of the landing point nearest to a survey location

    Only landing points whose RFS year is not later than the survey's reference
    year (x['d2_l1_year_perf_indicators']) are considered.

    Args:
        x: row of the survey data; provides 'geometry' and 'd2_l1_year_perf_indicators'
        selCables: geopandas data frame of landing points with an 'RFS' column,
            in the same coordinate system as x['geometry']
    Returns:
        earliest RFS year among the nearest landing point(s), or -1 when no landing
        point was in service by the reference year
    '''
    selCables = selCables.loc[selCables['RFS'] <= x['d2_l1_year_perf_indicators']]
    if selCables.shape[0] == 0:
        # Nothing to measure against: previously this crashed on the empty selection;
        # -1 is the "not available" marker used elsewhere in this notebook
        return(-1)
    # Pick the rows at minimum distance directly instead of matching geometries
    # against the point returned by nearest_points on the unioned landing points
    dists = selCables.distance(x['geometry'])
    nearest = selCables.loc[dists == dists.min()]
    return(nearest['RFS'].min())

def get_nearest(x, selCables):
    ''' Get the distance from a survey location to the nearest landing point

    Only landing points whose RFS year is not later than the survey's reference
    year (x['d2_l1_year_perf_indicators']) are considered.

    Args:
        x: row of the survey data; provides 'geometry' and 'd2_l1_year_perf_indicators'
        selCables: geopandas data frame of landing points with an 'RFS' column,
            in the same coordinate system as x['geometry']
    Returns:
        distance in the units of the coordinate system, or -1 when no landing point
        was in service by the reference year
    '''
    selCables = selCables.loc[selCables['RFS'] <= x['d2_l1_year_perf_indicators']]
    if selCables.shape[0] == 0:
        # A distance to an empty geometry is meaningless; return the notebook's
        # "not available" marker instead
        return(-1)
    return(x['geometry'].distance(selCables.unary_union))

def calculate_country(curD, curB, curN, out_file, selCol, selIXP, inCables):
    ''' Calculate ICT distances for the survey locations of one country

    All distances are measured in the projected CRS named by the module-level
    `epsg`. The function also reads the module-level `inCell`, `cell_sindex`,
    `cell_coverage_folder` and `cell_files`. A value of -1 marks a distance
    for which no candidate feature exists.

    Args:
        curD: geopandas data frame of WBES survey locations
        curB: geopandas data frame of country bounds; its 'ISO3' column identifies the country
        curN: geopandas data frame of neighbouring countries boundaries
        out_file: string of the path for the output CSV file; always (re)written
        selCol: geopandas data frame of colocation centers
        selIXP: geopandas data frame of IXPs
        inCables: geopandas data frame cable landing spots, with 'ISO3' and 'RFS' columns

    Returns:
        geopandas data frame of the survey locations with the calculated columns,
        or None (and no file written) when curB is empty
    '''
    if curB.shape[0] == 0:
        return(None)

    distD = curD.to_crs(epsg)
    total_bound = curB.unary_union
    # Take the country code from the arguments. The global loop variable `cur_country`
    # holds the LAST country of the loop by the time the worker processes run.
    cur_country = curB['ISO3'].iloc[0]

    if not 'col_dist' in distD.columns:
        if selCol.shape[0] > 0:
            selCol = selCol.to_crs(epsg)
            distD['col_dist'] = distD.distance(selCol.unary_union)
        else:
            distD['col_dist'] = -1

    if not "ixp_dist" in distD.columns:
        if selIXP.shape[0] > 0:
            selIXP = selIXP.to_crs(epsg)
            distD['ixp_dist'] = distD.distance(selIXP.unary_union)
        else:
            distD['ixp_dist'] = -1

    if not 'firstCable' in distD.columns:
        selCables = inCables.loc[inCables['ISO3'] == cur_country]
        if selCables.shape[0] > 0:
            selCables = selCables.to_crs(epsg)
            # Calculate distance and date of first cable landing point
            first_date = selCables['RFS'].min()
            first_points = selCables.loc[selCables['RFS'] == first_date]
            distD['firstCable'] = first_date
            distD['firstCable_dist'] = distD.distance(first_points.unary_union)
            # Calculate distance and date of closest cable landing point
            distD['closestCable'] = distD.apply(lambda x: get_nearest_date(x, selCables), axis=1)
            distD['closestCable_dist'] = distD.apply(lambda x: get_nearest(x, selCables), axis=1)
        else:
            distD['firstCable'] = ''
            distD['firstCable_dist'] = -1
            distD['closestCable'] = ''
            distD['closestCable_dist'] = -1

    # Calculate distance to each neighbouring country and to its cable landing points.
    # Neighbour geometries must be in the same projected CRS as distD, otherwise the
    # distances mix degrees with metres.
    curN = curN.to_crs(epsg)
    for cnt, (idx, row) in enumerate(curN.iterrows(), start=1):
        distD['ngh%s' % cnt] = row['ISO3']
        distD['ngh%s_dist' % cnt] = distD.distance(row['geometry'])
        # Calculate distance to submarine cables
        nghCables = inCables.loc[inCables['ISO3'] == row['ISO3']]
        if nghCables.shape[0] > 0:
            nghCables = nghCables.to_crs(epsg)
            distD['ngh%s_cbl_dist' % cnt] = distD.distance(nghCables.unary_union)
            distD['ngh%s_cbl' % cnt] = distD.apply(lambda x: get_nearest_date(x, nghCables), axis=1)
        else:
            distD['ngh%s_cbl_dist' % cnt] = -1
            distD['ngh%s_cbl' % cnt] = -1

    if not 'cell_dist' in distD.columns:
        # The spatial index returns row positions, hence iloc; the exact intersects
        # test then removes towers that are only inside the bounding box
        potential_matches = inCell.iloc[list(cell_sindex.intersection(total_bound.bounds))]
        selCell = potential_matches.loc[potential_matches.intersects(total_bound)]
        if selCell.shape[0] > 0:
            selCell = selCell.to_crs(epsg)
            distD['cell_dist'] = distD.distance(selCell.unary_union)
        else:
            distD['cell_dist'] = -1

    if not "gsma2g" in distD.columns:
        # Open the rasters only while sampling so the file handles are always released
        with rasterio.open(os.path.join(cell_coverage_folder, cell_files[0])) as gsma2g_R, \
             rasterio.open(os.path.join(cell_coverage_folder, cell_files[1])) as gsma3g_R, \
             rasterio.open(os.path.join(cell_coverage_folder, cell_files[2])) as gsma4g_R:
            # Sample coordinates are expressed in the CRS of the 2G raster
            coordsD = distD.to_crs(gsma2g_R.crs)
            coords = [[x.x,x.y] for x in coordsD['geometry']]
            distD['gsma2g'] = [x[0] for x in gsma2g_R.sample(coords)]
            distD['gsma3g'] = [x[0] for x in gsma3g_R.sample(coords)]
            distD['gsma4g'] = [x[0] for x in gsma4g_R.sample(coords)]

    pd.DataFrame(distD).to_csv(out_file)
    return(distD)

In [None]:
# Countries that are skipped when building the work list
broken_countries = ['CHL','RUS','CHN','IDN']
n_processes = 30
# Projected CRS in which calculate_country measures distances
epsg = 'epsg:6933'
os.makedirs(out_folder, exist_ok=True)

# One argument list per country, consumed by calculate_country
args = []
for cur_country in inD['WBCode'].unique():
    out_file = os.path.join(out_folder, "%s_ICT_distance.csv" % cur_country)
    tPrint(f"***** Processing {cur_country}")
    if not cur_country in broken_countries: # not os.path.exists(out_file) and 
        curD = inD.loc[inD['WBCode'] == cur_country]
        curB = inG.loc[inG['ISO3'] == cur_country]
        # Select facilities inside the country boundary. The union of the survey
        # points was used before, which only matches facilities that sit exactly
        # on a survey location.
        total_bound = curB.unary_union
        # Neighbours are the countries touching a slightly buffered boundary
        curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]
        curN = curN.loc[curN['ISO3'] != cur_country]
        selCol = inCol.loc[inCol.intersects(total_bound)]
        selIXP = inIXP.loc[inIXP.intersects(total_bound)]
        args.append([curD, curB, curN, out_file, selCol, selIXP, inCables])


In [None]:
# Debug: output path left over from the last iteration of the country loop
out_file

In [None]:
# Debug: survey rows of the last country that was not skipped by the loop
curD

In [None]:
# Debug: country code left over from the last iteration of the country loop
cur_country

In [None]:
# Run calculate_country once per argument list, spread over a pool of worker processes.
# NOTE(review): calculate_country reads module-level names (epsg, inCell, cell_sindex,
# cell_coverage_folder, cell_files) instead of receiving them as arguments, so the workers
# must be able to see those globals - presumably inherited through fork; confirm.
num_processes = 40
with multiprocessing.Pool(num_processes) as pool:
    res = pool.starmap(calculate_country, args)

# DEBUGGING

In [22]:
# Enable the KML driver (read/write) on the fiona module reached through geopandas, then read the file.
# NOTE: this rebinds inD, which earlier in the notebook holds the survey locations.
in_file = "/home/wb411133/temp/SOM/SOM_roads.kml"
gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
inD = gpd.read_file(in_file, driver="KML")

In [23]:
# Display what was read from the KML file
inD

Unnamed: 0,Name,Description,geometry
0,Mogadishu - Baidoa,,"LINESTRING Z (45.28344 2.04797 0.00000, 45.251..."
1,Baidoa - Luuq,,"LINESTRING Z (43.64767 3.11618 0.00000, 43.631..."
2,Luuq - Dolo,,"LINESTRING Z (42.54447 3.79148 0.00000, 42.521..."
3,Kismayo - El Wak,,"LINESTRING Z (42.55690 -0.37413 0.00000, 42.53..."
4,Liboi Spur,,"LINESTRING Z (40.99050 0.41098 0.00000, 41.067..."
5,Mogadishu - Ferfer,,"LINESTRING Z (45.32951 2.05804 0.00000, 45.398..."
6,Galgogob - Galkayo,,"LINESTRING Z (47.02541 7.02273 0.00000, 47.089..."
7,Galkayo - Garowe,,"LINESTRING Z (47.41693 6.78085 0.00000, 47.430..."
8,Garowe - Bossasso,,"LINESTRING Z (48.48726 8.40750 0.00000, 48.493..."
9,Hargeisa - Ethiopian Border,,"LINESTRING Z (44.06134 9.56109 0.00000, 44.063..."


In [35]:
import fiona

# Read every layer of the KML file, tag each row with its layer name and stack the layers.
all_layers = []
for layer in fiona.listlayers(in_file):
    curD = gpd.read_file(in_file, driver="KML", layer=layer)
    curD['LAYER'] = layer
    print(curD.shape)
    all_layers.append(curD)

# Concatenate once at the end. DataFrame.append no longer exists in pandas 2.x, and
# the previous bare except hid that error, leaving only the last layer in `final`.
final = pd.concat(all_layers)

(15, 4)
(2, 4)
(2, 4)
(4, 4)
(3, 4)
(1, 4)
(3, 4)
(4, 4)
(2, 4)
(11, 4)


In [36]:
# Write the combined layers next to the source file, with a .geojson extension
out_geojson = in_file.replace(".kml", ".geojson")
final.to_file(out_geojson, driver="GeoJSON")