In [1]:
import sys, os, importlib, json, multiprocessing
import rasterio, pycountry
import reverse_geocode

import geopandas as gpd
import pandas as pd

from urllib.request import urlopen
from shapely.geometry import Point
from shapely.ops import nearest_points
from shapely import wkt

#Import local functions
import ict_helper as ict

# Import GOST libraries; sys.path.append will be unnecessary if libraries are already installed
sys.path.append("../../../../gostrocks/src")
sys.path.append("../../../../GOST_Urban/src")

import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint
import GOST_Urban.UrbanRaster as urban
%matplotlib inline  

In [2]:
# Folder holding the per-country `<WBCode>_ICT_distance.csv` files; later cells
# check this folder for existing results and build their output paths from it.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
out_folder =  "/home/wb411133/temp/ICT_distance"


In [3]:
# Masked enterprise-survey coordinates (Stata file)
wbes_coords = "/home/wb411133/data/Projects/ICT_Distance/Data/Masked ES GPS April 15-2021_year.dta"
inD = pd.read_stata(wbes_coords)
print(inD.shape)

# Keep only the records whose latitude and longitude lie strictly inside the
# valid ranges; everything else is treated as a broken coordinate.
lat_ok = (inD['lat_mask'] > -90) & (inD['lat_mask'] < 90)
lon_ok = (inD['lon_mask'] > -180) & (inD['lon_mask'] < 180)
inD = inD.loc[lat_ok & lon_ok]
print(inD.shape)
inD.head()

(121121, 9)
(121116, 9)


Unnamed: 0,idstd,year,fyear,d2_l1_year_perf_indicators,lat_mask,lon_mask,survey,WBCode,type
0,468901.0,2010.0,2009,2009,14.803293,49.687744,Yemen2010,YEM,Enterprise Survey
1,468902.0,2010.0,2009,2009,14.55662,49.129032,Yemen2010,YEM,Enterprise Survey
2,468903.0,2010.0,2009,2009,14.551517,49.141815,Yemen2010,YEM,Enterprise Survey
3,468904.0,2010.0,2009,2009,14.515405,49.125977,Yemen2010,YEM,Enterprise Survey
4,468905.0,2010.0,2009,2009,14.519185,49.119148,Yemen2010,YEM,Enterprise Survey


In [4]:
# Print every country code that does not yet have a result file in out_folder.
# The path is built from out_folder (rather than a second hard-coded copy of the
# directory) so this check always looks where the results are actually expected.
countries = inD['WBCode'].unique()
for c in countries:
    if not os.path.exists(os.path.join(out_folder, f'{c}_ICT_distance.csv')):
        print(c)

CHL
RUS
CHN
BIH
XKX
HRV
POL
IDN


In [5]:
# Build the point geometries in a single vectorised call instead of a Python-level
# iterrows() loop over every survey record. The result is a plain geometry array,
# so it is attached positionally (no index alignment against the filtered frame).
geoms = gpd.points_from_xy(inD['lon_mask'], inD['lat_mask'])
inD = gpd.GeoDataFrame(inD, geometry=geoms, crs='epsg:4326')
# Store the performance-indicator year as an integer
inD['d2_l1_year_perf_indicators'] = inD['d2_l1_year_perf_indicators'].astype(int)

In [6]:
# Country polygons, reprojected to EPSG:4326 as soon as they are read
global_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
inG = gpd.read_file(global_bounds).to_crs('epsg:4326')
# Relabel 'KSV' as 'XKX' in the ISO3 column
inG['ISO3'] = inG['ISO3'].replace('KSV', 'XKX')

# Open and load infrastructure data

In [7]:
# Input locations for the infrastructure layers loaded in the following cells.
# Cell tower table (CSV, read with pandas)
openCellID = "/home/public/Data/GLOBAL/INFRA/OPENCELLID/cell_towers_2020-04-15-T000000.csv"    
# Internet exchange points (CSV with 'Lon'/'Lat' columns)
ixp_file = '/home/public/Data/GLOBAL/INFRA/IXPS/ixps_global_geocoded.csv'
# Data center / colocation layer (GeoJSON)
colocation_file = '/home/public/Data/GLOBAL/INFRA/DATA_CENTERS/Data_Centers.geojson'
# Folder and file names of the 2G/3G/4G coverage rasters opened later with rasterio
cell_coverage_folder = '/home/public/Data/GLOBAL/INFRA/GSMA/2019/MCE/Data_MCE/Global'
cell_files = ['MCE_Global2G_2020.tif', 'MCE_Global3G_2020.tif', 'MCE_Global4G_2020.tif']
# NOTE(review): not referenced in the visible cells — the landing points are
# downloaded from GitHub instead; confirm whether this local copy is still needed.
submarine_cable_file = '/home/public/Data/GLOBAL/INFRA/SUBMARINE_CABLES/landing-point-geo.json'

In [8]:
# Load the cell tower table and turn its lon/lat columns into point geometries.
inCell = pd.read_csv(openCellID)
# Vectorised construction avoids creating one Point at a time in a Python loop
inD_geom = gpd.points_from_xy(inCell['lon'], inCell['lat'])
# Pass the CRS as an authority string: the {'init': ...} dict form is deprecated
# and triggers a warning every time it is used.
inCell = gpd.GeoDataFrame(inCell, geometry=inD_geom, crs='epsg:4326')

  return _prepare_from_string(" ".join(pjargs))


In [9]:
# Access the spatial index of the cell tower layer.
# NOTE(review): cell_sindex is not referenced in the visible cells; presumably this
# is done to build the index once up front — confirm.
cell_sindex = inCell.sindex

In [10]:
# Read the colocation / data center layer from the GeoJSON file configured above
inCol = gpd.read_file(colocation_file)

In [11]:
# Submarine cables: read the landing-point and cable layers from the public
# submarinecablemap.com repository. The per-cable JSON documents (URL template
# below) supply the RFS attribute, which is joined onto the landing points later,
# because the final dataset needs landing points with RFS dates.
landing_points = gpd.read_file(
    'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/web/public/api/v3/landing-point/landing-point-geo.json'
)
cables = gpd.read_file(
    'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/web/public/api/v3/cable/cable-geo.json'
)
# Template for one cable's detail document; filled in with .format(cable_id=...)
cables_json_base = 'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/web/public/api/v3/cable/{cable_id}.json'

In [12]:
# Download every cable's detail document and collect its landing points, each
# tagged with the cable's RFS year.
all_landing_points = []
for cur_id in cables['id']:
    # The context manager closes the HTTP response even if parsing fails
    with urlopen(cables_json_base.format(cable_id = cur_id)) as response:
        d = json.load(response)
    l_points = d['landing_points']
    try:
        # The first four characters of the 'rfs' field are read as the year
        rfs_year = int(d['rfs'][:4])
    except (KeyError, TypeError, ValueError):
        # Missing, null or non-numeric RFS: this cable's landing points are
        # skipped. Only these expected failures are caught, so interrupts and
        # genuine bugs are no longer silently swallowed.
        continue
    for item in l_points:
        item['RFS'] = rfs_year
        all_landing_points.append(item)

In [13]:
# Table of landing points with their RFS year, joined by id to the landing-point
# layer and wrapped as a GeoDataFrame in EPSG:4326.
landing_point_rfs = pd.DataFrame(all_landing_points)
lp_data = gpd.GeoDataFrame(
    landing_point_rfs.merge(landing_points, on="id"),
    geometry='geometry',
    crs='epsg:4326',
)

def try_country(x):
    """Return the alpha-3 code of the best fuzzy match for a country name.

    Parameters
    ----------
    x : str
        Country name to look up with ``pycountry.countries.search_fuzzy``.

    Returns
    -------
    str
        ``alpha_3`` of the first search result, or ``''`` when ``x`` is not a
        non-blank string or the lookup fails.
    """
    # A missing or blank name cannot be matched meaningfully; answer directly
    # instead of sending it through the fuzzy search.
    if not isinstance(x, str) or not x.strip():
        return ''
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except Exception:
        # Best-effort lookup: any library failure means "no match". Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        return ''

# Attach a country code to every landing point ('' when no match is found)
lp_data['ISO3'] = lp_data['country'].apply(try_country)
# Drop the name / is_tbd copies that came from the per-cable records and keep the
# ones from the landing-point layer (the *_y merge columns). Columns are renamed
# and selected by name, so a change in column order can no longer mislabel them.
lp_data = lp_data.drop(columns=['name_x', 'is_tbd_x']).rename(
    columns={'name_y': 'name', 'is_tbd_y': 'is_tbd'}
)
lp_data = lp_data[['id', 'country', 'RFS', 'name', 'is_tbd', 'geometry', 'ISO3']]
inCables = lp_data

In [14]:
# Load the internet exchange points and turn Lon/Lat into point geometries.
inIXP = pd.read_csv(ixp_file, index_col=0)
# Vectorised construction; the geometry array is attached positionally
inD_geom = gpd.points_from_xy(inIXP['Lon'], inIXP['Lat'])
# Pass the CRS as an authority string: the {'init': ...} dict form is deprecated
inIXP = gpd.GeoDataFrame(inIXP, geometry=inD_geom, crs='epsg:4326')

In [15]:
# Open the three coverage rasters (first, second and third entry of cell_files)
gsma2g_R, gsma3g_R, gsma4g_R = (
    rasterio.open(os.path.join(cell_coverage_folder, raster_name))
    for raster_name in cell_files[:3]
)

# Calculate distances

In [26]:
# Build one argument list per country for ict.calculate_country.
# Countries are skipped when their output file already exists or when they are
# listed in broken_countries.
broken_countries = ['CHL','RUS','CHN','IDN']
n_processes = 30
epsg = 'epsg:6933'
# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(out_folder, exist_ok=True)
args = []
for cur_country in inD['WBCode'].unique():
    out_file = os.path.join(out_folder, "%s_ICT_distance.csv" % cur_country)
    tPrint(f"***** Processing {cur_country}")
    if os.path.exists(out_file) or cur_country in broken_countries:
        continue
    curD = inD.loc[inD['WBCode'] == cur_country]
    if curD.shape[0] == 0:
        continue
    # Select the country's boundary once and derive the dissolved outline from it
    curB = inG.loc[inG['ISO3'] == cur_country]
    total_bound = curB.unary_union
    # Neighbours: every polygon touching a slightly buffered boundary (0.01 in
    # the layer's degree units), minus the country itself
    curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]
    curN = curN.loc[curN['ISO3'] != cur_country]
    # Vectorised intersection tests instead of a Python lambda per geometry
    selCol = inCol.loc[inCol['geometry'].intersects(total_bound)]
    selIXP = inIXP.loc[inIXP['geometry'].intersects(total_bound)]
    args.append([cur_country, curD, curB, curN, out_file, selCol, selIXP, inCables, inCell, epsg])


13:27:26	***** Processing YEM
13:27:26	***** Processing BOL
13:27:26	***** Processing ECU
13:27:26	***** Processing PER
13:27:26	***** Processing URY
13:27:26	***** Processing ARG
13:27:26	***** Processing CHL
13:27:26	***** Processing COL
13:27:26	***** Processing PRY
13:27:26	***** Processing VEN
13:27:26	***** Processing PAN
13:27:26	***** Processing NIC
13:27:26	***** Processing HND
13:27:26	***** Processing SLV
13:27:26	***** Processing GTM
13:27:26	***** Processing CRI
13:27:26	***** Processing MEX
13:27:26	***** Processing CAF
13:27:26	***** Processing IRQ
13:27:26	***** Processing ZWE
13:27:26	***** Processing RUS
13:27:26	***** Processing RWA
13:27:26	***** Processing ETH
13:27:26	***** Processing CHN
13:27:26	***** Processing BLR
13:27:26	***** Processing NPL
13:27:26	***** Processing PSE
13:27:26	***** Processing GEO
13:27:26	***** Processing AZE
13:27:26	***** Processing KGZ
13:27:26	***** Processing KAZ
13:27:26	***** Processing BGD
13:27:26	***** Processing ARM
13:27:26	*


  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


13:27:26	***** Processing HRV
13:27:26	***** Processing MKD
13:27:26	***** Processing SVN
13:27:26	***** Processing MNE
13:27:26	***** Processing DJI
13:27:26	***** Processing LBN
13:27:26	***** Processing ISR
13:27:26	***** Processing JOR
13:27:26	***** Processing ROU
13:27:26	***** Processing LVA
13:27:26	***** Processing MMR
13:27:26	***** Processing COD
13:27:26	***** Processing TJK
13:27:26	***** Processing EST
13:27:26	***** Processing HUN
13:27:26	***** Processing LTU
13:27:26	***** Processing POL
13:27:26	***** Processing CZE
13:27:26	***** Processing SVK
13:27:26	***** Processing TUR
13:27:26	***** Processing SWE
13:27:26	***** Processing GHA
13:27:26	***** Processing MDG
13:27:26	***** Processing IND
13:27:26	***** Processing TUN
13:27:26	***** Processing EGY
13:27:26	***** Processing SEN
13:27:26	***** Processing MRT
13:27:26	***** Processing BDI
13:27:26	***** Processing SSD
13:27:26	***** Processing NAM
13:27:26	***** Processing SDN
13:27:26	***** Processing PAK
13:27:26	*

In [27]:
# Report how many countries were queued out of all countries present in the survey data
print(f"Processing {len(args)} of {len(inD['WBCode'].unique())}")

Processing 1 of 120


In [28]:
# List the country code (first element of each argument list) of every queued job
for arg in args:
    print(arg[0])

XKX


In [29]:
# Reload the helper module so the workers run its latest code, then process the
# queued countries in parallel (at most 20 worker processes).
importlib.reload(ict)
num_processes = min(len(args), 20)
if num_processes > 0:
    with multiprocessing.Pool(num_processes) as pool:
        res = pool.starmap(ict.calculate_country, args)
else:
    # Nothing queued: multiprocessing.Pool(0) would raise ValueError
    res = []

  projstring = _prepare_from_string(projparams)


13:38:33	Starting XKX
ALB


In [None]:
# Serial alternative to the pool: run every queued country one after another,
# skipping any country code listed in `broken`.
broken = []
for arg in args:
    if arg[0] not in broken:
        # Pass the current job's arguments; the previous code passed args[0]
        # and therefore recomputed the first country on every iteration.
        xx = ict.calculate_country(*arg, debug=False)

# DEBUGGING

In [None]:
# Reload the local helper module so edits to it take effect without a kernel restart
importlib.reload(ict)

# Run the first queued argument list with debug=True and keep the result for inspection
# NOTE(review): raises IndexError when args is empty
xx = ict.calculate_country(*args[0], debug=True)

In [None]:
# Pull the first three elements out of the debug result
distD, selCables, curN = xx[0], xx[1], xx[2]
# Call ict.get_nearest_date on every row of distD, passing selCables as its second argument
distD.apply(ict.get_nearest_date, axis=1, args=(selCables,))