In [148]:
import sys, os, importlib

import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import Point


sys.path.append('../')
sys.path.append('../../GOST')

from infrasap import process_flows
import GOSTRocks.geocode as geocode

In [100]:
output_folder = 'P:/data/Global/INFRA/PORTS'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
raw_folder = os.path.join(output_folder, "RAW_LOCODES")
if not os.path.exists(raw_folder):
    os.makedirs(raw_folder)

In [2]:
port_data = "J:/Data/GLOBAL/INFRA/PORTS/Port_flow_data_Q22020.csv"
inD = pd.read_csv(port_data)

In [3]:
# Get a list of country codes to extract
inD['ISO2'] = inD['Port1'].apply(lambda x: x[:2])
countries = inD['ISO2'].unique()
countries

array(['AE', 'AG', 'AI', 'AL', 'AN', 'AO', 'AR', 'AS', 'AU', 'AW', 'BB',
       'BD', 'BE', 'BG', 'BH', 'BJ', 'BM', 'BN', 'BQ', 'BR', 'BS', 'BZ',
       'CA', 'CD', 'CG', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', 'CU',
       'CV', 'CW', 'CX', 'CY', 'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ', 'EC',
       'EE', 'EG', 'ER', 'ES', 'FI', 'FJ', 'FM', 'FO', 'FR', 'GA', 'GB',
       'GD', 'GE', 'GF', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR',
       'GT', 'GU', 'GW', 'GY', 'HK', 'HN', 'HR', 'HT', 'ID', 'IE', 'IL',
       'IN', 'IQ', 'IR', 'IS', 'IT', 'JM', 'JO', 'JP', 'KE', 'KH', 'KI',
       'KM', 'KN', 'KR', 'KW', 'KY', 'LB', 'LC', 'LK', 'LR', 'LT', 'LV',
       'LY', 'MA', 'MD', 'ME', 'MG', 'MH', 'MM', 'MP', 'MQ', 'MR', 'MS',
       'MT', 'MU', 'MV', 'MX', 'MY', 'MZ', 'NA', 'NC', 'NF', 'NG', 'NI',
       'NL', 'NO', 'NR', 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG', 'PH', 'PK',
       'PL', 'PR', 'PT', 'PW', 'PY', 'QA', 'RE', 'RO', 'RU', 'SA', 'SB',
       'SC', 'SD', 'SE', 'SG', 'SI', 'SL', 'SN', 'S

In [74]:
def convert_coords(x):
    coords_split = x.split(" ")
    lat = coords_split[0]
    lon = coords_split[1]
    def get_number(y):        
        num = float(y[:-3]) + ((float(y[-3:-1])/60))
        if (y[-1] == 'W') or (y[-1] == "S"):
            num = num * -1
        return(num)
    return(Point([get_number(lon), get_number(lat)]))

In [None]:
ports = inD['Port1'].unique()

In [202]:
def process_ports(country, country_data):
    if not os.path.exists(country_data):
        # Read data form UN website, save for future processing
        tempD = pd.read_html('https://service.unece.org/trade/locode/%s.htm' % country.lower())
        curD = tempD[2]
        curD.columns = curD.iloc[0]
        curD = curD.drop(0)
        curD.to_csv(country_data)
    else:
        curD = pd.read_csv(country_data, index_col=0)
    curD['LOCODE'] = curD['LOCODE'].apply(lambda x: x.replace(" ",""))
    
    #Drop records with no coordinates
    no_coords_idx = curD['Coordinates'].apply(lambda x: x is np.nan)
    badD = curD.loc[no_coords_idx]
    goodD = curD.loc[~no_coords_idx]
    #if the badD are in the list of port flows, save those
    important_ports = badD.loc[badD['LOCODE'].isin(ports)]

    curD_geom = goodD['Coordinates'].apply(convert_coords)
    curgpd = gpd.GeoDataFrame(goodD, geometry=curD_geom, crs={'init':'epsg:4326'})
    return({'ALL':curD, 'GOOD':curgpd,'BAD':important_ports})

country = "CA"
res = process_ports(country, os.path.join(raw_folder, "%s.csv" % country))
print(res['ALL'].shape)
print(res['GOOD'].shape)
print(res['BAD'].shape)

(3194, 11)
(2422, 12)
(7, 11)


In [203]:
try:
    del(final)
    del(missing_ports)
except:
    pass
for country in countries:
    print(country)
    try:
        res = process_ports(country, os.path.join(raw_folder, "%s.csv" % country))
    except:
        print("ERROR")
    try:
        missing_ports = missing_ports.append(res['BAD'])
    except:
        missing_ports = res['BAD']
    try:
        final = final.append(res['GOOD'])
    except:
        final = res['GOOD']


AE
AG
AI
AL
AN
ERROR
AO
AR
AS
AU
AW
BB
BD
BE
BG
BH
BJ
BM
BN
BQ
BR
BS
BZ
CA
CD
CG
CI
CK
CL
CM
CN
CO
CR
CU
CV
CW
CX
CY
DE
DJ
DK
DM
DO
DZ
EC
EE
EG
ER
ES
FI
FJ
FM
FO
FR
GA
GB
GD
GE
GF
GH
GI
GL
GM
GN
GP
GQ
GR
GT
GU
GW
GY
HK
HN
HR
HT
ID
IE
IL
IN
IQ
IR
IS
IT
JM
JO
JP
KE
KH
KI
KM
KN
KR
KW
KY
LB
LC
LK
LR
LT
LV
LY
MA
MD
ME
MG
MH
MM
MP
ERROR
MQ
MR
MS
MT
MU
MV
MX
MY
MZ
NA
NC
NF
ERROR
NG
NI
NL
NO
NR
NZ
OM
PA
PE
PF
PG
PH
PK
PL
PR
PT
PW
PY
QA
RE
RO
RU
SA
SB
SC
SD
SE
SG
SI
SL
SN
SO
SR
ST
ERROR
SV
SY
TC
TG
TH
TL
TN
TO
TR
TT
TV
ERROR
TW
TZ
UA
US
UY
VC
VE
VG
VI
VN
VU
WF
WS
YE
YT
ZA


In [204]:
#Add country name to missing ports
import pycountry
importlib.reload(geocode)
#pycountry.countries.get(alpha_2='CA')

missing_ports['Country'] = missing_ports['LOCODE'].apply(lambda x: pycountry.countries.get(alpha_2=x[:2]).name)
locations = missing_ports.apply(lambda x: geocode.getLocation("%s, %s" % (x['NameWoDiacritics'], x['Country'])), axis=1)

Mina Saqr, United Arab Emirates was not found
Muara, Brunei Darussalam was not found
Freeport, Grand Bahama, Bahamas was not found
Boma, Congo, The Democratic Republic of the was not found
Matadi, Congo, The Democratic Republic of the was not found
Dalian Zhoushuizi International Apt, China was not found
Fuzhou Changle International Apt, China was not found
Haikou Meilan International Apt, China was not found
Huangpu Pt, China was not found
Jieyang Chaoshan Apt, China was not found
Jingtang Pt, China was not found
Nanjing Pt, China was not found
Ningbo Lishe International Apt, China was not found
Qingdao Liuting International Apt, China was not found
Qinzhou Pt, China was not found
Shanghai Hongqiao International Apt, China was not found
Shenzhen Baoan International Apt, China was not found
Wenzhou Longwan International Apt, China was not found
Xiamen Gaoqi International Apt, China was not found
Yantai Laishan International Apt, China was not found
Zhanjiang Potou Apt, China was not fo

In [205]:
def get_point(x):
    if float(x['location']['lng']) < -180:
        return(np.nan)
    try:
        return(Point(float(x['location']['lng']), float(x['location']['lat'])))
    except:
        return(np.nan)

geoms = locations.apply(get_point)
missing_ports['geometry'] = geoms

still_missing_ports = missing_ports.loc[missing_ports['geometry'].apply(lambda x: type(x) != Point)]
missing_ports = missing_ports.loc[missing_ports['geometry'].apply(lambda x: type(x) == Point)]

In [240]:
still_missing_ports.to_csv(os.path.join(output_folder, "still_missing_ports.csv"))

In [206]:
missing_ports.head()

Unnamed: 0,Ch,LOCODE,Name,NameWoDiacritics,SubDiv,Function,Status,Date,IATA,Coordinates,Remarks,Country,geometry
5,,AEAJM,Ajman,Ajman,,1-3-----,RL,103.0,QAJ,,,United Arab Emirates,POINT (55.47878 25.40177)
33,,AEJEA,Jebel Ali,Jebel Ali,,1-------,QQ,8103.0,,,,United Arab Emirates,POINT (55.10811 25.00255)
1,,AIAXA,Anguilla,Anguilla,,---45---,AI,9601.0,,,,Anguilla,POINT (-63.09375 18.17648)
5,,ALDRZ,Durrës,Durres,,1-------,RL,103.0,,,,Albania,POINT (19.45469 41.32355)
5,,ALDRZ,Durrës,Durres,,1-------,RL,103.0,,,,Albania,POINT (19.45469 41.32355)


In [207]:
final.head()

Unnamed: 0,Ch,LOCODE,Name,NameWoDiacritics,SubDiv,Function,Status,Date,IATA,Coordinates,Remarks,geometry
1,,AEABU,Abu al Bukhoosh,Abu al Bukhoosh,,1-------,RL,307.0,,2529N 05308E,,POINT (53.13333 25.48333)
2,,AEAUH,Abu Dhabi,Abu Dhabi,AZ,1-345---,AI,1101.0,,2428N 05422E,,POINT (54.36667 24.46667)
3,,AEAMU,Abu Musa,Abu Musa,,1-------,RL,201.0,,2552N 05501E,,POINT (55.01667 25.86667)
4,,AEARP,Ahmed Bin Rashid Port,Ahmed Bin Rashid Port,AZ,1-------,RL,1407.0,,2532N 05533E,,POINT (55.55000 25.53333)
7,,AEALB,Al Barsha,Al Barsha,DU,--3-----,RL,1701.0,,2506N 05511E,,POINT (55.18333 25.10000)


In [208]:
final['Country'] = final['LOCODE'].apply(lambda x: pycountry.countries.get(alpha_2=x[:2]).name)

In [209]:
print(final.shape)

(79846, 13)


In [210]:
final = final.append(missing_ports).reset_index()

In [211]:
# Save all data to file
final.to_file(os.path.join(output_folder, "all_ports.shp"))

In [212]:
final.shape

(80210, 14)

In [213]:
ports = inD['Port1'].unique()
inP = final.loc[final['LOCODE'].isin(ports)]
inP.shape

(856, 14)

In [214]:
len(ports)

939

In [216]:
# Join traffic attributes to ports dataset
inP.to_file(os.path.join(output_folder, "selected_ports.shp"))

In [235]:
agg = {"Quarterly deployed capacity (TEU)":"sum"}

inD_agg = inD.groupby(['Port1'])

outflows = inD_agg.agg(agg).reset_index()
outflows.columns = ['Port1', 'outflows']

inflows = inD_agg.agg(agg).reset_index()
inflows.columns = ['Port2', 'inflows']

combo = pd.merge(inP, outflows, left_on='LOCODE', right_on='Port1')
combo = pd.merge(combo, inflows, left_on='LOCODE', right_on='Port2')

combo = combo.drop(['SubDiv','Coordinates','Date','index','Ch','IATA','Remarks','Port1','Port2'], axis=1)

combo.to_file(os.path.join(output_folder, "attributed_ports.shp"))

In [236]:
output_folder

'P:/data/Global/INFRA/PORTS'

In [233]:
inD.head()

Unnamed: 0,Date,Port1,Port2,No of services,Average annual frequency,Average number of ships,Maximum number of ships,Minimum number of ships,No of operators,No of Alliances,...,StDevP ship size (TEU),Variance ship size (TEU),Average ship age (months),Min ship age (months),Max ship age (months),StDevP ship age (months),Variance ship age (months),Annual deployed capacity (TEU),Quarterly deployed capacity (TEU),ISO2
0,2020Q2,AEAJM,AEJEA,3,190.666667,1.666667,2,1,1,0,...,156.664618,24543.80247,375.0,272,413,46.425568,2155.333333,163020.0,40755.0,AE
1,2020Q2,AEAJM,AEMKH,2,208.0,2.0,2,2,1,0,...,185.90858,34562.0,353.5,272,413,55.984373,3134.25,112320.0,28080.0,AE
2,2020Q2,AEAJM,AEQIW,1,208.0,2.0,2,2,1,0,...,1.0,1.0,405.0,397,413,8.0,64.0,18928.0,4732.0,AE
3,2020Q2,AEAUH,AEJEA,2,180.0,5.0,8,2,2,0,...,261.604357,68436.83951,266.777778,139,413,121.814775,14838.83951,55793.14286,13948.28571,AE
4,2020Q2,AEAUH,AEMKH,1,312.0,2.0,2,2,1,0,...,1.0,1.0,402.5,392,413,10.5,110.25,28392.0,7098.0,AE


In [238]:
len(ports)

939

In [239]:
combo.shape

(856, 9)