# Processing COMTRADE flow data

This script attempts to process the COMTRADE flow data for energy, coal, oil and gas. The goal is to extract the import, export, and flow data as geospatial information; the tools for this are found in the infra_tools library (this notebook imports them from `infrasap.process_flows`).

In [1]:
import sys, os, importlib

import pandas as pd
import geopandas as gpd
import numpy as np

from shapely.geometry import Point, LineString

sys.path.append('../')

from infrasap import process_flows

In [2]:
# Input tables: one CSV per commodity group (energy, coal, oil, gas).
input_file_energy = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_energy_flows.csv"
# NOTE(review): "Comtreade" is part of the literal path; presumably it matches the
# file name on disk, so do not "correct" it without renaming the file.
input_file_coal = "/home/wb411133/data/Projects/INFRA/FLOWS/Coal Ex-Im_UN Comtreade_June 18.csv"
input_file_oil = "/home/wb411133/data/Projects/INFRA/FLOWS/Oil Ex-Im_UN Comtrade_June 18_.csv"
input_file_gas = "/home/wb411133/data/Projects/INFRA/FLOWS/Gas Ex-Im_UN Comtrade_June 18.csv"
# Empty placeholder; a later cell reassigns out_folder per commodity and file type.
out_folder = ""
# Boundary layer (national centroids, judging by the file name) that is passed to
# comtrade_flow.initialize() and used for the spatial index further down.
global_boundaries = "/home/wb411133/data/Projects/INFRA/FLOWS/national_centroids.shp"

# Load the layer and reproject it to EPSG:4326 when it is stored in another CRS.
# NOTE(review): the {'init': ...} dict form is the legacy pyproj spelling; whether the
# comparison below behaves the same on newer geopandas/pyproj should be confirmed.
inB = gpd.read_file(global_boundaries)
if inB.crs != {'init':'epsg:4326'}:
    inB = inB.to_crs({'init':'epsg:4326'})



In [3]:
# Trial run for one commodity (oil) before looping over all of them.
# file_def layout: [input CSV, commodity label, argument forwarded to initialize()].
# The meaning of the third entry is defined inside process_flows and is not visible here.
file_def = [input_file_oil, "Oil", [8]]
in_file, commodity = file_def[:2]

# NOTE: the variable is called coal_flows but holds the oil flows in this cell.
coal_flows = process_flows.comtrade_flow(in_file, commodity)
coal_flows.initialize(file_def[-1], inB)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  country_flows['geometry'] = country_flows.apply(lambda x: generate_line_string(x, line_type), axis=1)


In [None]:
# Re-import process_flows so edits made to the module since the first import are used.
importlib.reload(process_flows)

# Each entry: [input CSV, commodity label, argument forwarded to initialize()].
for file_def in [
        [input_file_energy, "Energy", [3]],
        [input_file_coal, "Coal", [8]],
        [input_file_oil, "Oil", [8]],
        [input_file_gas, "Gas", [8]],
]:
    in_file, commodity = file_def[:2]
    print(commodity)

    # The name coal_flows is reused for every commodity in the loop.
    coal_flows = process_flows.comtrade_flow(in_file, commodity)
    coal_flows.initialize(file_def[-1], inB)

    # One GEOJSON folder per commodity under the FLOWS project directory.
    coal_flows.save_simple_layers(
        "/home/wb411133/data/Projects/INFRA/FLOWS/%s/GEOJSON" % commodity,
        "GEOJSON",
    )

In [None]:
# Re-import process_flows so edits made to the module since the first import are used.
importlib.reload(process_flows)
base_out_folder = "/home/wb411133/data/Projects/INFRA/FLOWS"

# Processed flow objects, keyed by commodity label, kept for later inspection.
data_res = {}

# Each entry: [input CSV, commodity label, argument forwarded to initialize()].
# Gas and Oil are currently switched off.
for file_def in [
        [input_file_energy, "Energy", [3]],
        [input_file_coal, "Coal", [8]],
        #[input_file_gas, "Gas", [8,1]],
        #[input_file_oil, "Oil", [8]]
]:
    print("Processing %s" % file_def[1])
    data_flows = process_flows.comtrade_flow(*file_def[:2])
    data_flows.initialize(file_def[2], inB)

    # Outputs go to <base_out_folder>/<commodity>/<file type>.
    select_folder = os.path.join(base_out_folder, file_def[1])
    for file_type in ["SHP"]:
        out_folder = os.path.join(select_folder, file_type)
        data_flows.save(out_folder, file_type)

    data_res[file_def[1]] = data_flows

# Processing airport data from Heinrich

In [11]:
# Airport route table: per the preview below, one row per Orig/Dest/Year with TotalSeats
# and the coordinates of both airports.
airport_flows = "/home/wb411133/data/Projects/INFRA/FLOWS/Airport_Volume.csv"

inD = pd.read_csv(airport_flows)
# Kept disabled: the aggregation further down reads 'Country Name', so the column must stay.
#inD.drop(['Country Name', 'Country Name.1'], axis=1, inplace=True)

In [12]:
# Preview the first rows of the raw airport route table.
inD.head()

Unnamed: 0,Orig,Country Name,Name,Dest,Country Name.1,NAme,Year,Country1,Country2,TotalSeats,Airport1Latitude,Airport1Longitude,Airport2Latitude,Airport2Longitude
0,GYD,Azerbaijan,Heydar Aliyev,KBL,Afghanistan,Kabul International,2005,Afghanistan,Azerbaijan,2955.106,40.466599,50.052502,34.5639,69.213898
1,KBL,Afghanistan,Kabul International,GYD,Azerbaijan,Heydar Aliyev,2005,Afghanistan,Azerbaijan,2955.106,34.5639,69.213898,40.466599,50.052502
2,KBL,Afghanistan,Kabul International,URC,China,Diwopu International,2005,Afghanistan,China,597.954,34.5639,69.213898,43.906898,87.474098
3,URC,China,Diwopu International,KBL,Afghanistan,Kabul International,2005,Afghanistan,China,597.954,43.906898,87.474098,34.5639,69.213898
4,DEL,India,Indira Gandhi Intl,KBL,Afghanistan,Kabul International,2005,Afghanistan,India,3457.734,28.573601,77.1008,34.5639,69.213898


In [13]:
# Summarise each origin airport per year: seats are summed over all of its routes,
# while the name, country and coordinates are taken from the first row of the group.
d = {
    'Name': 'first',
    'TotalSeats': 'sum',
    "Country Name": "first",
    "Airport1Latitude": 'first',
    "Airport1Longitude": 'first',
}
inD_grouped = inD.groupby(['Orig', 'Year'])
airport_locations = inD_grouped.agg(d)

# Result is indexed by (Orig, Year).
airport_locations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,TotalSeats,Country Name,Airport1Latitude,Airport1Longitude
Orig,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAA,2016,Anaa,18670.897,French Polynesia,-17.355101,-145.508499
AAB,2010,Arrabury Airport,8250.032,Australia,-26.700001,141.041702
AAB,2015,Arrabury Airport,736.61,Australia,-26.700001,141.041702
AAB,2016,Arrabury Airport,1308.566,Australia,-26.700001,141.041702
AAC,2006,El Arish International,9974.566,Egypt,31.0769,33.834099


In [None]:
# Collapse the (Orig, Year) index to one row per airport code by taking the last
# entry of every column within each Orig group.
# NOTE(review): TotalSeats therefore becomes the value of the last year present for
# that airport (presumably the most recent, as the grouped index is sorted) — confirm
# that a single-year figure, not a multi-year total, is what is wanted here.
airport_locations = airport_locations.groupby(level=0).last()
airport_locations.head()

In [None]:
# Write the per-airport table next to the input CSV; the output name is the input
# path with ".csv" replaced by "_airport_locations.csv".
airport_locations.to_csv(airport_flows.replace(".csv", "_airport_locations.csv"))

In [None]:
#Calculate out_flows
# Total seats per (Country1, Year), kept in long form: one row per country/year pair.
# NOTE(review): in the rows previewed above, Country1/Country2 are the same for both
# directions of a route (GYD->KBL and KBL->GYD both show Afghanistan/Azerbaijan), so
# these columns may not encode origin/destination. Confirm before interpreting the
# two tables as directional out/in flows.
d = {'TotalSeats':'sum'}
inD_grouped = inD.groupby(['Country1', "Year"])
out_flows = inD_grouped.agg(d).reset_index()

#Calculate in_flows
# Total seats per (Country2, Year).
d = {'TotalSeats':'sum'}
inD_grouped = inD.groupby(['Country2', "Year"])
in_flows = inD_grouped.agg(d)

# Pivot the Year level into columns: one row per Country2, one column per year,
# with Country2 restored as the first column by reset_index().
in_flows = in_flows['TotalSeats'].unstack().reset_index()

In [None]:
def get_data(x):
    """Return the last non-NaN value found in ``x`` from position 2 onwards.

    Written for ``DataFrame.apply(get_data, axis=1)``: ``x`` is one row and the
    scan starts at position 2, so the first two entries are never considered.

    Returns ``np.nan`` when no non-NaN value is found. Previously that case
    indexed into an empty list and raised ``IndexError``, which aborted the
    whole ``apply`` because of a single row without data.
    """
    # NOTE(review): in_flows appears to be laid out as [Country2, <year columns...>],
    # in which case starting at 2 also skips the first year column — confirm the offset.
    values = [y for y in x[2:] if not np.isnan(y)]
    return values[-1] if values else np.nan
# Add a CURRENT column holding, for each row, the value get_data picks from that row.
in_flows['CURRENT'] = in_flows.apply(get_data, axis=1)
# Show (rows, columns) as a quick check of the resulting table size.
in_flows.shape

# Calculate airport and port flows

In [46]:
# Inputs for the combined flow layers: port-to-port flow table, port point locations,
# and the same airport volume CSV that was loaded earlier in the notebook.
port_data = "/home/public/Data/GLOBAL/INFRA/PORTS/Port_flow_data_Q22020.csv"
port_locations = "/home/public/Data/GLOBAL/INFRA/PORTS/attributed_ports.shp"
airport_data = "/home/wb411133/data/Projects/INFRA/FLOWS/Airport_Volume.csv"

inP = pd.read_csv(port_data)
inP_loc = gpd.read_file(port_locations)
inA = pd.read_csv(airport_data)
# For the simplest version, we are only calculating flows for a single year (2019)
inA = inA.loc[inA['Year'] == 2019]

In [43]:
# Collapse the airport table to one row per (Orig, Dest) route: seats are summed,
# every descriptive column takes the first value seen for that route.
agg = {
    'Name': 'first',
    'NAme': 'first',
    'Country1': 'first',
    'Country2': 'first',
    'TotalSeats': 'sum',
    "Country Name": "first",
    "Airport1Latitude": 'first',
    "Airport1Longitude": 'first',
    "Airport2Latitude": 'first',
    "Airport2Longitude": 'first',
}
inA_g = inA.groupby(['Orig', 'Dest']).agg(agg).reset_index()

# One straight two-point line per route, built as (longitude, latitude) points
# from airport 1 to airport 2.
geoms = inA_g.apply(
    lambda route: LineString([
        Point(route['Airport1Longitude'], route['Airport1Latitude']),
        Point(route['Airport2Longitude'], route['Airport2Latitude']),
    ]),
    axis=1,
)
inA_g = gpd.GeoDataFrame(inA_g, geometry=geoms, crs={'init':'epsg:4326'})


In [71]:
# Preview the aggregated routes together with their line geometries.
inA_g.head()

Unnamed: 0,Orig,Dest,Name,NAme,Country1,Country2,TotalSeats,Country Name,Airport1Latitude,Airport1Longitude,Airport2Latitude,Airport2Longitude,geometry
0,AAE,CDG,Rabah Bitat,Charles De Gaulle,Algeria,France,23259.544,Algeria,36.822201,7.8094,49.009701,2.5486,"LINESTRING (7.809400082000001 36.82220078, 2.5..."
1,AAE,IST,Rabah Bitat,Istanbul Airport,Algeria,Turkey,15841.448,Algeria,36.822201,7.8094,40.9767,28.8153,"LINESTRING (7.809400082000001 36.82220078, 28...."
2,AAE,LYS,Rabah Bitat,Satolas,Algeria,France,26933.928,Algeria,36.822201,7.8094,45.725601,5.0817,"LINESTRING (7.809400082000001 36.82220078, 5.0..."
3,AAE,MRS,Rabah Bitat,Marignane,Algeria,France,46813.732,Algeria,36.822201,7.8094,43.4356,5.2164,"LINESTRING (7.809400082000001 36.82220078, 5.2..."
4,AAE,ORY,Rabah Bitat,Orly,Algeria,France,23086.224,Algeria,36.822201,7.8094,48.728901,2.3572,"LINESTRING (7.809400082000001 36.82220078, 2.3..."


In [72]:
# Write the airport route lines to a .shp file in the user's temp folder.
inA_g.to_file("/home/wb411133/temp/airport_flows.shp")

In [49]:
# attribute port flows
# Sum 'Quarterly deployed capacity (TEU)' over all rows sharing the same
# (Port1, Port2) pair; reset_index() turns the pair back into regular columns.
agg = {'Quarterly deployed capacity (TEU)':'sum'}
inP_g = inP.groupby(["Port1","Port2"])
inP_flows = inP_g.agg(agg).reset_index()

In [73]:
# Attach a point location to both ends of every port pair by joining the port
# layer twice on LOCODE: first for Port1 (-> PT_1), then for Port2 (-> PT_2).
# Both joins use merge's default join type, so pairs whose port code has no
# match in inP_loc do not appear in the result.
inP_geoms = (
    inP_flows
    .merge(inP_loc.loc[:, ['LOCODE', 'geometry']], left_on="Port1", right_on="LOCODE")
    .rename(columns={
        'Quarterly deployed capacity (TEU)': 'FLOWS',
        'LOCODE': 'LOCODE_1',
        'geometry': 'PT_1',
    })
    .merge(inP_loc.loc[:, ['LOCODE', 'geometry']], left_on="Port2", right_on="LOCODE")
    .rename(columns={'LOCODE': 'LOCODE_2', 'geometry': 'PT_2'})
    # The join keys duplicate Port1/Port2 and are no longer needed.
    .drop(['LOCODE_1', "LOCODE_2"], axis=1)
)
inP_geoms.head()

Unnamed: 0,Port1,Port2,FLOWS,PT_1,PT_2
0,AEAJM,AEJEA,40755.0,POINT (55.47878 25.40177),POINT (55.10811 25.00255)
1,AEAUH,AEJEA,13948.29,POINT (54.36666666666667 24.46666666666667),POINT (55.10811 25.00255)
2,AEKHL,AEJEA,1682015.0,POINT (54.66666666666666 24.83333333333333),POINT (55.10811 25.00255)
3,AEKLF,AEJEA,91864.5,POINT (56.35 25.33333333333333),POINT (55.10811 25.00255)
4,AEMKH,AEJEA,145278.2,POINT (55.36666666666667 25.35),POINT (55.10811 25.00255)


In [76]:
# Spatial index over the boundary layer (inB), used for the nearest-feature lookup below.
sindex = inB.sindex

In [87]:
# Spot check: ISO3 code of the boundary feature nearest to the first origin port.
# The spatial index reports hits as integer positions within inB, not as index
# labels, so the row is fetched with .iloc; .loc only gave the same answer while
# inB carried a default 0..n-1 index.
inB.iloc[list(sindex.nearest([inP_geoms['PT_1'].iloc[0].x, inP_geoms['PT_1'].iloc[0].y]))[0]]['ISO3']

'ARE'

In [95]:
# attribute port flows with origin and destination country
# Disabled alternative: look the country up spatially via the nearest boundary feature.
#inP_geoms['Country1'] = inP_geoms['PT_1'].apply(lambda x: inB.loc[list(sindex.nearest([x.x, x.y]))[0]]['ISO3'])
#inP_geoms['Country2'] = inP_geoms['PT_2'].apply(lambda x: inB.loc[list(sindex.nearest([x.x, x.y]))[0]]['ISO3'])
# Active approach: take the first two characters of each port code
# (presumably the country prefix of the LOCODE, e.g. "AE" — confirm).
inP_geoms['Country1'] = [locode[:2] for locode in inP_geoms['Port1']]
inP_geoms['Country2'] = [locode[:2] for locode in inP_geoms['Port2']]
inP_geoms.head()

Unnamed: 0,Port1,Port2,FLOWS,PT_1,PT_2,Country1,Country2,geometry
0,AEAJM,AEJEA,40755.0,POINT (55.47878 25.40177),POINT (55.10811 25.00255),AE,AE,"LINESTRING (55.47878 25.40177, 55.10811 25.00255)"
1,AEAUH,AEJEA,13948.29,POINT (54.36666666666667 24.46666666666667),POINT (55.10811 25.00255),AE,AE,LINESTRING (54.36666666666667 24.4666666666666...
2,AEKHL,AEJEA,1682015.0,POINT (54.66666666666666 24.83333333333333),POINT (55.10811 25.00255),AE,AE,LINESTRING (54.66666666666666 24.8333333333333...
3,AEKLF,AEJEA,91864.5,POINT (56.35 25.33333333333333),POINT (55.10811 25.00255),AE,AE,"LINESTRING (56.35 25.33333333333333, 55.10811 ..."
4,AEMKH,AEJEA,145278.2,POINT (55.36666666666667 25.35),POINT (55.10811 25.00255),AE,AE,"LINESTRING (55.36666666666667 25.35, 55.10811 ..."


In [96]:
# Build one straight LineString per port pair, from the Port1 point (PT_1) to the
# Port2 point (PT_2).
flow_geoms = inP_geoms.apply(lambda x: LineString([x['PT_1'], x['PT_2']]), axis=1)
# Promote the table to a GeoDataFrame with the lines as its geometry, tagged EPSG:4326.
inP_geoms = gpd.GeoDataFrame(inP_geoms, geometry=flow_geoms, crs={'init':'epsg:4326'})
# The two endpoint columns are dropped before writing (presumably because the output
# format stores a single geometry per feature); inP_geoms itself keeps them.
inP_geoms.drop(["PT_1","PT_2"], axis=1).to_file("/home/wb411133/temp/port_flows.shp")