# Processing COMTRADE flow data

This notebook processes the UN COMTRADE flow data for energy, coal, oil and gas. The goal is to extract the import, export, and flow data as geospatial information; the tools for this are in the `process_flows` module of the `infrasap` library.

In [14]:
import sys, os, importlib

import pandas as pd
import geopandas as gpd
import numpy as np

sys.path.append('../')

from infrasap import process_flows

In [15]:
# Input CSVs, one per commodity (energy, coal, gas, oil).
# NOTE(review): these are absolute, user-specific paths, so the notebook only
# runs on a machine where this directory exists.
input_file_energy = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_energy_flows.csv"
input_file_coal = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_coal_flows.csv"
input_file_gas = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_gas_flows.csv"
input_file_oil = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_oil_flows.csv"
# Placeholder only; out_folder is reassigned inside the batch-export loop in a later cell.
out_folder = ""
# Layer passed to comtrade_flow.initialize() in the cells below
# (presumably one centroid per country, going by the file name - confirm).
global_boundaries = "/home/wb411133/data/Projects/INFRA/FLOWS/national_centroids.shp"

inB = gpd.read_file(global_boundaries)
# Reproject to EPSG:4326 unless the layer already reports that CRS.
# NOTE(review): the {'init': ...} dict is the legacy way of writing a CRS; confirm
# that this comparison still behaves as intended on the installed geopandas version.
if inB.crs != {'init':'epsg:4326'}:
    inB = inB.to_crs({'init':'epsg:4326'})



In [48]:
# Re-import process_flows so that edits made to the module since the first
# import are used by the calls below.
importlib.reload(process_flows)
# Build the flow object for the energy CSV, labelled "Energy".
energy_flows = process_flows.comtrade_flow(input_file_energy, "Energy")
# NOTE(review): the meaning of [3] is not visible in this notebook - confirm
# against comtrade_flow.initialize(). inB is the boundaries layer loaded above.
energy_flows.initialize([3], inB)
# Shapefile export is disabled here; only the GEOJSON "simple layers" are written.
#energy_flows.save("/home/wb411133/data/Projects/INFRA/FLOWS/ENERGY/SHP", "SHP")
energy_flows.save_simple_layers("/home/wb411133/data/Projects/INFRA/FLOWS/ENERGY/GEOJSON", "GEOJSON")


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  country_flows = inD.loc[inD['Partner ISO'] != "WLD"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  for n_field in ['Trade Value (US$)', 'Qty']:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [None]:
# Pick up any edits made to the process_flows module since it was imported.
importlib.reload(process_flows)

base_out_folder = "/home/wb411133/data/Projects/INFRA/FLOWS"

# Processed flow objects, keyed by commodity label.
data_res = {}

# Each definition is [input CSV, commodity label, first argument of initialize()].
# The gas and oil definitions are currently switched off.
for file_def in [
        [input_file_energy, "Energy", [3]],
        [input_file_coal, "Coal", [8]],
        #[input_file_gas, "Gas", [8,1]],
        #[input_file_oil, "Oil", [8]]
]:
    in_file, commodity, init_arg = file_def
    print("Processing %s" % commodity)

    data_flows = process_flows.comtrade_flow(in_file, commodity)
    data_flows.initialize(init_arg, inB)

    # Output layout: <base_out_folder>/<commodity>/<file type>
    select_folder = os.path.join(base_out_folder, commodity)
    for file_type in ["SHP"]:
        out_folder = os.path.join(select_folder, file_type)
        data_flows.save(out_folder, file_type)

    # Keep the processed object around for inspection in later cells.
    data_res[commodity] = data_flows

# Processing airport data from Heinrich

In [3]:
# CSV of airport-to-airport seat volumes; its column layout is shown by the
# inD.head() preview in the next cell.
airport_flows = "/home/wb411133/data/Projects/INFRA/FLOWS/Airport_Volume.csv"

inD = pd.read_csv(airport_flows)
# Left disabled: the airport-location aggregation further down reads 'Country Name'.
#inD.drop(['Country Name', 'Country Name.1'], axis=1, inplace=True)

In [13]:
# Preview the first rows to check the column layout of the airport table.
inD.head()

Unnamed: 0,Orig,Country Name,Name,Dest,Country Name.1,NAme,Year,Country1,Country2,TotalSeats,Airport1Latitude,Airport1Longitude,Airport2Latitude,Airport2Longitude
0,GYD,Azerbaijan,Heydar Aliyev,KBL,Afghanistan,Kabul International,2005,Afghanistan,Azerbaijan,2955.106,40.466599,50.052502,34.5639,69.213898
1,KBL,Afghanistan,Kabul International,GYD,Azerbaijan,Heydar Aliyev,2005,Afghanistan,Azerbaijan,2955.106,34.5639,69.213898,40.466599,50.052502
2,KBL,Afghanistan,Kabul International,URC,China,Diwopu International,2005,Afghanistan,China,597.954,34.5639,69.213898,43.906898,87.474098
3,URC,China,Diwopu International,KBL,Afghanistan,Kabul International,2005,Afghanistan,China,597.954,43.906898,87.474098,34.5639,69.213898
4,DEL,India,Indira Gandhi Intl,KBL,Afghanistan,Kabul International,2005,Afghanistan,India,3457.734,28.573601,77.1008,34.5639,69.213898


In [11]:
# extract airport locations
inD_grouped = inD.groupby(['Orig','Year'])
d = {'Name':'first','TotalSeats':'sum', "Country Name":"first","Airport1Latitude":'first', "Airport1Longitude":'first'}
airport_locations = inD_grouped.agg(d)
airport_locations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,TotalSeats,Country Name,Airport1Latitude,Airport1Longitude
Orig,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAA,2016,Anaa,18670.897,French Polynesia,-17.355101,-145.508499
AAB,2010,Arrabury Airport,8250.032,Australia,-26.700001,141.041702
AAB,2015,Arrabury Airport,736.61,Australia,-26.700001,141.041702
AAB,2016,Arrabury Airport,1308.566,Australia,-26.700001,141.041702
AAC,2006,El Arish International,9974.566,Egypt,31.0769,33.834099


In [12]:
# Collapse the (Orig, Year) index to a single row per airport code. For each
# column, last() keeps the final non-null value within the airport's group.
airport_locations = airport_locations.groupby(level='Orig').last()
airport_locations.head()

Unnamed: 0_level_0,Name,TotalSeats,Country Name,Airport1Latitude,Airport1Longitude
Orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAA,Anaa,18670.897,French Polynesia,-17.355101,-145.508499
AAB,Arrabury Airport,1308.566,Australia,-26.700001,141.041702
AAC,El Arish International,1386.56,Egypt,31.0769,33.834099
AAE,Rabah Bitat,135934.876,Algeria,36.822201,7.8094
AAL,Aalborg,338519.958,Denmark,57.093102,9.85


In [None]:
# Write the per-airport table to the input CSV's path with ".csv" replaced by
# "_airport_locations.csv".
airport_locations.to_csv(airport_flows.replace(".csv", "_airport_locations.csv"))

In [None]:
# Country-level seat totals per year.
#
# The Country1 / Country2 columns of inD are NOT origin / destination: in the
# inD.head() preview both directions of a route (e.g. GYD->KBL and KBL->GYD)
# carry the same Country1/Country2 pair, so grouping by them does not separate
# outgoing from incoming seats. The direction-aware columns are 'Country Name'
# (country of the Orig airport) and 'Country Name.1' (country of the Dest airport).
#
# The grouping keys are renamed so the resulting tables keep their previous key
# labels ('Country1' in out_flows, 'Country2' in in_flows) for any code that
# reads them by name; they now hold the origin / destination country respectively.
d = {'TotalSeats':'sum'}

#Calculate out_flows: seats summed per origin country and year (long format)
origin_country = inD['Country Name'].rename('Country1')
inD_grouped = inD.groupby([origin_country, "Year"])
out_flows = inD_grouped.agg(d).reset_index()

#Calculate in_flows: seats summed per destination country and year
dest_country = inD['Country Name.1'].rename('Country2')
inD_grouped = inD.groupby([dest_country, "Year"])
in_flows = inD_grouped.agg(d)

# Pivot to wide format: one row per country, one column per year
# (years without data for a country become NaN).
in_flows = in_flows['TotalSeats'].unstack().reset_index()

In [None]:
# Scratch check on one row of in_flows (position 1): list the non-missing
# values found from position 2 onwards.
x = in_flows.iloc[1].values
[value for value in x[2:] if not np.isnan(value)]

In [None]:
def get_data(x):
    """Return the last non-NaN value of a row, ignoring its first two entries.

    Parameters
    ----------
    x : sequence
        Row values. Entries from position 2 onwards must be numeric, because
        they are tested with ``np.isnan``.

    Returns
    -------
    The last non-NaN entry of ``x[2:]``, or ``np.nan`` when there is none
    (previously this case raised ``IndexError``).
    """
    # NOTE(review): the slice starts at position 2, so the entry at position 1
    # is never considered. If rows are laid out as [country, year_1, year_2, ...]
    # this skips the first year - confirm whether x[1:] was intended.
    values = [y for y in x[2:] if not np.isnan(y)]
    if not values:
        # Nothing usable in this row: report "missing" instead of failing.
        return np.nan
    return values[-1]
# Add a CURRENT column by applying get_data to every row (axis=1).
# NOTE(review): if this cell is run again, each row handed to get_data already
# contains the CURRENT value as its last entry.
in_flows['CURRENT'] = in_flows.apply(get_data, axis=1)
# Show (rows, columns) as a quick sanity check.
in_flows.shape