# Processing COMTRADE flow data

This script attempts to process the COMTRADEflow data for energy, coal, oil and gas. The goal is to extract the import, export, and flow data as geospatial information: the tools for which are found in the infra_tools library

In [34]:
import sys, os, importlib

import pandas as pd
import geopandas as gpd
import numpy as np

sys.path.append('../')

from infrasap import process_flows

In [None]:
input_file_energy = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_energy_flows.csv"
input_file_coal = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_coal_flows.csv"
input_file_gas = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_gas_flows.csv"
input_file_oil = "/home/wb411133/data/Projects/INFRA/FLOWS/UN_comtrade_oil_flows.csv"
out_folder = ""
global_boundaries = "/home/wb411133/data/Projects/INFRA/FLOWS/national_centroids.shp"

inB = gpd.read_file(global_boundaries)
if inB.crs != {'init':'epsg:4326'}:
    inB = inB.to_crs({'init':'epsg:4326'})



In [None]:
importlib.reload(process_flows)
energy_flows = process_flows.comtrade_flow(input_file_energy, "Energy")
energy_flows.initialize([3], inB)
energy_flows.save("/home/wb411133/data/Projects/INFRA/FLOWS/ENERGY/CSV", "CSV")
energy_flows.save("/home/wb411133/data/Projects/INFRA/FLOWS/ENERGY/SHP", "SHP")

In [None]:
data_flows.raw_data.columns

In [None]:
data_flows = process_flows.comtrade_flow(file_def[0], file_def[1])
data_flows.initialize(file_def[2], inB)
    

In [None]:
data_res = {}

In [None]:
importlib.reload(process_flows)
base_out_folder = "/home/wb411133/data/Projects/INFRA/FLOWS"

for file_def in [
                 [input_file_energy, "Energy", [3]],
                 [input_file_coal, "Coal", [8]],
                 #[input_file_gas, "Gas", [8,1]],
                 #[input_file_oil, "Oil", [8]]
                ]:
    print("Processing %s" % file_def[1])
    data_flows = process_flows.comtrade_flow(file_def[0], file_def[1])
    data_flows.initialize(file_def[2], inB)
    select_folder = os.path.join(base_out_folder, file_def[1])
    for file_type in ["SHP"]:
        out_folder = os.path.join(select_folder, file_type)
        data_flows.save(out_folder, file_type)    
    data_res[file_def[1]] = data_flows

In [None]:
data_res.keys()

# Processing airport data from Heinrich

In [85]:
airport_flows = "/home/wb411133/data/Projects/INFRA/FLOWS/Airport_Volume.csv"

inD = pd.read_csv(airport_flows)
#inD.drop(['Country Name', 'Country Name.1'], axis=1, inplace=True)
inD.head()

Unnamed: 0,Orig,Country Name,Name,Dest,Country Name.1,NAme,Year,Country1,Country2,TotalSeats,Airport1Latitude,Airport1Longitude,Airport2Latitude,Airport2Longitude
0,GYD,Azerbaijan,Heydar Aliyev,KBL,Afghanistan,Kabul International,2005,Afghanistan,Azerbaijan,2955.106,40.466599,50.052502,34.5639,69.213898
1,KBL,Afghanistan,Kabul International,GYD,Azerbaijan,Heydar Aliyev,2005,Afghanistan,Azerbaijan,2955.106,34.5639,69.213898,40.466599,50.052502
2,KBL,Afghanistan,Kabul International,URC,China,Diwopu International,2005,Afghanistan,China,597.954,34.5639,69.213898,43.906898,87.474098
3,URC,China,Diwopu International,KBL,Afghanistan,Kabul International,2005,Afghanistan,China,597.954,43.906898,87.474098,34.5639,69.213898
4,DEL,India,Indira Gandhi Intl,KBL,Afghanistan,Kabul International,2005,Afghanistan,India,3457.734,28.573601,77.1008,34.5639,69.213898


In [86]:
# extract airport locations
inD_grouped = inD.groupby(['Orig','Year'])
d = {'Name':'first','TotalSeats':'sum', 'NAme':"first","Country Name":"first","Airport1Latitude":'first', "Airport1Longitude":'first'}
airport_locations = inD_grouped.agg(d)
airport_locations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,TotalSeats,NAme,Country Name,Airport1Latitude,Airport1Longitude
Orig,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAA,2016,Anaa,18670.897,Arrabury Airport,French Polynesia,-17.355101,-145.508499
AAB,2010,Arrabury Airport,8250.032,Anaa,Australia,-26.700001,141.041702
AAB,2015,Arrabury Airport,736.61,Anaa,Australia,-26.700001,141.041702
AAB,2016,Arrabury Airport,1308.566,Anaa,Australia,-26.700001,141.041702
AAC,2006,El Arish International,9974.566,Queen Alia Intl,Egypt,31.0769,33.834099


In [87]:
airport_locations = airport_locations.groupby(level=0).last()
airport_locations.head()

Unnamed: 0_level_0,Name,TotalSeats,NAme,Country Name,Airport1Latitude,Airport1Longitude
Orig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAA,Anaa,18670.897,Arrabury Airport,French Polynesia,-17.355101,-145.508499
AAB,Arrabury Airport,1308.566,Anaa,Australia,-26.700001,141.041702
AAC,El Arish International,1386.56,King Abdulaziz Intl,Egypt,31.0769,33.834099
AAE,Rabah Bitat,135934.876,Charles De Gaulle,Algeria,36.822201,7.8094
AAL,Aalborg,338519.958,Burgas,Denmark,57.093102,9.85


In [88]:
airport_locations.to_csv(airport_flows.replace(".csv", "_airport_locations.csv"))

In [37]:
#Calculate out_flows
d = {'TotalSeats':'sum'}
inD_grouped = inD.groupby(['Country1', "Year"])
out_flows = inD_grouped.agg(d).reset_index()

#Calculate in_flows
d = {'TotalSeats':'sum'}
inD_grouped = inD.groupby(['Country2', "Year"])
in_flows = inD_grouped.agg(d)

in_flows = in_flows['TotalSeats'].unstack().reset_index()

In [58]:
x = in_flows.iloc[1].values
[y for y in x[2:] if not np.isnan(y)]

[2339.82]

In [62]:
def get_data(x):
    x = [y for y in x[2:] if not np.isnan(y)]
    return(x[-1])
in_flows['CURRENT'] = in_flows.apply(get_data, axis=1)
in_flows.shape

(213, 17)

# Debuggingbelow


In [None]:
inD = data_res['Energy']
from_pt = inD.complete_data.loc[1,]
from_pt['Reporter_Pt']

In [None]:

def generate_great_circle(from_pt, to_pt, interim_steps=15):
    '''
    '''
    geod = Geodesic.WGS84
    g = geod.Inverse(from_pt.x, from_pt.y, to_pt.x, to_pt.y)
    l = geod.Line(g['lat1'], g['lon1'], g['azi1'])
    num = interim_steps  # 15 intermediate steps
    list_of_points = [from_pt]
    for i in range(num+1):
        pos = l.Position(i * g['s12'] / num)
        list_of_points.append(Point(pos['lat2'], pos['lon2']))
    list_of_points.append(to_pt)
    return(LineString(list_of_points))

generate_great_circle(from_pt['Reporter_Pt'], from_pt['Partner_Pt'])

In [None]:
importlib.reload(process_flows)
file_def = [input_file_energy, "Energy", [3]]
data_flows = process_flows.comtrade_flow(file_def[0], file_def[1])
data_flows.initialize(file_def[2], inB, line_type='great')
data_flows.save("/home/wb411133/data/Projects/INFRA/FLOWS/ENERGY/SHP", "SHP")

In [None]:
data_flows.country_flows.head()

In [None]:
importlib.reload(process_flows)
data_flows.country_flows['geometry'] = data_flows.country_flows.apply(lambda x:process_flows.generate_great_circle(x['Reporter_Pt'], x['Partner_Pt']), axis=1)
data_flows.country_flows.head()

In [None]:
for key, value in data_res.items():
    print(key)
    print(value.raw_data['Commodity'].value_counts())

In [None]:
curD.raw_data['Commodity'].value_counts()

In [None]:
inB['geometry'] = inB['geometry'].apply(lambda x: x.centroid)
inB.to_file("/home/wb411133/data/Projects/INFRA/FLOWS/national_centroids.shp")

In [None]:
#Assign origin and destination centroids
def get_centroid(iso,boundaries):
    try:
        selected_country = boundaries.loc[boundaries['ISO3'] == iso]
        if selected_country.shape[0] == 1:
            return(selected_country.iloc[0]['geometry'].centroid)
        elif selected_country.shape[0] > 1:
            selected_country = selected_country.sort_values('Shape_Area', ascending=False)
            return(selected_country.iloc[0]['geometry'].centroid)
        else:
            return(None)
    except:
        return(None)
#get_centroid('LSO', inB)
inD['Reporter_Pt'] = inD['Reporter ISO'].apply(lambda x: get_centroid(x, inB))
inD['Partner_Pt'] = inD['Partner ISO'].apply(lambda x: get_centroid(x, inB))

In [None]:
selected_D = inD.loc[:,good_columns]
country_flows = selected_D.loc[selected_D['Partner ISO'] != "WLD"]
country_summary = selected_D.loc[selected_D['Partner ISO'] == "WLD"]

In [None]:
country_summary.sort_values(['Reporter ISO'])
country_summary.reset_index(inplace=True)
country_summary.to_csv(os.path.join(out_folder, "Country_Summaries.csv"))

In [None]:
from shapely.geometry import LineString

def generate_line_string(row):
    try:
        if row['Trade Flow'] == "Export":
            return(LineString([row['Reporter_Pt'], row['Partner_Pt']]))
        else:
            return(LineString([row['Partner_Pt'], row['Reporter_Pt']]))
    except:
        print(row)
        return(None)
country_flows['geometry'] = country_flows.apply(lambda x: generate_line_string(x), axis=1)
country_flows.to_csv(os.path.join(out_folder, "Country_Flows.csv"))