In [1]:
#all imports
import requests
import pandas as pd
from pathlib import Path
import numpy as np
import os
from DataFunctions import *
import datetime
import wbdata
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
idx = pd.IndexSlice  #magically make MultiIndex slicing better
#e.g. to set value of Aruba->Afghanistan in 1995 to 42, use:
#imported_data.loc[idx[1995,'Aruba'],'Afghanistan'] = 42

In [3]:
#load original data
trade_data=pd.read_csv("raw_data/DataJobID-1257172_1257172_TestQuery.csv" , encoding = "ISO-8859-1")

In [4]:
dic_cols=['ReporterISO3', 'ReporterName']
dic_df=trade_data[dic_cols].drop_duplicates()
country_dic=dic_df.set_index('ReporterName')['ReporterISO3'].to_dict()
inv_country_dic = {v: k for k, v in country_dic.items()}

In [5]:
#create data structures for building multi index dataframe
years = list(range(1995,2016))
countries=list(trade_data['ReporterName'].unique())

In [6]:
def build_multi_index_df(years, countries):
    """
    Function which builds multi index dataframe suitable for this analysis.
    MultiIndex array is three-dimensional: year*country*country
    """ 

    #build MultiIndex array
    rows_array = []
    for year in years:
        for country in countries:
            rows_array.append([year,country])
        
    multi_index = pd.MultiIndex.from_tuples(rows_array, names=['year', 'exporter'])
    data = pd.DataFrame(columns=countries, index=multi_index)
    return data



In [7]:
#create dataframe structure
data = build_multi_index_df(years,countries)

In [8]:
%%timeit -n1 -r1

#fill dataframe with values from data
#Caution, takes roughly 6-8 minutes!
for index, row in trade_data.iterrows():
    for year in years:
        year_key=str(year)+" in 1000 USD "
        data.loc[year][row['ReporterName']][row['PartnerName']]=row[year_key]

1 loop, best of 1: 7min 43s per loop


In [9]:
#fill NaNs
data_filled=data.fillna(0)

In [10]:
#write to TSV
data_filled.to_csv('trade_data.tsv', sep='\t')

In [11]:
#read from TSV
imported_data = pd.read_table('trade_data.tsv', index_col=[0,1])

In [12]:
#compare whether identical
all(imported_data == data_filled)

True

In [13]:
#make copy to work in
percentages = imported_data.copy()

In [14]:
#calculate percentages of exporter's total export to each destination
for year in years:
    this = percentages.loc[year].div(percentages.loc[year].sum(axis=1), axis=0)
    this_filled = this.fillna(0)
    percentages.loc[year].update(this_filled)

In [16]:
#data points per exporter per year, as handy dataframe

data_points = (percentages.loc[1995] != 0).sum(axis=1).to_frame()
data_points.columns = ['1995']

#skip first, as it is used above to create dataframe
#must overwrite column names each loop cycle, as df.assign() interprets column name as literal
i=1
for year in years[1:]:
    i=i+1
    this = (percentages.loc[year] != 0).sum(axis=1)
    data_points = data_points.assign(temp = this)
    data_points.columns = [years[:i]]

In [17]:
data_points

Unnamed: 0_level_0,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
exporter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,38,48,52,49,59,63,68,66,70,72,...,76,76,72,80,78,78,80,77,82,76
Afghanistan,57,63,64,71,68,74,72,85,89,93,...,94,96,97,105,105,97,98,96,91,94
Angola,65,76,75,81,88,100,104,106,108,114,...,111,117,114,118,120,125,119,117,116,111
Anguila,33,35,47,45,47,55,48,50,57,56,...,60,56,61,61,65,60,59,56,53,54
Albania,60,65,67,71,74,77,81,87,86,86,...,96,98,102,103,97,106,98,98,96,97
Andorra,41,43,45,49,47,54,62,66,70,70,...,81,85,76,84,81,82,78,74,81,72
Netherlands Antilles,70,73,77,73,84,92,96,93,94,94,...,105,99,104,105,97,0,0,0,0,0
United Arab Emirates,94,102,110,111,119,134,135,135,143,144,...,147,152,151,148,157,155,151,150,144,140
Argentina,84,95,99,103,105,116,117,113,116,119,...,122,117,123,127,124,125,123,115,116,108
Armenia,49,53,61,63,66,70,74,74,79,83,...,85,86,93,89,96,97,95,94,93,90


In [18]:
indicator_dataframe, indicators, tabnames=GetIndicatorsWB(file='Selected_Indicators.xlsx', sheet='Indicators')

In [19]:
countries1=GetRegionIncomeDataWB()

In [20]:
wbdata = GetDataWB(indicators,2010, 2015)

In [21]:
wb_data_countries = countries1.join(wbdata, how='inner')

In [22]:
region_income_data=FillByRegionAndIncomeWB(wb_data_countries)
region_income_data=FillByRegionWB(region_income_data)

In [23]:
DataCompleteness(region_income_data)

Country Data                                                    100.0
Region                                                          100.0
IncomeGroup                                                     100.0
Exports of goods and services (% of GDP)                        100.0
GDP (current US$)                                               100.0
Total greenhouse gas emissions (kt of CO2 equivalent)           100.0
Exports of goods and services (% of GDP) source                 100.0
GDP (current US$) source                                        100.0
Total greenhouse gas emissions (kt of CO2 equivalent) source    100.0
dtype: float64


In [24]:
dic_cols_wb=countries1['Country Data']
country_dic_wb=dic_cols_wb.to_dict()
inv_country_dic_wb = {v: k for k, v in country_dic_wb.items()}

In [25]:
percentages=pd.DataFrame()
percentage_to_country=imported_data.loc[2014]
percentages=percentages.append(percentage_to_country)

In [26]:
percentages=percentages.fillna(0)

In [27]:
for item in inv_country_dic_wb:
    if item in inv_country_dic:
        continue
    else:
        print(item, inv_country_dic_wb[item])
        
print('---------------------------')
for item in inv_country_dic:
    if item in inv_country_dic_wb:
        continue
    else:
        print(item, inv_country_dic[item])

ASM American Samoa
VGB British Virgin Islands
CYM Cayman Islands
TCD Chad
CHI Channel Islands
COD Congo, Dem. Rep.
CUW Curacao
GNQ Equatorial Guinea
GIB Gibraltar
GUM Guam
IMN Isle of Man
PRK Korea, Dem. People���s Rep.
XKX Kosovo
LBR Liberia
LIE Liechtenstein
MHL Marshall Islands
FSM Micronesia, Fed. Sts.
MCO Monaco
MNE Montenegro
NRU Nauru
MNP Northern Mariana Islands
PRI Puerto Rico
ROU Romania
SMR San Marino
SRB Serbia
SXM Sint Maarten (Dutch part)
SOM Somalia
SSD South Sudan
MAF St. Martin (French part)
TLS Timor-Leste
UZB Uzbekistan
VIR Virgin Islands (U.S.)
---------------------------
AIA Anguila
ANT Netherlands Antilles
BLX Belgium-Luxembourg
COK Cook Islands
EUN European Union
GLP Guadeloupe
GUF French Guiana
MNT Montenegro
MSR Montserrat
MTQ Martinique
MYT Mayotte
OAS Other Asia, nes
REU Reunion
ROM Romania
SER Serbia, FR(Serbia/Montenegro)
SUD Sudan
TMP East Timor


In [28]:
conversion_dic={'SER':'SRB',
               'SUD':'SSD'}

In [29]:
filled_dataframe=percentages.copy(deep=False)
for column in region_income_data:
    column_name=column
    filled_dataframe[column_name]=None
    for index in percentages.index:
        for index2 in region_income_data.index:    
            if index==index2 or country_dic[index]==country_dic_wb[index2]:
                filled_dataframe[column_name][index]=region_income_data[column][index2]
            else:
                try:
                     if conversion_dic[index]==country_dic_wb[index2]:
                            filled_dataframe[column_name][index]=region_income_data[column][index2]
                except KeyError: 
                    continue

In [30]:
filled_dataframe.columns

Index(['Aruba', 'Afghanistan', 'Angola', 'Anguila', 'Albania', 'Andorra',
       'Netherlands Antilles', 'United Arab Emirates', 'Argentina', 'Armenia',
       ...
       'Zimbabwe', 'Country Data', 'Region', 'IncomeGroup',
       'Exports of goods and services (% of GDP)', 'GDP (current US$)',
       'Total greenhouse gas emissions (kt of CO2 equivalent)',
       'Exports of goods and services (% of GDP) source',
       'GDP (current US$) source',
       'Total greenhouse gas emissions (kt of CO2 equivalent) source'],
      dtype='object', length=211)

In [31]:
filled_dataframe=filled_dataframe.dropna(how='any')

In [32]:
len(filled_dataframe)

188

In [33]:
dont_include=["Country Data",
              "Region",
              "IncomeGroup",
              "GDP (current US$)",
              "Total greenhouse gas emissions (kt of CO2 equivalent)",
              "GDP (current US$) source",
              "Total greenhouse gas emissions (kt of CO2 equivalent) source",
             'Exports of goods and services (% of GDP) source',
             'Exports of goods and services (% of GDP)']

In [34]:
export_cols=filled_dataframe.columns[~filled_dataframe.columns.isin(dont_include)]

In [35]:
filled_dataframe['SumOfExports'] = filled_dataframe[export_cols].sum(axis=1)

In [36]:
for column in export_cols:
    colname='Percentage to ' + column
    filled_dataframe[colname]=filled_dataframe[column]/filled_dataframe['SumOfExports']

In [37]:
filled_dataframe['ExportFraction']=filled_dataframe['SumOfExports']/filled_dataframe['GDP (current US$)']

In [38]:
filled_dataframe['EmissionForExport']=filled_dataframe['Total greenhouse gas emissions (kt of CO2 equivalent)']*(filled_dataframe["Exports of goods and services (% of GDP)"]/100)

In [39]:
for column in export_cols:
    colname='Emissions to ' + column
    filled_dataframe[colname]=filled_dataframe['Percentage to ' + column]*filled_dataframe['EmissionForExport']

In [40]:
filled_dataframe

Unnamed: 0_level_0,Aruba,Afghanistan,Angola,Anguila,Albania,Andorra,Netherlands Antilles,United Arab Emirates,Argentina,Armenia,...,Emissions to United States,Emissions to St. Vincent and the Grenadines,Emissions to Venezuela,Emissions to Vietnam,Emissions to Vanuatu,Emissions to Samoa,Emissions to Yemen,Emissions to South Africa,Emissions to Zambia,Emissions to Zimbabwe
exporter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,0.000,0.0,0.000,0.0,0.000,0.504,0.0,0.634,1.403826e+03,0.000,...,6673.62,0,0,0,0,0,0,2.36146,0,0
Afghanistan,0.000,0.0,0.000,0.0,0.209,0.811,0.0,224351.287,1.228533e+03,251.700,...,130.711,0,0,0,0,0,0,0.329852,0,0
Angola,0.000,0.0,0.000,0.0,0.000,0.000,0.0,109504.917,1.269333e+05,13.447,...,756.331,0,0,32.8388,0,0,0,423.674,3.26687,0
Albania,0.000,0.0,0.000,0.0,0.000,4.039,0.0,3754.747,2.448014e+03,0.250,...,14.8095,0,0,0,0,0,0,0.0895904,0,0
Andorra,0.000,0.0,0.000,0.0,0.000,0.000,0.0,59.045,0.000000e+00,4.600,...,120.208,0,0,0.0118511,0,0,0.330012,1.0841,0,0.134671
United Arab Emirates,0.000,0.0,867779.323,0.0,1543.602,122.881,0.0,0.000,2.001343e+05,8934.499,...,13371.6,0,0,3302.24,0,0,16.4152,990.137,164.764,68.5755
Argentina,0.000,0.0,0.000,0.0,63.774,49.995,0.0,8891.343,0.000000e+00,0.300,...,5913.51,0,0,108.042,0,0,0.081739,132.675,0,0.00127167
Armenia,0.000,0.0,0.000,0.0,60.594,0.084,0.0,9377.925,3.685458e+03,0.000,...,59.8096,0,0,23.8915,0,9.67098e-05,0,0.561901,0.000444643,0
Antigua and Barbuda,0.000,0.0,0.000,0.0,0.000,0.000,0.0,30.607,5.488200e+02,0.000,...,28.3397,0,0,0,0,0,0,1.03614,4.55275e-05,0
Australia,0.000,0.0,45435.804,0.0,102.199,96.221,0.0,250076.197,7.088626e+05,151.081,...,15289.5,0,0,2501.58,0,13.7126,0.118488,528.01,218.748,0.0379765


In [41]:
emissions_cols=[col for col in filled_dataframe.columns if 'Emissions to' in col]

In [42]:
emissions_to_countries=pd.DataFrame(filled_dataframe[emissions_cols].sum(axis=0))

In [43]:
emissions_to_countries=emissions_to_countries.reset_index()

In [44]:
emissions_to_countries.replace('Emissions to ','',regex=True,inplace=True)

In [45]:
emissions_to_countries.set_index('index', inplace=True)

In [46]:
emissions_to_countries.index.rename('exporter', inplace=True)

In [47]:
emissions_to_countries

Unnamed: 0_level_0,0
exporter,Unnamed: 1_level_1
Aruba,5.167330e+01
Afghanistan,4.935796e+02
Angola,7.080297e+04
Anguila,0.000000e+00
Albania,3.929060e+03
Andorra,3.516004e+01
Netherlands Antilles,0.000000e+00
United Arab Emirates,5.547633e+04
Argentina,7.973897e+04
Armenia,1.606758e+03


In [48]:
filled_dataframe['EmissionsToCountries']=emissions_to_countries[0]

In [49]:
(filled_dataframe['EmissionsToCountries']/((1-filled_dataframe["Exports of goods and services (% of GDP)"]/100)*filled_dataframe['Total greenhouse gas emissions (kt of CO2 equivalent)'])).mean() 

0.44469341330805551

In [51]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()

In [52]:
def GetCountryCoordinates(country=None):
    '''
    Inputs country. Returns the lat/long coordinates the center of the country.
    '''
    try:
        loc = geolocator.geocode(country)
        return (loc.latitude, loc.longitude)
    except:
        print('No location found for '+country)
        return (0,0)

In [53]:
loc_srb=GetCountryCoordinates('SRB')

In [54]:
filled_dataframe['Latitude']=None
filled_dataframe['Longitude']=None
for country in filled_dataframe.index:
    coords=GetCountryCoordinates(country_dic[country])
    filled_dataframe['Latitude'][country]=coords[0]
    filled_dataframe['Longitude'][country]=coords[1]
    

No location found for AND
No location found for FJI
