<a href="https://colab.research.google.com/github/wcj365/python-stats-dataviz/blob/master/world_development_indicator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 4 - World Development Indicator

**Data Source**
- https://datacatalog.worldbank.org/dataset/world-development-indicators
- https://unstats.un.org/unsd/methodology/m49/overview

In [1]:
!wget http://databank.worldbank.org/data/download/WDI_csv.zip
!unzip WDI_csv.zip

--2021-01-20 00:47:55--  http://databank.worldbank.org/data/download/WDI_csv.zip
Resolving databank.worldbank.org (databank.worldbank.org)... 192.86.98.102
Connecting to databank.worldbank.org (databank.worldbank.org)|192.86.98.102|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://databank.worldbank.org/data/download/WDI_csv.zip [following]
--2021-01-20 00:47:55--  https://databank.worldbank.org/data/download/WDI_csv.zip
Connecting to databank.worldbank.org (databank.worldbank.org)|192.86.98.102|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 66502732 (63M) [application/x-zip-compressed]
Saving to: ‘WDI_csv.zip’


2021-01-20 00:49:04 (935 KB/s) - ‘WDI_csv.zip’ saved [66502732/66502732]



In [3]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio

## Step 1 - Read Data

In [4]:
# Load the raw WDI table: one row per country/indicator pair, with one column
# per year alongside the descriptive columns.
wdi_csv_path = "WDIData.csv"
df = pd.read_csv(wdi_csv_path)
df.shape

(380160, 66)

In [5]:
# Preview two randomly chosen rows of the raw table; no random_state is set,
# so a different pair is shown on every run.
df.sample(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
165140,Finland,FIN,"Persistence to grade 5, male (% of cohort)",SE.PRM.PRS5.MA.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,97.214256,96.512802,96.849297,99.06778,99.649071,99.541801,99.193916,99.24044,99.558762,99.422432,,99.83477,99.478271,99.641663,98.760811,99.455353,99.770409,99.749153,98.978981,98.880447,99.832283,99.399368,99.717628,99.556862,99.978867,99.415413,99.424698,99.49678,99.72934,99.707291,99.648422,99.841408,,,,
50452,Other small states,OSS,Adults (ages 15+) and children (ages 0-14) new...,SH.HIV.INCD.TL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


**Drop the last column, which is not needed**

In [6]:
# Remove the unnamed trailing column (`Unnamed: 65` in this download).
# Columns are selected by label rather than by position, and `df` is rebound
# instead of mutated in place, so re-running this cell is a no-op instead of
# deleting the most recent year column.
unnamed_columns = [column for column in df.columns if str(column).startswith("Unnamed")]
df = df.drop(columns=unnamed_columns)
df.sample(1)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
298896,Romania,ROU,"Net bilateral aid flows from DAC donors, Austr...",DC.DAC.AUSL.CD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50000.000745,280000.001192,230000.004172,19999.999553,159999.996424,150000.00596,,,,,,,,,,,,,,,,,,,,,,,,


## Step 2 - Add additional variables for regions and sub-regions

In [8]:
# Load the country-to-region lookup table.
# NOTE(review): no cell visible in this notebook downloads country_regions.csv;
# presumably it has to be placed in the working directory beforehand - confirm.
country_csv_path = "country_regions.csv"
df_country = pd.read_csv(country_csv_path)
df_country.head(2)

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS),Developed / Developing Countries
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,,Developing
1,1,World,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY,,,,Developing


In [9]:
# Keep only the join key (ISO alpha-3 code) and the descriptive columns that
# will be attached to the WDI rows.
lookup_columns = [
    "ISO-alpha3 Code",
    "Region Name",
    "Sub-region Name",
    "Country or Area",
]
df_country = df_country.loc[:, lookup_columns]
df_country.sample(2)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area
153,CYP,Asia,Western Asia,Cyprus
110,URY,Americas,Latin America and the Caribbean,Uruguay


In [10]:
# Attach region and sub-region names to every WDI row.
# A right join keeps all WDI rows; rows whose `Country Code` has no match in the
# lookup table (e.g. aggregates such as "Sub-Saharan Africa") get NaN in the
# region columns.
df_merged = df_country.merge(
    df,
    how="right",
    left_on="ISO-alpha3 Code",
    right_on="Country Code",
)

df_merged.sample(5)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
346097,TLS,Asia,South-eastern Asia,Timor-Leste,Timor-Leste,TLS,GNI per capita (constant 2010 US$),NY.GNP.PCAP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,964.7414,1064.746,1161.626,954.0257,1033.624,1223.886,1699.85,2258.294,3281.474,2554.926,3040.684,3641.175,3201.234,2778.952,2236.931,1802.328,1416.645,1401.233,1364.287,1502.372,
372071,VNM,Asia,South-eastern Asia,Viet Nam,Vietnam,VNM,Gross value added at basic prices (GVA) (const...,NY.GDP.FCST.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,101385600000.0,108318800000.0,115019100000.0,121108700000.0,128052300000.0,136796600000.0,145262800000.0,155247400000.0,166431900000.0,178228800000.0,
334514,SDN,Africa,Northern Africa,Sudan,Sudan,SDN,Firms experiencing losses due to theft and van...,IC.FRM.THEV.ZS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10.9,,,,,,
108532,BRN,Asia,South-eastern Asia,Brunei Darussalam,Brunei Darussalam,BRN,Gross fixed capital formation (constant 2010 US$),NE.GDI.FTOT.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1960074000.0,2079804000.0,2254690000.0,2662310000.0,3240781000.0,2915223000.0,3162755000.0,5889639000.0,3651092000.0,3178898000.0,2546616000.0,1803481000.0,1966491000.0,2816273000.0,2225364000.0,2290768000.0,2308293000.0,2347111000.0,2966075000.0,3359569000.0,3344233000.0,3225523000.0,4429962000.0,5710329000.0,6392739000.0,4390315000.0,4676741000.0,4153953000.0,4490087000.0,5761196000.0,5507711000.0,
279745,NOR,Europe,Northern Europe,Norway,Norway,NOR,"Energy use (kg of oil equivalent) per $1,000 G...",EG.USE.COMM.GD.PP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,119.9551,120.1749,117.946,122.0749,113.7735,111.3764,102.8398,103.3656,105.2026,108.0093,103.7828,104.4336,95.62805,102.8104,96.72369,95.44251,94.46236,93.12192,108.0978,106.9628,115.0204,94.0094,96.9551,105.5149,91.27006,93.99082,,,,,


## Step 3 - Melt the dataframe for easy analysis

In [11]:
# Reshape from wide (one column per year) to long (one row per
# country / indicator / year).
# Year columns are recognised by their all-digit labels instead of by position,
# so the reshape no longer depends on the merge producing exactly eight
# descriptive columns.
year_columns = [column for column in df_merged.columns if str(column).isdigit()]
id_columns = [column for column in df_merged.columns if column not in year_columns]

df_melted = df_merged.melt(id_vars=id_columns,
                           value_vars=year_columns,
                           var_name="Year",
                           value_name='Indicator Value')

df_melted.sample(5)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
20836655,SAU,Asia,Western Asia,Saudi Arabia,Saudi Arabia,SAU,"Tariff rate, most favored nation, weighted mea...",TM.TAX.MRCH.WM.FN.ZS,2014,3.84
11320178,PRI,Americas,Latin America and the Caribbean,Puerto Rico,Puerto Rico,PRI,"Educational attainment, at least completed upp...",SE.SEC.CUAT.UP.FE.ZS,1989,
20905570,YEM,Asia,Western Asia,Yemen,"Yemen, Rep.",YEM,Prevalence of anemia among children (% of chil...,SH.ANM.CHLD.ZS,2014,83.7
22870332,,,,,Sub-Saharan Africa,SSF,Coverage of social safety net programs in 3rd ...,per_sa_allsa.cov_q3_tot,2020,
22156384,VGB,Americas,Latin America and the Caribbean,British Virgin Islands,British Virgin Islands,VGB,Gross national expenditure (current LCU),NE.DAB.TOTL.CN,2018,


**Find out how many unique indicators there are**

In [12]:
# Count the distinct indicator codes in the raw WDI table.
df["Indicator Code"].nunique()

1440

**1,440 indicators are a lot. Save them to a file for reference**

In [13]:
# Write one row per distinct (name, code) pair to indicators.csv so the full
# indicator list can be browsed outside the notebook.
indicator_catalog = df[["Indicator Name", "Indicator Code"]].drop_duplicates()
indicator_catalog.to_csv("indicators.csv", index=False)

**Eyeball the file and pick the three indicators of interest:**
- SP.DYN.LE00.IN
    - Life expectancy at birth, total (years)
- NY.GDP.PCAP.PP.CD
    - GDP per capita, PPP (current international $)
- SP.POP.TOTL
    - Population, total 

In [14]:
# Restrict the long table to the three indicators used in the plots.
indicator_list = [
    "SP.DYN.LE00.IN",     # Life expectancy at birth, total (years)
    "NY.GDP.PCAP.PP.CD",  # GDP per capita, PPP (current international $)
    "SP.POP.TOTL",        # Population, total
]
is_selected = df_melted["Indicator Code"].isin(indicator_list)
df_plot = df_melted.loc[is_selected]
df_plot.shape

(48312, 10)

In [15]:
# Spot-check five random rows of the three-indicator subset (unseeded sample).
df_plot.sample(5)

Unnamed: 0,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Indicator Name,Indicator Code,Year,Indicator Value
4425597,MLI,Africa,Sub-Saharan Africa,Mali,Mali,MLI,"GDP per capita, PPP (current international $)",NY.GDP.PCAP.PP.CD,1971,
4048540,MHL,Oceania,Micronesia,Marshall Islands,Marshall Islands,MHL,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1970,
21658300,VUT,Oceania,Melanesia,Vanuatu,Vanuatu,VUT,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,2016,70.021
10040380,ERI,Africa,Sub-Saharan Africa,Eritrea,Eritrea,ERI,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1986,48.23
23115943,STP,Africa,Sub-Saharan Africa,Sao Tome and Principe,Sao Tome and Principe,STP,"Population, total",SP.POP.TOTL,2020,


In [16]:
# Every column except the three indicator fields identifies a country/year
# observation; these become the row index of the pivot.
indicator_columns = ("Indicator Name", "Indicator Code", "Indicator Value")
is_identifier = ~df_plot.columns.isin(indicator_columns)
index_columns = df_plot.columns[is_identifier].tolist()
index_columns

['ISO-alpha3 Code',
 'Region Name',
 'Sub-region Name',
 'Country or Area',
 'Country Name',
 'Country Code',
 'Year']

In [17]:
# Spread the three indicators into their own columns: one row per
# country/year, one column per indicator code, filled with the indicator value.
pivot_spec = {
    "index": index_columns,
    "columns": ["Indicator Code"],
    "values": "Indicator Value",
}
df_plot = df_plot.pivot(**pivot_spec)
df_plot.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Indicator Code,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALB,Europe,Southern Europe,Albania,Albania,ALB,2005,5865.324803,75.228,3011487.0
,,,,"Hong Kong SAR, China",HKG,2017,59849.248176,84.680488,7391700.0
BEL,Europe,Western Europe,Belgium,Belgium,BEL,1967,,71.012927,9580991.0
MCO,Europe,Western Europe,Monaco,Monaco,MCO,1968,,,23304.0
KGZ,Asia,Central Asia,Kyrgyzstan,Kyrgyz Republic,KGZ,1965,,58.315268,2573300.0


In [18]:
# Turn the pivot's multi-level row index back into ordinary columns.
# The guard keeps the cell idempotent: once the index has been flattened, a
# re-run would otherwise insert a spurious `index` column. Rebinding `df_plot`
# instead of using inplace=True avoids mutating the pivot result.
if isinstance(df_plot.index, pd.MultiIndex):
    df_plot = df_plot.reset_index()
df_plot.sample(5)

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
7296,GEO,Asia,Western Asia,Georgia,Georgia,GEO,1997,2568.64333,69.613,4349913.0
2023,,,,,Middle East & North Africa (IDA & IBRD countries),TMN,1970,,52.149211,127335091.0
7076,FSM,Oceania,Micronesia,Micronesia (Federated States of),"Micronesia, Fed. Sts.",FSM,1960,,54.513,44514.0
9450,KOR,Asia,Eastern Asia,Republic of Korea,"Korea, Rep.",KOR,2016,39567.016624,82.27561,51217803.0
14323,SYR,Asia,Western Asia,Syrian Arab Republic,Syrian Arab Republic,SYR,2009,,72.938,21205873.0


In [19]:
# Discard rows that lack a sub-region, a region or a population figure.
# Rows without region information include the WDI aggregates, which have no
# match in the country lookup table.
required_columns = ["Sub-region Name", "Region Name", "SP.POP.TOTL"]
df_plot = df_plot.dropna(subset=required_columns)

In [20]:
# Spot-check five random rows after rows lacking a region, sub-region or
# population value were removed (unseeded sample).
df_plot.sample(5)

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
9069,KAZ,Asia,Central Asia,Kazakhstan,Kazakhstan,KAZ,2001,8985.6555,65.768293,14858335.0
3911,BDI,Africa,Sub-Saharan Africa,Burundi,Burundi,BDI,1967,,43.332,3253218.0
4204,BGD,Asia,Southern Asia,Bangladesh,Bangladesh,BGD,2016,3849.111431,71.785,157970840.0
15351,USA,Americas,Northern America,United States of America,United States,USA,2000,36334.908777,76.636585,282162411.0
11352,MYS,Asia,South-eastern Asia,Malaysia,Malaysia,MYS,1966,,62.833,9790084.0


In [21]:
# Inspect ten random rows for 1989, the year just before the 1990-2018 window
# selected for the scatter plot. Year values are strings (they come from the
# melted column labels), hence the quoted literal.
df_plot.query("Year=='1989'").sample(10)

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
10460,MDG,Africa,Sub-Saharan Africa,Madagascar,Madagascar,MDG,1989,,50.53,11268658.0
13205,SDN,Africa,Northern Africa,Sudan,Sudan,SDN,1989,,55.334,19476647.0
7288,GEO,Asia,Western Asia,Georgia,Georgia,GEO,1989,,70.425,4803300.0
10216,MAF,Americas,Latin America and the Caribbean,Saint Martin (French Part),St. Martin (French part),MAF,1989,,74.321951,28722.0
14303,SYR,Asia,Western Asia,Syrian Arab Republic,Syrian Arab Republic,SYR,1989,,70.207,12080444.0
15462,VCT,Americas,Latin America and the Caribbean,Saint Vincent and the Grenadines,St. Vincent and the Grenadines,VCT,1989,,70.451,107071.0
6007,CYP,Asia,Western Asia,Cyprus,Cyprus,CYP,1989,,76.394,751047.0
3384,AND,Europe,Southern Europe,Andorra,Andorra,AND,1989,,,52448.0
6251,DMA,Americas,Latin America and the Caribbean,Dominica,Dominica,DMA,1989,,,70723.0
8386,IMN,Europe,Northern Europe,Isle of Man,Isle of Man,IMN,1989,,,69267.0


In [22]:
# Keep the years strictly after 1989 and strictly before 2019.
# Year labels are strings, so the bounds are compared as text.
after_1989 = df_plot["Year"] > '1989'
before_2019 = df_plot["Year"] < '2019'
df_scatter = df_plot[after_1989][before_2019[after_1989]]
df_scatter["Year"].unique()

array(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018'], dtype=object)

In [23]:
# Smallest GDP per capita (PPP) in the selected window - presumably checked to
# choose the lower bound of the scatter plot's x-axis range; confirm.
df_scatter["NY.GDP.PCAP.PP.CD"].min()

285.586899959856

In [24]:
# Largest GDP per capita (PPP) in the selected window - presumably checked to
# choose the upper bound of the scatter plot's x-axis range; confirm.
df_scatter["NY.GDP.PCAP.PP.CD"].max()

141634.96180221302

In [25]:
# Show which country/year rows have a GDP per capita (PPP) above 140,000.
df_scatter[df_scatter["NY.GDP.PCAP.PP.CD"] > 140000]

Indicator Code,ISO-alpha3 Code,Region Name,Sub-region Name,Country or Area,Country Name,Country Code,Year,NY.GDP.PCAP.PP.CD,SP.DYN.LE00.IN,SP.POP.TOTL
12923,QAT,Asia,Western Asia,Qatar,Qatar,QAT,2012,141634.961802,79.399,2196074.0


In [26]:
# Lowest life expectancy in the selected window - presumably checked to choose
# the lower bound of the scatter plot's y-axis range; confirm.
df_scatter["SP.DYN.LE00.IN"].min()

26.171999999999997

In [29]:
# Animated bubble chart: GDP per capita (x) against life expectancy (y), one
# bubble per country sized by population and coloured by sub-region, with one
# animation frame per year.

# Human-readable names shown on the axes, legend and hover box in place of the
# raw indicator codes.
indicator_labels = {
    "NY.GDP.PCAP.PP.CD": "GDP per capita, PPP (current international $)",
    "SP.DYN.LE00.IN": "Life expectancy at birth, total (years)",
    "SP.POP.TOTL": "Population, total",
}

# Year labels are four-digit strings, so sorting them as text is chronological.
# Passing the order explicitly keeps the frames in sequence regardless of the
# row order of df_scatter.
frame_order = sorted(df_scatter["Year"].unique())

fig = px.scatter(df_scatter,
                 x="NY.GDP.PCAP.PP.CD",
                 y="SP.DYN.LE00.IN",
                 color="Sub-region Name",
                 size="SP.POP.TOTL",
                 size_max=100,
                 hover_name="Country Name",
                 animation_group="Country Code",
                 animation_frame="Year",
                 category_orders={"Year": frame_order},
                 # Fixed axis ranges so the axes do not rescale between frames;
                 # the bounds bracket the min/max values inspected above.
                 range_x=[250,150000],
                 range_y=[25,100],
                 labels=indicator_labels,
                 title="Life expectancy vs. GDP per capita (bubble size = population)"
)

fig.write_html("scatter.html")
fig.show()

In [28]:
# World map of total population in 2018, one colour value per country.
# Countries are located by the three-letter codes in `Country Code`.
population_2018 = df_plot.query("Year=='2018'")

fig = px.choropleth(population_2018,
                    locations='Country Code',
                    color='SP.POP.TOTL',
                    color_continuous_scale="Viridis",
                    scope="world",
                    hover_name="Country Name",
                    # Label the colour bar and hover box with a readable name
                    # instead of the raw indicator code.
                    labels={"SP.POP.TOTL": "Population, total"},
                    title="Total population by country, 2018"
)

fig.write_html("choropleth.html")
fig.show()