# Chapter 4 - World Development Indicators

**Data Source**
- https://datacatalog.worldbank.org/dataset/world-development-indicators
- https://unstats.un.org/unsd/methodology/m49/overview

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio

In [None]:
# Read the raw World Development Indicators export and report its
# (rows, columns) size.
df = pd.read_csv(filepath_or_buffer="../data/WDIData.csv")
df.shape

In [None]:
# Preview five randomly chosen rows of the loaded data.
df.sample(5)

**Drop the last column, which is not needed**

In [None]:
# Remove the trailing non-data column.
#
# The drop is guarded so that this cell can safely be run more than once: an
# unconditional positional drop would delete a genuine year column on every
# re-run. Year columns are addressed later in this notebook by digit-only
# names such as "2018", so a trailing column whose name is not made of digits
# is treated as the unwanted one.
# NOTE(review): assumes every year column has a digit-only name - confirm
# against df.columns.
trailing_column = df.columns[-1]
if not str(trailing_column).isdigit():
    # Rebind instead of mutating in place so the cell stays idempotent.
    df = df.drop(columns=trailing_column)
df.sample(1)

**Find out how many unique indicators there are**

In [None]:
# Count the distinct values in the "Indicator Code" column.
df["Indicator Code"].nunique()

**1,440 indicators are a lot. Save them to a file for reference**

In [None]:
# Write one row per distinct (name, code) pair to indicators.csv in the
# current working directory, without the index column.
(
    df.loc[:, ["Indicator Name", "Indicator Code"]]
    .drop_duplicates()
    .to_csv("indicators.csv", index=False)
)

**Eyeball the file and pick the three indicators of interest:**
- SP.DYN.LE00.IN
    - Life expectancy at birth, total (years)
- NY.GDP.PCAP.PP.CD
    - GDP per capita, PPP (current international $)
- SP.POP.TOTL
    - Population, total 

**Filter the data for GDP per capita**

In [None]:
# Keep only the rows whose indicator code is GDP per capita (PPP).
df_gdp = df.loc[df["Indicator Code"].eq("NY.GDP.PCAP.PP.CD")]
df_gdp.shape

In [None]:
# Show the first two rows of the GDP-per-capita subset.
df_gdp.head(2)

**Melt the dataframe for easy analysis**

In [None]:
# Wide -> long: the first four columns stay as identifiers, every remaining
# column becomes one row with its column name in "Year" and its cell value in
# "GDP Per Capita".
df_gdp_melted = pd.melt(
    df_gdp,
    id_vars=df_gdp.columns[:4],
    value_vars=df_gdp.columns[4:],
    var_name="Year",
    value_name='GDP Per Capita',
)
df_gdp_melted.sample(n=5)

**Drop the unnecessary columns**

In [None]:
# Remove the two indicator label columns; the value column is already named
# after the indicator.
df_gdp_melted = df_gdp_melted.drop(columns=["Indicator Name", "Indicator Code"])
df_gdp_melted.sample(n=2)

**Filter the data for life expectancy**

In [None]:
# Keep only the rows whose indicator code is life expectancy at birth.
df_life = df.loc[df["Indicator Code"].eq("SP.DYN.LE00.IN")]
df_life.shape

In [None]:
# Show the first two rows of the life-expectancy subset.
df_life.head(2)

**Melt the dataframe for easy analysis**

In [None]:
# Wide -> long: the first four columns stay as identifiers, every remaining
# column becomes one row with its column name in "Year" and its cell value in
# "Life Expectancy".
df_life_melted = pd.melt(
    df_life,
    id_vars=df_life.columns[:4],
    value_vars=df_life.columns[4:],
    var_name="Year",
    value_name='Life Expectancy',
)
df_life_melted.sample(n=5)

**Drop the unnecessary columns**

In [None]:
# Remove the two indicator label columns; the value column is already named
# after the indicator.
df_life_melted = df_life_melted.drop(columns=["Indicator Name", "Indicator Code"])
df_life_melted.sample(n=2)

**Merge the two data frames**

In [None]:
# Join GDP and life expectancy on country and year. No `how` is given, so
# pandas performs its default inner join.
df_merged = pd.merge(
    df_gdp_melted,
    df_life_melted,
    on=["Country Name", "Country Code", "Year"],
)
df_merged.sample(n=5)

**Filter the data for Population**

In [None]:
# Keep only the rows whose indicator code is total population.
df_pop = df.loc[df["Indicator Code"].eq("SP.POP.TOTL")]
df_pop.shape

In [None]:
# Preview two randomly chosen rows of the population subset.
df_pop.sample(2)

**Melt the dataframe for easy analysis**

In [None]:
# Wide -> long: the first four columns stay as identifiers, every remaining
# column becomes one row with its column name in "Year" and its cell value in
# "Population".
df_pop_melted = pd.melt(
    df_pop,
    id_vars=df_pop.columns[:4],
    value_vars=df_pop.columns[4:],
    var_name="Year",
    value_name='Population',
)
df_pop_melted.sample(n=5)

**Drop the unnecessary columns**

In [None]:
# Remove the two indicator label columns; the value column is already named
# after the indicator.
df_pop_melted = df_pop_melted.drop(columns=["Indicator Name", "Indicator Code"])
df_pop_melted.sample(n=2)

**Merge the population data into the merged data frame**

In [None]:
# Add the population column to the GDP / life-expectancy frame, matching on
# country and year (pandas' default inner join).
df_merged2 = pd.merge(
    df_merged,
    df_pop_melted,
    on=["Country Name", "Country Code", "Year"],
)
df_merged2.sample(n=5)

In [None]:
# Discard rows that have no population value.
df_merged3 = df_merged2.dropna(subset=["Population"])

In [None]:
# Preview ten randomly chosen rows after removing rows without population.
df_merged3.sample(10)

In [None]:
# (rows, columns) remaining after the population filter.
df_merged3.shape

In [None]:
# Spot-check ten random rows for 2018. "Year" holds the original column
# names, so it is compared as a string.
df_merged3.loc[df_merged3["Year"].eq("2018")].sample(n=10)

## Add additional variables for regions and sub-regions

In [None]:
# Read the country-to-region lookup table and show its first five rows.
df_country = pd.read_csv(filepath_or_buffer="../data/country_regions.csv")
df_country.head(n=5)

In [None]:
# Narrow the lookup table to the join key and the region labels used below.
df_country = df_country.loc[
    :,
    ["ISO-alpha3 Code", "Region Name", "Sub-region Name", "Country or Area"],
]
df_country.sample(n=5)

In [None]:
# Attach region labels by matching "Country Code" to "ISO-alpha3 Code".
# A left join keeps every indicator row; rows without a match get NaN in the
# region columns.
df_merged4 = df_merged3.merge(
    df_country,
    how="left",
    left_on="Country Code",
    right_on="ISO-alpha3 Code",
)
df_merged4.sample(n=5)

In [None]:
# (rows, columns) after attaching the region columns.
df_merged4.shape

In [None]:
# List the distinct values of "Region Name" (NaN included, if present).
df_merged4["Region Name"].unique()

In [None]:
# List the distinct values of "Sub-region Name" (NaN included, if present).
df_merged4["Sub-region Name"].unique()

In [None]:
# Show the rows whose "Sub-region Name" is missing, i.e. rows for which the
# left join found no entry in the country lookup table.
df_merged4[df_merged4["Sub-region Name"].isna()]

In [None]:
# Distinct country names among the rows that have no sub-region.
df_merged4.loc[df_merged4["Sub-region Name"].isna(), "Country Name"].unique()

In [None]:
# Keep only the rows that received a sub-region from the lookup table.
df_merged5 = df_merged4.dropna(subset=["Sub-region Name"])
df_merged5.shape

In [None]:
# Bubble chart of the 2018 rows: GDP per capita on x, life expectancy on y,
# bubble area from population and colour from sub-region.
fig = px.scatter(
    data_frame=df_merged5.loc[df_merged5["Year"].eq("2018")],
    x="GDP Per Capita",
    y="Life Expectancy",
    color="Sub-region Name",
    size="Population",
    hover_data=["Country Code", "Country Name", "Sub-region Name"],
)

# Leaving the figure as the last expression lets the notebook render it.
fig

In [None]:
#fig.write_html("scatter.html")

## A Different Approach

In [None]:
# Indicator codes to keep; the matching rows are selected in a single pass.
indicator_codes = [
    "SP.DYN.LE00.IN",
    "NY.GDP.PCAP.PP.CD",
    "SP.POP.TOTL",
]
df2 = df.loc[df["Indicator Code"].isin(indicator_codes)]
df2.shape

In [None]:
# Reshape every selected indicator from wide (one column per year) to long
# format, then place the long frames side by side in one wide frame.
df_list = []
for code in indicator_codes:
    _df = df2[df2["Indicator Code"] == code]
    _df_melted = _df.melt(id_vars=_df.columns[:4],
                          value_vars=_df.columns[4:],
                          var_name="Year",
                          value_name=code)
    # Non-inplace drop: the loop variable is simply rebound on each pass.
    _df_melted = _df_melted.drop(columns=["Indicator Name", "Indicator Code"])

    # pd.concat(axis=1) pairs rows by index label only. Each melted frame has
    # a fresh 0..n-1 index, so the pairing is purely positional. Verify that
    # the identifier columns (everything except the value column) are
    # identical to those of the first frame; otherwise values belonging to
    # different countries or years would end up on the same row unnoticed.
    if df_list:
        _reference_keys = df_list[0].drop(columns=indicator_codes[0])
        if not _df_melted.drop(columns=code).equals(_reference_keys):
            raise ValueError(
                f"Rows for indicator {code!r} do not line up with "
                f"{indicator_codes[0]!r}; merge on the key columns instead "
                "of concatenating side by side."
            )

    df_list.append(_df_melted)

df_all = pd.concat(df_list, axis=1)
df_all.shape

In [None]:
# Preview five randomly chosen rows of the side-by-side frame.
df_all.sample(5)

In [None]:
# The side-by-side concat repeats the identifier columns once per indicator.
# Keep only the first occurrence of every column name: the identifiers appear
# once and each indicator keeps its own value column. This selects the same
# columns as hard-coded positions would for three indicators, and stays
# correct if the list of indicators changes.
df_all2 = df_all.loc[:, ~df_all.columns.duplicated()]
df_all2.head()

In [None]:
# Discard rows that have no total-population value.
df_all3 = df_all2.dropna(subset=["SP.POP.TOTL"])
df_all3.shape

In [None]:
# Attach region labels by matching "Country Code" to "ISO-alpha3 Code".
# The left join keeps rows without a match and leaves their region columns
# as NaN.
df_all4 = df_all3.merge(
    df_country,
    how="left",
    left_on="Country Code",
    right_on="ISO-alpha3 Code",
)

df_all4.sample(n=5)

In [None]:
# (rows, columns) after attaching the region columns.
df_all4.shape

In [None]:
# Keep only the rows that received a sub-region from the lookup table.
df_all5 = df_all4.dropna(subset=["Sub-region Name"])
df_all5.shape

In [None]:
# Same bubble chart as before, built from the single-pass frame; the columns
# are named by indicator code here.
fig2 = px.scatter(
    data_frame=df_all5.loc[df_all5["Year"].eq("2018")],
    x="NY.GDP.PCAP.PP.CD",
    y="SP.DYN.LE00.IN",
    color="Sub-region Name",
    size="SP.POP.TOTL",
    hover_data=["Country Code", "Country Name"],
)

fig2.show()

In [None]:
#fig2.write_html("scatter2.html")

In [None]:
# World map of the 2018 rows, shaded by total population. Countries are
# located through the "Country Code" column.
fig = px.choropleth(
    data_frame=df_all5.loc[df_all5["Year"].eq("2018")],
    locations='Country Code',
    color='SP.POP.TOTL',
    color_continuous_scale="Viridis",
    scope="world",
    hover_name="Country Name",
)

fig.show()

In [None]:
# Save the choropleth figure as an HTML file in the current working directory.
fig.write_html("choropleth.html")