In [1]:
import numpy as np
import pandas as pd

# Population Data Cleaning

In [2]:
# load the raw population table from the data-sources folder
pop_total = pd.read_csv(filepath_or_buffer='data-sources/population_total.csv')

In [3]:
def remove_years(df, start_year=2000):
    """Return a copy of ``df`` without the year columns earlier than ``start_year``.

    A column counts as a year column when its label, converted to text,
    consists only of decimal digits (e.g. ``'1999'`` or the integer ``1999``).
    All other columns are kept untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose year columns should be trimmed. It is not modified.
    start_year : int, default 2000
        First year to keep; year columns strictly below it are dropped.

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns in their original order.
    """

    def _is_early_year(label):
        # str() lets integer labels through; isdecimal() (unlike isdigit())
        # only accepts characters that int() can actually parse
        text = str(label)
        return text.isdecimal() and int(text) < start_year

    # collect column headers to be removed
    early_years = [col for col in df.columns if _is_early_year(col)]

    # drop() returns a new frame, so the caller's frame is left as it was
    return df.drop(columns=early_years)
    
# keep only the year columns from 2000 onwards
pop_total_short = remove_years(df=pop_total, start_year=2000)

In [4]:
# discard the columns that are not needed for the rest of the cleaning
pop_cols = ['Indicator Name', '2018', 'Indicator Code', 'Country Code']
pop_total_short = pop_total_short.drop(labels=pop_cols, axis='columns')

In [5]:
# flip the table so that the former columns become the rows
pop_total_short = pop_total_short.T

# promote the 'Country Name' row to the column header, then discard that row
country_names = pop_total_short.loc['Country Name'].tolist()
pop_total_short.columns = country_names
pop_total_short = pop_total_short.drop(index='Country Name')

In [6]:
# remove unions (aggregate entries that are not individual countries)
unions = [
    "Central Europe and the Baltics", "Caribbean small states", "East Asia & Pacific (excluding high income)",
    "Early-demographic dividend", "East Asia & Pacific", "Europe & Central Asia (excluding high income)",
    "Europe & Central Asia", "Euro area", "European Union", "Fragile and conflict affected situations",
    "High income", "Heavily indebted poor countries (HIPC)", "IBRD only", "IDA & IBRD total", "IDA total",
    "IDA blend", "IDA only", "Latin America & Caribbean", "Not classified", "Latin America & Caribbean (excluding high income)",
    "Least developed countries: UN classification", "Low income", "Lower middle income", "Low & middle income",
    "Late-demographic dividend", "Middle East & North Africa", "Middle income",
    "Middle East & North Africa (excluding high income)", "OECD members", "Other small states",
    "Pre-demographic dividend", "Pacific island small states", "Post-demographic dividend",
    "Sub-Saharan Africa (excluding high income)", "Sub-Saharan Africa", "Small states",
    "East Asia & Pacific (IDA & IBRD countries)", "Europe & Central Asia (IDA & IBRD countries)",
    "Latin America & the Caribbean (IDA & IBRD countries)", "Middle East & North Africa (IDA & IBRD countries)",
    "South Asia (IDA & IBRD)", "Sub-Saharan Africa (IDA & IBRD countries)", "South Asia", "Upper middle income",
    "World", "Arab World", "North America"
]
pop_total_short = pop_total_short.drop(columns=unions)

# remove countries with at most 10M inhabitants in 2000
# (a missing 2000 value compares False, so such a column is not removed here)
is_small = pd.to_numeric(pop_total_short.loc['2000']) <= 10000000
small_countries = is_small[is_small].index.tolist()

pop_total_short = pop_total_short.drop(columns=small_countries)

# drop columns with more than two missing values
# `thresh` is the minimum number of NON-missing values a column needs to be
# kept, so "at most two missing" translates to "row count minus two"
pop_total_short = pop_total_short.dropna(axis='columns', thresh=len(pop_total_short) - 2)

# order columns by their 2017 population, largest first
pop_total_short = pop_total_short.sort_values(by='2017', axis=1, ascending=False)

In [7]:
# relative population change of every country with respect to its first row
pop_perc = pop_total_short.copy()

for country, population in pop_total_short.items():
    # first row of the column is the base value (0.0 after the division)
    pop_perc[country] = population / population.iloc[0] - 1

pop_perc.head()

Unnamed: 0,China,India,United States,Indonesia,Brazil,Pakistan,Nigeria,Bangladesh,Russian Federation,Mexico,...,Guatemala,Ecuador,Zimbabwe,Cambodia,Cuba,Belgium,Greece,Czech Republic,Portugal,Hungary
2000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001,0.00729025,0.0174986,0.00994656,0.0140213,0.0140517,0.0222212,0.0254301,0.0191966,-0.00423253,0.0132462,...,0.0235352,0.0177501,0.0117748,0.0205819,0.00321109,0.00344543,0.00521238,-0.00375015,0.00707723,-0.00229116
2002,0.0140618,0.0349045,0.0193604,0.0282104,0.0277455,0.0442587,0.0516109,0.038147,-0.00880315,0.0259137,...,0.047903,0.0351159,0.0227678,0.039694,0.00603207,0.00795366,0.00890392,-0.00567008,0.0126078,-0.00512811
2003,0.0203977,0.0522073,0.0281594,0.0425677,0.0410443,0.0662713,0.0786299,0.056526,-0.0132902,0.038545,...,0.072934,0.0523419,0.03368,0.0576654,0.0084433,0.0121822,0.0113145,-0.00595462,0.0164164,-0.00797368
2004,0.0264762,0.069403,0.037719,0.0570776,0.0539164,0.0884834,0.106591,0.0739182,-0.0172549,0.0518672,...,0.0983784,0.0697663,0.0454303,0.0749668,0.0104213,0.0165723,0.0138197,-0.00565204,0.0188498,-0.010168


In [8]:
# turn the index into a regular column and label it 'Year'
pop_perc = pop_perc.reset_index().rename(columns={'index': 'Year'})

In [9]:
# add columns for bottom/top lines with distance representing total population
min_population = min(list(pop_total_short.min()))  # currently unused below
max_population = float(max(list(pop_total_short.max())))

# scale linearly based on max. thickness
max_thickness = 0.10
min_thickness = 0.01  # currently unused below

for col in pop_total_short:
    thickness = pd.to_numeric(pop_total_short[col])/max_population * max_thickness
    # disabled alternative (log scale):
    #thickness = np.log10(pd.to_numeric(pop_total_short[col]))/np.log10(max_population) * max_thickness

    # pop_total_short and pop_perc carry different index labels; re-label the
    # thickness with pop_perc's own index so the arithmetic below aligns row
    # by row, whatever the number of rows is (no hard-coded length)
    thickness.index = pop_perc.index

    # lower limit
    pop_perc[col + '_low'] = pop_perc[col] - thickness/2

    # upper limit
    pop_perc[col + '_up'] = pop_perc[col] + thickness/2

# thickness scale for d3 legend creation
legend_dims = [ n/max_population * max_thickness for n in [1000000000, 500000000, 10000000]]

print(legend_dims)

[0.07212951575849595, 0.036064757879247976, 0.0007212951575849597]


In [10]:
# write the cleaned population table to disk, without the index column
pop_perc.to_csv(path_or_buf='pop_total.csv', index=False)

# Metadata Cleaning

In [11]:
# load the raw country metadata table from the data-sources folder
metadata = pd.read_csv(filepath_or_buffer='data-sources/country_metadata.csv')

In [12]:
# remove columns otherwise not required
cols = ['Country Code', 'SpecialNotes']
metadata = metadata.drop(columns=cols)

# rename the last column ("TableName") to "Country"
# assign a fresh list of labels instead of writing into
# `metadata.columns.values`: an Index is meant to be immutable, and mutating
# its underlying array can leave label lookups out of sync with the header
metadata.columns = list(metadata.columns[:-1]) + ['Country']

# remove all rows with missing values
print(metadata.shape)
metadata = metadata.dropna()

print(metadata.shape)

(263, 3)
(217, 3)


In [13]:
# preview the first five rows of the cleaned metadata
metadata.head(n=5)

Unnamed: 0,Region,IncomeGroup,Country
0,Latin America & Caribbean,High income,Aruba
1,South Asia,Low income,Afghanistan
2,Sub-Saharan Africa,Lower middle income,Angola
3,Europe & Central Asia,Upper middle income,Albania
4,Europe & Central Asia,High income,Andorra


In [14]:
# write the cleaned metadata table to disk, without the index column
metadata.to_csv(path_or_buf='metadata.csv', index=False)