In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [3]:
# Read the raw population-estimates export into a DataFrame.
# The path is relative, so the CSV has to sit in the notebook's working directory.
population_estimates = pd.read_csv(filepath_or_buffer="Population-EstimatesData.csv")
# Preview the first five rows to confirm the file parsed as expected.
population_estimates.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2042,2043,2044,2045,2046,2047,2048,2049,2050,Unnamed: 95
0,Arab World,ARB,Age dependency ratio (% of working-age populat...,SP.POP.DPND,88.06111,89.489513,90.782451,91.898948,92.728007,93.20129,...,54.356396,54.502067,54.676725,54.874161,55.102246,55.331887,55.56218,55.789065,56.00945,
1,Arab World,ARB,"Age dependency ratio, old",SP.POP.DPND.OL,6.591951,6.700903,6.793279,6.867327,6.919224,6.94779,...,13.345549,13.732773,14.132766,14.544481,14.946251,15.349197,15.755873,16.168726,16.589418,
2,Arab World,ARB,"Age dependency ratio, young",SP.POP.DPND.YG,81.324186,82.626198,83.789155,84.767354,85.474209,85.872972,...,40.542064,40.345297,40.162781,39.987572,39.845466,39.69929,39.546177,39.380479,39.19756,
3,Arab World,ARB,"Age population, age 00, female, interpolated",SP.POP.AG00.FE.IN,,,,,,,...,,,,,,,,,,
4,Arab World,ARB,"Age population, age 00, male, interpolated",SP.POP.AG00.MA.IN,,,,,,,...,,,,,,,,,,


In [4]:
# Drop the unnecessary columns and keep the result as a new DataFrame.
# Only the two label columns and the two years being compared are retained.
columns_to_keep = ["Country Name", "Indicator Name", "1970", "2020"]
reduced_df = population_estimates[columns_to_keep]
reduced_df.head(n=5)

Unnamed: 0,Country Name,Indicator Name,1970,2020
0,Arab World,Age dependency ratio (% of working-age populat...,94.226666,61.02012
1,Arab World,"Age dependency ratio, old",7.073558,7.59305
2,Arab World,"Age dependency ratio, young",86.848912,51.954725
3,Arab World,"Age population, age 00, female, interpolated",,
4,Arab World,"Age population, age 00, male, interpolated",,


In [5]:
# Use "Country Name" as the row index so regions can be selected by label with .loc.
df = reduced_df.set_index(keys="Country Name")
df.head(n=5)

Unnamed: 0_level_0,Indicator Name,1970,2020
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,Age dependency ratio (% of working-age populat...,94.226666,61.02012
Arab World,"Age dependency ratio, old",7.073558,7.59305
Arab World,"Age dependency ratio, young",86.848912,51.954725
Arab World,"Age population, age 00, female, interpolated",,
Arab World,"Age population, age 00, male, interpolated",,


In [10]:
# Restrict the data to the eight regions used in this analysis.
# Passing a list of labels to .loc keeps every row whose "Country Name" index
# matches one of them.
regions_of_interest = [
    "Central Europe and the Baltics",
    "East Asia & Pacific",
    "European Union",
    "Latin America & Caribbean",
    "Middle East & North Africa",
    "South Asia",
    "Sub-Saharan Africa",
    "United States",
]
reduced_df2 = df.loc[regions_of_interest]
reduced_df2

Unnamed: 0_level_0,Indicator Name,1970,2020
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central Europe and the Baltics,Age dependency ratio (% of working-age populat...,5.237557e+01,5.316262e+01
Central Europe and the Baltics,"Age dependency ratio, old",1.438745e+01,2.976632e+01
Central Europe and the Baltics,"Age dependency ratio, young",3.792280e+01,2.335613e+01
Central Europe and the Baltics,"Age population, age 00, female, interpolated",,
Central Europe and the Baltics,"Age population, age 00, male, interpolated",,
...,...,...,...
United States,Rural population growth (annual %),1.511424e-01,
United States,Sex ratio at birth (male births per female bir...,,
United States,Urban population,1.509224e+08,2.743740e+08
United States,Urban population (% of total population),7.360200e+01,8.266400e+01


In [12]:
# Turn the "Country Name" index back into an ordinary column.
# "Country Name" is the only index level here, so a plain reset_index() moves
# it into the columns and leaves a default 0..n-1 integer index behind.
reduced_df3 = reduced_df2.reset_index()
reduced_df3

Unnamed: 0,Country Name,Indicator Name,1970,2020
0,Central Europe and the Baltics,Age dependency ratio (% of working-age populat...,5.237557e+01,5.316262e+01
1,Central Europe and the Baltics,"Age dependency ratio, old",1.438745e+01,2.976632e+01
2,Central Europe and the Baltics,"Age dependency ratio, young",3.792280e+01,2.335613e+01
3,Central Europe and the Baltics,"Age population, age 00, female, interpolated",,
4,Central Europe and the Baltics,"Age population, age 00, male, interpolated",,
...,...,...,...,...
1395,United States,Rural population growth (annual %),1.511424e-01,
1396,United States,Sex ratio at birth (male births per female bir...,,
1397,United States,Urban population,1.509224e+08,2.743740e+08
1398,United States,Urban population (% of total population),7.360200e+01,8.266400e+01


In [13]:
# Sanity check: after resetting the index, "Country Name" should be listed
# among the columns again rather than serving as the index.
reduced_df3.columns

Index(['Country Name', 'Indicator Name', '1970', '2020'], dtype='object')

In [14]:
# Re-index by "Indicator Name" so individual indicators can be selected by label with .loc.
reduced_df4 = reduced_df3.set_index(keys="Indicator Name")
reduced_df4.head(n=5)

Unnamed: 0_level_0,Country Name,1970,2020
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age dependency ratio (% of working-age population),Central Europe and the Baltics,52.375566,53.162625
"Age dependency ratio, old",Central Europe and the Baltics,14.387453,29.766316
"Age dependency ratio, young",Central Europe and the Baltics,37.922797,23.356127
"Age population, age 00, female, interpolated",Central Europe and the Baltics,,
"Age population, age 00, male, interpolated",Central Europe and the Baltics,,


In [19]:
# Keep only the two indicators this analysis compares: rural and urban population.
indicators_of_interest = ["Rural population", "Urban population"]
reduced_df5 = reduced_df4.loc[indicators_of_interest]
reduced_df5

Unnamed: 0_level_0,Country Name,1970,2020
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rural population,Central Europe and the Baltics,48793751.0,38141000.0
Rural population,East Asia & Pacific,968401382.0,924184900.0
Rural population,European Union,151721048.0,122689000.0
Rural population,Latin America & Caribbean,122226648.0,123659800.0
Rural population,Middle East & North Africa,79408841.0,158067000.0
Rural population,South Asia,580691836.0,1209094000.0
Rural population,Sub-Saharan Africa,237868093.0,667404000.0
Rural population,United States,54129627.0,57541000.0
Urban population,Central Europe and the Baltics,50340797.0,63708000.0
Urban population,East Asia & Pacific,321982682.0,1427118000.0


In [30]:
# Check the column dtypes before converting the year columns.
# NOTE(review): the preview of reduced_df5 shows float values (e.g. 48793751.0),
# so a fresh Restart & Run All presumably reports float64 here. The int64 in the
# saved output suggests this cell was last executed after the conversion cell
# had already run — confirm by re-running the notebook top to bottom.
reduced_df5.dtypes

Country Name    object
1970             int64
2020             int64
dtype: object

In [31]:
# Convert the 1970 and 2020 columns from floats to 64-bit integers.
# `astype` casts each column in one vectorized step, replacing the per-element
# `applymap(np.int64)` call (`DataFrame.applymap` is deprecated since pandas 2.1).
# Rebinding the name to the frame returned by `astype` also avoids assigning
# into a frame that was itself produced by a `.loc` selection.
# Like the element-wise version, the cast truncates any fractional part and
# raises if a value is NaN or infinite.
cols = ['1970', '2020']
reduced_df5 = reduced_df5.astype({col: np.int64 for col in cols})
reduced_df5

Unnamed: 0_level_0,Country Name,1970,2020
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rural population,Central Europe and the Baltics,48793751,38141000
Rural population,East Asia & Pacific,968401382,924184900
Rural population,European Union,151721048,122689000
Rural population,Latin America & Caribbean,122226648,123659800
Rural population,Middle East & North Africa,79408841,158067000
Rural population,South Asia,580691836,1209094000
Rural population,Sub-Saharan Africa,237868093,667404000
Rural population,United States,54129627,57541000
Urban population,Central Europe and the Baltics,50340797,63708000
Urban population,East Asia & Pacific,321982682,1427118500


In [32]:
# Turn the "Indicator Name" index back into an ordinary column.
# It is the only index level, so reset_index() with no arguments moves it into
# the columns and restores a default 0..n-1 integer index.
reduced_df6 = reduced_df5.reset_index()
reduced_df6

Unnamed: 0,Indicator Name,Country Name,1970,2020
0,Rural population,Central Europe and the Baltics,48793751,38141000
1,Rural population,East Asia & Pacific,968401382,924184900
2,Rural population,European Union,151721048,122689000
3,Rural population,Latin America & Caribbean,122226648,123659800
4,Rural population,Middle East & North Africa,79408841,158067000
5,Rural population,South Asia,580691836,1209094000
6,Rural population,Sub-Saharan Africa,237868093,667404000
7,Rural population,United States,54129627,57541000
8,Urban population,Central Europe and the Baltics,50340797,63708000
9,Urban population,East Asia & Pacific,321982682,1427118500


In [34]:
# Reorder the columns so that "Country Name" comes before "Indicator Name".
column_order = ["Country Name", "Indicator Name", "1970", "2020"]
organized_df = reduced_df6[column_order]
organized_df.head(n=5)

Unnamed: 0,Country Name,Indicator Name,1970,2020
0,Central Europe and the Baltics,Rural population,48793751,38141000
1,East Asia & Pacific,Rural population,968401382,924184900
2,European Union,Rural population,151721048,122689000
3,Latin America & Caribbean,Rural population,122226648,123659800
4,Middle East & North Africa,Rural population,79408841,158067000


In [36]:
# Write the reorganized DataFrame out to a new CSV file.
# index=False leaves the integer row labels out of the file; header=True writes
# the column names as the first line.
output_csv = "Population-Estimates_Emily.csv"
organized_df.to_csv(output_csv, header=True, index=False)