# Process the UN medium-fertility projection dataset
This notebook extracts the data I want from the UN population projections. I downloaded the Excel workbook from <https://population.un.org/wpp/Download/Standard/MostUsed/>, then exported the second sheet as a CSV (`un_medium_projections_raw.csv`). This notebook does the remainder of the processing and writes the file `population_projections.csv`.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw CSV export. The first 16 lines of the file are skipped
# (presumably title/notes rows above the real header -- confirm against the
# export), and the first remaining column is used as the row index.
df = pd.read_csv(
    "un_medium_projections_raw.csv",
    skiprows=16,
    index_col=0,
    low_memory=False,
)
df

In [None]:
# List the column labels of the loaded frame so the ones to keep can be picked out.
df.columns

In [None]:
# Narrow the frame to the four columns used below: the location name,
# its type, the year, and the 1 January total population (in thousands).
columns_to_keep = [
    "Region, subregion, country or area *",
    "Type",
    "Year",
    "Total Population, as of 1 January (thousands)",
]
df_filtered = df[columns_to_keep]
df_filtered

In [None]:
# Count how many rows there are for each distinct value of the Type column.
df_filtered["Type"].value_counts()

In [None]:
# Discard every row whose Type is anything other than "Country/Area".
is_country = df_filtered["Type"] == "Country/Area"
df_filtered = df_filtered[is_country]
df_filtered

In [None]:
# The Type column was only needed for the filter above, so remove it.
df_filtered = df_filtered.drop("Type", axis="columns")
df_filtered

In [None]:
# Reshape into a single "Population" column indexed by (Country, Year):
#   1. cast Year to int,
#   2. give the location and population columns their short names,
#   3. move Country and Year into a two-level index.
df_filtered = (
    df_filtered
    .astype({"Year": int})
    .rename(
        columns={
            "Region, subregion, country or area *": "Country",
            "Total Population, as of 1 January (thousands)": "Population",
        }
    )
    .set_index(["Country", "Year"])
)


In [None]:

# Parse the Population column as an int.
#
# The raw values are strings that use whitespace as a thousands separator
# (e.g. "1 234"), so every whitespace character is removed before the cast.
# The pattern \s+ covers leading/trailing whitespace as well as separators
# inside the number, including non-ASCII ones such as the no-break space,
# which a plain " " replacement would leave behind and astype(int) would
# then reject.
#
# astype(str) comes first so the cell can be re-run: once the column is
# already int, the .str accessor is unavailable and would raise.
#
# Values that still are not integers after cleaning (e.g. missing-data
# placeholders) deliberately raise a ValueError in the final cast.
df_filtered["Population"] = (
    df_filtered["Population"]
    .astype(str)
    .str.replace(r"\s+", "", regex=True)
    .astype(int)
)

df_filtered

In [None]:
# Save the cleaned data: write df_filtered, including its index, to
# population_projections.csv (a relative path, so it resolves against the
# current working directory).
df_filtered.to_csv("population_projections.csv")