In [1]:
import io
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from scipy.interpolate import interp1d


In [2]:
# Download the Wikipedia page of historical state populations and parse
# every HTML table on it into a list of DataFrames.
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_historical_population'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
html = response.content
# Newer pandas versions deprecate passing literal HTML to read_html;
# wrapping the bytes in a file-like object is the supported form.
table = pd.read_html(io.BytesIO(html))

In [3]:
# Strip footnote markers of the form " [note 3]" from the three tables
# that are merged afterwards (indices 0, 2 and 3 of the parsed list).
note_pattern = r"\s?\[note \d+\]"
for table_idx in (0, 2, 3):
    table[table_idx].replace(note_pattern, "", regex=True, inplace=True)


In [4]:
# Outer-join the three tables on their shared 'Name' column so that a row
# present in any one of them is kept in the combined frame.
population_by_state = (
    table[0]
    .merge(table[2], how='outer', on='Name')
    .merge(table[3], how='outer', on='Name')
)

In [15]:
# Delete every comma from every string cell (thousands separators in the
# population figures); the regex replace is applied to the whole frame.
population_by_state.replace(r",", "", regex=True, inplace=True)

# Rows listed here are territories or the national total, not states.
non_state_names = [
    'Pacific Trust Territory',
    'Palau',
    'Panama Canal Zone',
    'U.S. Virgin Islands',
    'American Samoa',
    'Northern Mariana Islands',
    'United States',
    'Guam',
    'Puerto Rico',
]
remove_states = population_by_state[population_by_state['Name'].isin(non_state_names)]
population_by_state.drop(index=remove_states.index, inplace=True)

In [6]:
# Build one interpolator per row so that a population can be estimated for
# election years. The x-axis is the labels of every column from the third
# onward, cast to int; the y-values are that row's entries in those columns.
# fill_value='extrapolate' allows evaluation outside the range of x.
year_columns = population_by_state.columns[2:]
x = year_columns.astype('int')
population_by_state['pop_func'] = [
    interp1d(x, row_values, fill_value='extrapolate')
    for row_values in population_by_state[year_columns].to_numpy()
]



In [7]:
# populate election years: add (or overwrite) one column per fourth year
# from 1788 up to but not including 2020, evaluated from each row's interpolator.
for year in range(1788, 2020, 4):
    # interp1d returns a 0-d ndarray for a scalar query; float() unwraps it
    # so the column holds plain floats rather than array objects.
    population_by_state[str(year)] = [float(f(year)) for f in population_by_state['pop_func']]

In [16]:
# Reshape to long format: one row per ('Name', year) pair, with the year
# column label in 'year' and the corresponding value in 'pop'.
key_years = [str(year) for year in range(1788, 2020, 4)]
new_df = population_by_state.melt(
    id_vars='Name',
    value_vars=key_years,
    var_name='year',
    value_name='pop',
)

In [17]:
# Rename the 'Name' column to 'state' in place.
new_df.rename({'Name': 'state'}, axis='columns', inplace=True)

In [18]:
# Write the long-format table to disk without the index. The output
# directory is created first because to_csv raises OSError when the
# parent directory does not exist.
output_path = Path('data/state_populations.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(output_path, index = False)

In [20]:
# Count the distinct values in the 'state' column.
len(new_df['state'].unique())

51