In [12]:
import pandas as pd
import glob
import os


In [13]:
# State code to state name mapping
dict_state_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
    'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
    'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
    'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
    'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
    'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'District of Columbia'
}

# Read all CSV files from data/pop_raw into df_dict
path_pop_raw = './data/pop_raw/'
list_csv_files = glob.glob(os.path.join(path_pop_raw, '*.csv'))

df_dict = {}
for file_path in list_csv_files:
    file_name = os.path.basename(file_path)
    state_code = file_name.replace('POP.csv', '')
    df_dict[state_code] = pd.read_csv(file_path)


In [14]:
# Aggregate all state dataframes into one
list_dfs = []
for state_code, df in df_dict.items():
    df_temp = df.copy()
    df_temp['state_code'] = state_code
    df_temp['state'] = dict_state_mapping[state_code]
    df_temp.rename(columns={f'{state_code}POP': 'population'}, inplace=True)
    df_temp['observation_date'] = pd.to_datetime(df_temp['observation_date'])
    df_temp['date'] = df_temp['observation_date'].dt.strftime('%Y/%m')
    df_temp.drop(columns=['observation_date'], inplace=True)
    list_dfs.append(df_temp)

df_pop_aggregated = pd.concat(list_dfs, ignore_index=True)


In [15]:
df_pop_aggregated.head()


Unnamed: 0,population,state_code,state,date
0,1754.0,MN,Minnesota,1900/01
1,1805.0,MN,Minnesota,1901/01
2,1862.0,MN,Minnesota,1902/01
3,1901.0,MN,Minnesota,1903/01
4,1945.0,MN,Minnesota,1904/01


In [16]:
path_output = './data/pop/'
os.makedirs(path_output, exist_ok=True)
df_pop_aggregated.to_csv(os.path.join(path_output, 'pop_aggregated.csv'), index=False)
