In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
import json

In [3]:
dtypes = {'cfips': str}

data = pd.read_csv('pop_and_5_10_yr_change.csv', dtype=dtypes)
data.head()

Unnamed: 0,cfips,pop_2010,pop_2015,pop_2020,pop_5yr_pct_chg,pop_10yr_pct_chg
0,1001,54761.0,54903.0,56145.0,2.262171,2.527346
1,1003,183121.0,203101.0,229287.0,12.893093,25.210653
2,1005,27325.0,26300.0,24589.0,-6.505703,-10.012809
3,1007,22858.0,22553.0,22136.0,-1.848978,-3.158632
4,1009,57372.0,57535.0,57879.0,0.597897,0.883706


In [5]:
# check if there are any null values
data.isnull().sum()

cfips               0
pop_2010            1
pop_2015            1
pop_2020            1
pop_5yr_pct_chg     1
pop_10yr_pct_chg    1
dtype: int64

In [6]:
# show where the null values are
data[data.isnull().sum(axis=1) > 0]

Unnamed: 0,cfips,pop_2010,pop_2015,pop_2020,pop_5yr_pct_chg,pop_10yr_pct_chg
92,2261,,,,,


### Found data on Google

In [10]:
# impute the row with my own values
data.loc[92, 'pop_2010'] = 9596
data.loc[92, 'pop_2015'] = 9617
data.loc[92, 'pop_2020'] = 9243
data.loc[92, 'pop_5yr_pct_chg'] = (9617-9596)/9596
data.loc[92, 'pop_10yr_pct_chg'] = (9243-9596)/9596

# show the row 92
data.loc[92]

cfips                  02261
pop_2010              9596.0
pop_2015              9617.0
pop_2020              9243.0
pop_5yr_pct_chg     0.002188
pop_10yr_pct_chg   -0.036786
Name: 92, dtype: object

In [11]:
# show where the null values are
data[data.isnull().sum(axis=1) > 0]

Unnamed: 0,cfips,pop_2010,pop_2015,pop_2020,pop_5yr_pct_chg,pop_10yr_pct_chg


In [12]:
# extract data (non cfips column) into numpy array
data_np = data.iloc[:, 1:].to_numpy()
data_np.shape

(3142, 5)

In [13]:
# repeat the data_np array 47 times 
# into shape (47, 5, 3142) 

data_np = np.repeat(data_np[np.newaxis, :, :], 47, axis=0)
data_np.shape

(47, 3142, 5)

In [14]:
# swap the axis so that the shape is (3142, 5, 47)
data_np = np.swapaxes(data_np, 1, 2)
data_np.shape

(47, 5, 3142)

In [19]:
# check if slice match up with the original data
data_np[0, :, 0]

array([5.47610000e+04, 5.49030000e+04, 5.61450000e+04, 2.26217147e+00,
       2.52734610e+00])

In [20]:
data_np[0, :, 1]

array([1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
       2.52106531e+01])

In [21]:
data_np[0:5, :, 1]

array([[1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
        2.52106531e+01],
       [1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
        2.52106531e+01],
       [1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
        2.52106531e+01],
       [1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
        2.52106531e+01],
       [1.83121000e+05, 2.03101000e+05, 2.29287000e+05, 1.28930926e+01,
        2.52106531e+01]])

In [18]:
data.head()

Unnamed: 0,cfips,pop_2010,pop_2015,pop_2020,pop_5yr_pct_chg,pop_10yr_pct_chg
0,1001,54761.0,54903.0,56145.0,2.262171,2.527346
1,1003,183121.0,203101.0,229287.0,12.893093,25.210653
2,1005,27325.0,26300.0,24589.0,-6.505703,-10.012809
3,1007,22858.0,22553.0,22136.0,-1.848978,-3.158632
4,1009,57372.0,57535.0,57879.0,0.597897,0.883706


In [22]:
data_name = 'pop_related'
data_description = {'feature_0' : 'pop_2010',
                    'feature_1' : 'pop_2015',   
                    'feature_2' : 'pop_2020',
                    'feature_3' : 'pop_5yr_pct_chg',
                    'feature_4' : 'pop_10yr_pct_chg'}

# save data in pickle format
with open(f'BLOCKED_{data_name}_data.pkl', 'wb') as f:
    pickle.dump(data_np, f)

# save data in json format
with open(f'BLOCKED_{data_name}_data_description.json', 'w') as f:
    json.dump(data_description, f)