In [2]:
import numpy as np
import pandas as pd


In [3]:
### Settings ###

# Column dtypes passed to pd.read_csv when loading data/all_states.csv.
# The first address slot uses the unsuffixed 'z4type'/'effdate' columns together
# with 'fips1'; slots 2 through 10 carry a numeric suffix on all three columns.
dtype_all_states = dict(
    pid='object',
    idate='float',
    odate='float',
    z4type='category',
    effdate='float',
    fips1='float',
)
dtype_all_states.update(
    (f'{stem}{slot}', kind)
    for slot in range(2, 11)
    for stem, kind in (
        ('z4type', 'category'),
        ('effdate', 'float'),
        ('fips', 'float'),
    )
)


# Columns requested from data/all_states.csv: pid/idate/odate and the first
# slot's z4type/effdate, then the z4type/effdate pair of slots 2 through 10,
# then fips1 through fips10.
usecols_all_states = [
    'pid', 'idate', 'odate', 'z4type', 'effdate',
    *(f'{stem}{slot}' for slot in range(2, 11) for stem in ('z4type', 'effdate')),
    *(f'fips{slot}' for slot in range(1, 11)),
]


# One rename mapping per address slot (1 through 10). The keys of a mapping are
# the wide-format columns belonging to that slot; the values are the shared
# column names they are renamed to. Slot 1 uses the unsuffixed 'z4type' and
# 'effdate' columns, whereas the fips column is numbered for every slot.
list_dict_col_names = [
    {
        'pid': 'pid',
        'idate': 'idate',
        'odate': 'odate',
        f'z4type{suffix}': 'z4type',
        f'effdate{suffix}': 'effdate',
        f'fips{slot}': 'fips',
        'period': 'period',
        'seentime': 'seentime',
    }
    for slot, suffix in ((n, '' if n == 1 else str(n)) for n in range(1, 11))
]


# z4type codes kept when df_all_moves is filtered: rows whose z4type is not one
# of these values are dropped before data/allmoves.csv is written.
# NOTE(review): the meaning of the individual codes is not shown in this notebook.
z4types = ('S', 'H', 'R')

In [4]:
### Create data/allmoves.csv ###

# TODO: Use read_csv(chunksize=)/dask

# Load only the required columns of the wide all_states file, typed on read
df_all_states = pd.read_csv(
    "data/all_states.csv",
    dtype=dtype_all_states,
    usecols=usecols_all_states,
)

# Assign each row a period label from the half-open [start, end) window that
# contains its 'effdate'; rows matching no window get the empty string.
# NOTE(review): only the unsuffixed 'effdate' is consulted, yet 'period' is
# later carried into every address slot -- confirm that effdate2..effdate10
# are not meant to determine the period of their own slot.
choiceList_period = ['Period 2', 'Period 1', 'Period 0']
condList_period = [
    df_all_states['effdate'].between(start, end, inclusive='left')
    for start, end in ((201301, 201801), (200801, 201301), (200301, 200801))
]
df_all_states['period'] = pd.Categorical(
    np.select(condList_period, choiceList_period, default='')
)

# seentime is the plain numeric difference odate - idate.
# NOTE(review): if idate/odate are date-coded numbers this is not an elapsed
# duration -- confirm the encoding of these columns.
df_all_states['seentime'] = df_all_states['odate'].sub(df_all_states['idate'])

# Split the wide frame into one frame per address slot. Each mapping in
# list_dict_col_names selects that slot's columns (its keys) and renames them
# to the shared layout (its values). Iterating the mappings directly keeps the
# number of slots in one place instead of repeating a hard-coded count.
list_df_all_moves = [
    df_all_states[list(col_names)].rename(columns=col_names)
    for col_names in list_dict_col_names
]

# Tag every per-slot frame with its 1-based slot number as the first column
for observation, df_moves in enumerate(list_df_all_moves, start=1):
    df_moves.insert(0, 'address_observation', observation)

# Interleave the per-slot frames: each frame keeps the row index of
# df_all_states, so sorting the concatenation on the index places all slots of
# one source row next to each other. The sort has to be stable -- the default
# kind ('quicksort') is not -- otherwise rows sharing an index are not
# guaranteed to remain in address_observation order.
df_all_moves = (
    pd.concat(list_df_all_moves)
    .sort_index(kind='stable')
    .reset_index(drop=True)
)

# originfips


# prev_effdate


# Keep only the moves that have a fips value and whose z4type is in z4types
df_all_moves = df_all_moves[
    df_all_moves['fips'].notna() & df_all_moves['z4type'].isin(z4types)
]

# Write the result without the index; float columns are written with no decimals
df_all_moves.to_csv("data/allmoves.csv", index=False, float_format="%.0f")

pid            object
idate         float64
odate         float64
z4type       category
effdate       float64
z4type2      category
effdate2      float64
z4type3      category
effdate3      float64
z4type4      category
effdate4      float64
z4type5      category
effdate5      float64
z4type6      category
effdate6      float64
z4type7      category
effdate7      float64
z4type8      category
effdate8      float64
z4type9      category
effdate9      float64
z4type10     category
effdate10     float64
fips1         float64
fips2         float64
fips3         float64
fips4         float64
fips5         float64
fips6         float64
fips7         float64
fips8         float64
fips9         float64
fips10        float64
period       category
seentime      float64
dtype: object


In [None]:
### Extract Period 0 directly ###
