In [2]:
import numpy as np
import pandas as pd


In [3]:
### Settings ###

dtype_all_states = {
    'pid': 'object',
    'idate': 'float',
    'odate': 'float',
    'z4type': 'category',
    'effdate': 'float',
    'fips1': 'float'
}

for i in range(2, 11):
    dtype_all_states[f'z4type{i}'] = 'category'
    dtype_all_states[f'effdate{i}'] = 'float'
    dtype_all_states[f'fips{i}'] = 'float'


usecols_all_states = ['pid', 'idate', 'odate', 'z4type', 'effdate']

for i in range(2, 11):
    usecols_all_states.append(f'z4type{i}')
    usecols_all_states.append(f'effdate{i}')

for i in range(1, 11):
    usecols_all_states.append(f'fips{i}')


list_dict_col_names = [
    {
        'pid': 'pid',
        'idate': 'idate',
        'odate': 'odate',
        'z4type': 'z4type',
        'effdate': 'effdate',
        'fips1': 'fips',
        'seentime': 'seentime',
    }
]

for i in range(2, 11):
    list_dict_col_names.append(
        {
            'pid': 'pid',
            'idate': 'idate',
            'odate': 'odate',
            f'z4type{i}': 'z4type',
            f'effdate{i}': 'effdate',
            f'fips{i}': 'fips',
            'seentime': 'seentime',
        }
    )


z4types = ('S', 'H', 'R')

Main changes:
* Added period 0
* originfips=prev non-null fips &rarr; originfips=prev address_observation fips
  - Causes different moves to be filtered out
* Writes all moves to allmoves.csv
* period=0 &rarr; period=''
* originfips="first record" &rarr; originfips=-1
* prev_effdate=nan &rarr; prev_effdate=-1
* Removed fips leading zeros
* Removed decimal points

Performance:

Process Data-allstates-Three Periods.ipynb lowest of three runs: 0.9s\
&darr;\
Process Data-allstates-Three Periods v2.ipynb lowest of three runs: 0.2s

In [14]:
### Create data/allmoves.csv ###

# TODO: Use read_csv(chunksize=)/dask

# Import columns of interest
df_all_states = pd.read_csv(
    "data/all_states.csv", 
    usecols=usecols_all_states,
    dtype=dtype_all_states
)

# Calculate seentime
df_all_states['seentime'] = df_all_states['odate'] - df_all_states['idate']
print(">>>seentime<<<")
print(df_all_states.head)

# Split all_states into individual moves
list_df_all_moves = [
    df_all_states[list_dict_col_names[i].keys()].rename(
        columns=list_dict_col_names[i]
    ) for i in range(10)
]

# Add address_observation column
for i in range(len(list_df_all_moves)):
    list_df_all_moves[i].insert(0, 'address_observation', i + 1)
for i, df in enumerate(list_df_all_moves):
    print(f">>>df{i + 1}<<<")
    print(df.head)

# Interleave list_df_all_moves
df_all_moves = pd.concat(list_df_all_moves).sort_index(kind='stable').reset_index(drop=True)
print(">>>interleave<<<")
print(df_all_moves.head)

# Add previous values
df_all_moves[['originfips', 'prev_effdate']] = df_all_moves.groupby('pid')[
    ['fips', 'effdate']
].shift(fill_value=-1)
print(">>>prev values<<<")
print(df_all_moves.head)

# Filter df_all_moves
df_all_moves = df_all_moves.dropna(subset=['fips', 'originfips'])
df_all_moves = df_all_moves[df_all_moves['z4type'].isin(z4types)]

# Categorize periods
condList_period = [
    df_all_moves['effdate'].between(201301, 201801, inclusive='left'),
    df_all_moves['effdate'].between(200801, 201301, inclusive='left'),
    df_all_moves['effdate'].between(200301, 200801, inclusive='left'),
]
choiceList_period = ['2', '1', '0']
df_all_moves.insert(
    7,
    'period',
    pd.Categorical(np.select(condList_period, choiceList_period, default=''))
)

# Write data/allmoves.csv
df_all_moves.to_csv("data/allmoves_v2.csv", float_format="%.0f", index=False)

>>>seentime<<<
<bound method NDFrame.head of                 pid       idate       odate z4type   effdate z4type2  \
0   Y39398460922893  20170301.0  20210101.0      S  201703.0       H   
1   Y39398460922808  20170301.0  20210101.0      H  201703.0       H   
2   Y39398460922850  20160501.0  20160501.0      H  201605.0       H   
3   Y39398460922834  20170201.0  20171001.0      S  201707.0       S   
4   Y39398460922655  20170201.0  20200201.0      S  201702.0       S   
..              ...         ...         ...    ...       ...     ...   
95  Y39398460938574  20170301.0  20210101.0      S  202006.0       S   
96  Y39398460938509  20170301.0  20210101.0      S  201703.0       S   
97  Y39398460938264  20170201.0  20190401.0      S  201904.0       H   
98  Y39398460938252  20170301.0  20170301.0      S  201703.0       S   
99  Y39398460938331  20170301.0  20170501.0      H  201703.0       S   

    effdate2 z4type3  effdate3 z4type4  ...         fips2         fips3  \
0   201703.0   

In [5]:
### Extract Period 0 directly ###
