In [29]:
import numpy as np
import pandas as pd


In [30]:
### Settings ###

usecols_all_states = ['pid', 'idate', 'odate', 'z4type', 'effdate']

for i in range(2, 11):
    usecols_all_states.append(f'z4type{i}')
    usecols_all_states.append(f'effdate{i}')

for i in range(1, 11):
    usecols_all_states.append(f'fips{i}')


dtype_all_states = {
    'pid': 'object',
    'idate': 'float',
    'odate': 'float',
    'z4type': 'category',
    'effdate': 'float',
    'fips1': 'float'
}

for i in range(2, 11):
    dtype_all_states[f'z4type{i}'] = 'category'
    dtype_all_states[f'effdate{i}'] = 'float'
    dtype_all_states[f'fips{i}'] = 'float'


na_values_all_states = "Not in California"


list_dict_col_names = [
    {
        'pid': 'pid',
        'idate': 'idate',
        'odate': 'odate',
        'z4type': 'z4type',
        'effdate': 'effdate',
        'fips1': 'fips',
        'seentime': 'seentime',
    }
]

for i in range(2, 11):
    list_dict_col_names.append(
        {
            'pid': 'pid',
            'idate': 'idate',
            'odate': 'odate',
            f'z4type{i}': 'z4type',
            f'effdate{i}': 'effdate',
            f'fips{i}': 'fips',
            'seentime': 'seentime',
        }
    )


z4types = ('S', 'H', 'R')

Main changes:
* Added period 0
* originfips=prev non-null fips &rarr; originfips=prev address_observation fips
  - Causes different moves to be filtered out
* Writes all moves to allmoves.csv
* period=0 &rarr; period=''
* originfips="first record" &rarr; originfips=-1
* prev_effdate=nan &rarr; prev_effdate=-1
* Removed fips leading zeros
* Removed decimal points

Performance:

Process Data-allstates-Three Periods.ipynb lowest of three runs: 0.9s\
&darr;\
Process Data-allstates-Three Periods v2.ipynb lowest of three runs: 0.2s

In [31]:
### Create data/allmoves.csv ###

# TODO: Use read_csv(chunksize=)/dask

# Import columns of interest
df_all_states = pd.read_csv(
    "data/all_states.csv", 
    usecols=usecols_all_states,
    dtype=dtype_all_states,
    na_values=na_values_all_states
)

# Calculate seentime
df_all_states['seentime'] = df_all_states['odate'] - df_all_states['idate']

# Split all_states into individual moves
list_df_all_moves = [
    df_all_states[list_dict_col_names[i].keys()].rename(
        columns=list_dict_col_names[i]
    ) for i in range(10)
]

# Add address_observation column
for i in range(len(list_df_all_moves)):
    list_df_all_moves[i].insert(0, 'address_observation', i + 1)

# Interleave list_df_all_moves
df_all_moves = (
    pd.concat(list_df_all_moves)
        .sort_index(kind='stable')
        .reset_index(drop=True)
)

# Add previous values
df_all_moves[['originfips', 'prev_effdate']] = df_all_moves.groupby('pid')[
    ['fips', 'effdate']
].shift(fill_value=-1)

# Filter df_all_moves
df_all_moves = df_all_moves.dropna(subset=['fips', 'originfips'])
df_all_moves = df_all_moves[df_all_moves['z4type'].isin(z4types)]

# Categorize periods
condList_period = [
    df_all_moves['effdate'].between(201301, 201801, inclusive='left'),
    df_all_moves['effdate'].between(200801, 201301, inclusive='left'),
    df_all_moves['effdate'].between(200301, 200801, inclusive='left'),
]
choiceList_period = ['2', '1', '0']
df_all_moves.insert(
    7,
    'period',
    pd.Categorical(np.select(condList_period, choiceList_period, default=''))
)

# Write data/allmoves.csv
df_all_moves.to_csv("data/allmoves_v2.csv", float_format="%.0f", index=False)

In [37]:
# Extract period 0 fips and originfips
# HACK begin
df_period0 = df_all_moves[df_all_moves['period'] == '0'][['fips', 'originfips']]

df_period2 = df_all_moves[df_all_moves['period'] == '2'][['fips', 'originfips']]
# end HACK

0        6037401312.0
10       6037900704.0
20       6037134800.0
21       6037107020.0
30       6073019301.0
             ...     
49972    6037228800.0
49980    6111007607.0
49981    6037408212.0
49990    6059110110.0
49991    6059110111.0
Name: fips, Length: 6090, dtype: object


In [33]:
### More settings ###

usecols_fips_tract = ['tractid_fips', 'gainers', 'losers']

dtype_fips_tract = {
    'tractid_fips': 'float',
    'gainers': 'bool',
    'losers': 'bool',
}

In [53]:
### Load fips gainers and losers ###

df_fips_tract = pd.read_csv(
    "fips_tracts_cats.csv",
    usecols=usecols_fips_tract,
    dtype=dtype_fips_tract
)

se_gainers = df_fips_tract[df_fips_tract['gainers']]['tractid_fips']
print(se_gainers.head)
se_losers = df_fips_tract[df_fips_tract['losers']]['tractid_fips']

<bound method NDFrame.head of 1881    6.037502e+09
1882    6.037274e+09
1883    6.037572e+09
1884    6.037544e+09
1885    6.037481e+09
            ...     
2914    6.037702e+09
2915    6.037220e+09
2916    6.037223e+09
2917    6.037206e+09
2918    6.037208e+09
Name: tractid_fips, Length: 746, dtype: float64>


In [59]:
df_period2.groupby('fips', as_index=False).size()

Unnamed: 0,fips,size
0,6.025011e+09,3
1,6.025011e+09,1
2,6.037101e+09,3
3,6.037101e+09,1
4,6.037101e+09,1
...,...,...
2468,6.111008e+09,3
2469,6.111008e+09,1
2470,6.111008e+09,1
2471,6.111008e+09,2


In [66]:
pd.merge(se_gainers.rename('fips'), df_period2.groupby('fips', as_index=False).size(), how='left')

Unnamed: 0,fips,size
0,6.037502e+09,2.0
1,6.037274e+09,
2,6.037572e+09,1.0
3,6.037544e+09,
4,6.037481e+09,
...,...,...
741,6.037702e+09,1.0
742,6.037220e+09,2.0
743,6.037223e+09,
744,6.037206e+09,
