In [1]:
import numpy as np
import pandas as pd


Main changes:
* Added period 0
* originfips=prev non-null fips &rarr; originfips=prev address_observation fips
  - Causes different moves to be filtered out
* Writes all moves to data/allmoves_v2.csv
* period=0 &rarr; period=''
* originfips="first record" &rarr; originfips=-1
* prev_effdate=nan &rarr; prev_effdate=-1
* Removed fips leading zeros
* Removed decimal points

Performance:

Process Data-allstates-Three Periods.ipynb lowest of three runs: 0.9s\
&darr;\
Process Data-allstates-Three Periods v2.ipynb lowest of three runs: 0.2s

In [31]:
### Create data/allmoves_v2.csv ###

# TODO: Use read_csv(chunksize=)/dask

# Options
# z4type values that are kept when the moves are filtered further down.
z4types = ('S', 'H', 'R')

# Settings
# Columns to load: the shared columns, then the z4type/effdate pair of address
# slots 2-10, then the fips column of every slot (fips1 .. fips10).
usecols_all_states = [
    'pid', 'idate', 'odate', 'z4type', 'effdate',
    *(
        f'{field}{slot}'
        for slot in range(2, 11)
        for field in ('z4type', 'effdate')
    ),
    *(f'fips{slot}' for slot in range(1, 11)),
]

# Parser dtypes: identifiers stay strings, dates and fips codes are floats
# (so missing values can be represented), z4type columns are categorical.
dtype_all_states = {
    'pid': 'object',
    'idate': 'float',
    'odate': 'float',
    'z4type': 'category',
    'effdate': 'float',
    'fips1': 'float',
    **{
        f'{field}{slot}': kind
        for slot in range(2, 11)
        for field, kind in (
            ('z4type', 'category'),
            ('effdate', 'float'),
            ('fips', 'float'),
        )
    },
}

# Cell text that read_csv should treat as missing.
na_values_all_states = "Not in California"


def _col_names_for_observation(slot):
    """Return the {source column: common column} map for one address slot.

    Slot 1 uses the unsuffixed ``z4type``/``effdate`` columns together with
    ``fips1``; slots 2-10 use the ``<name><slot>`` columns. The key order is
    the column order of the per-slot frame built from this map.
    """
    suffix = '' if slot == 1 else str(slot)
    return {
        'pid': 'pid',
        'idate': 'idate',
        'odate': 'odate',
        f'z4type{suffix}': 'z4type',
        f'effdate{suffix}': 'effdate',
        f'fips{slot}': 'fips',
        'seentime': 'seentime',
    }


# One rename map per address slot, in slot order (index 0 -> slot 1).
list_dict_col_names = [_col_names_for_observation(slot) for slot in range(1, 11)]

# Import columns of interest
# Only the listed columns are parsed, with the dtypes declared above; cells
# equal to na_values_all_states are read as missing.
arg_all_states = {
    'usecols': usecols_all_states,
    'dtype': dtype_all_states,
    'na_values': na_values_all_states,
}
df_all_states = pd.read_csv("data/all_states.csv", **arg_all_states)

# Calculate seentime
# seentime = odate - idate, computed per row on the raw float values.
df_all_states['seentime'] = df_all_states['odate'].sub(df_all_states['idate'])

# Split all_states into individual moves
# One frame per address slot: pick that slot's columns, rename them to the
# common names, and tag every row with its 1-based address_observation.
list_df_all_moves = []
for observation, col_names in enumerate(list_dict_col_names, start=1):
    df_observation = df_all_states[list(col_names)].rename(columns=col_names)
    df_observation.insert(0, 'address_observation', observation)
    list_df_all_moves.append(df_observation)

# Interleave list_df_all_moves
# Every per-slot frame keeps the row index of df_all_states, so a stable sort
# on the index puts the observations of one source row next to each other
# while preserving their address_observation order.
df_all_moves = pd.concat(list_df_all_moves).sort_index(kind='stable')
df_all_moves = df_all_moves.reset_index(drop=True)

# Add previous values
# Within each pid, shift fips/effdate down by one row so every observation
# carries the values of the preceding one; rows with no predecessor get -1.
df_prev = df_all_moves.groupby('pid')[['fips', 'effdate']].shift(fill_value=-1)
df_all_moves['originfips'] = df_prev['fips']
df_all_moves['prev_effdate'] = df_prev['effdate']

# Filter df_all_moves
# Keep rows where both fips and originfips are present and whose z4type is
# one of the selected z4types.
has_both_fips = df_all_moves[['fips', 'originfips']].notna().all(axis=1)
has_wanted_z4type = df_all_moves['z4type'].isin(z4types)
df_all_moves = df_all_moves.loc[has_both_fips & has_wanted_z4type]

# Categorize periods
# Each label maps to a half-open effdate range [start, end); rows outside
# every range get the empty label ''.
# NOTE(review): effdate values look like YYYYMM numbers - confirm.
period_bounds = {
    '2': (201301, 201801),
    '1': (200801, 201301),
    '0': (200301, 200801),
}
condList_period = [
    df_all_moves['effdate'].between(start, end, inclusive='left')
    for start, end in period_bounds.values()
]
choiceList_period = list(period_bounds)
cat_period = pd.Categorical(
    np.select(condList_period, choiceList_period, default='')
)
# Inserted as the column at position 7.
df_all_moves.insert(7, 'period', cat_period)

# Write data/allmoves_v2.csv
# Floats are written without decimals; the row index is not written.
df_all_moves.to_csv("data/allmoves_v2.csv", float_format="%.0f", index=False)

In [12]:
### Load fips gainers and losers ###

# Settings
# Tract id is read as float; the gainers/losers flags are read as booleans.
dtype_fips_tract = {
    'tractid_fips': 'float',
    'gainers': 'bool',
    'losers': 'bool',
}
usecols_fips_tract = list(dtype_fips_tract)

# Import fips_tracts_cats.csv
df_fips_tract = pd.read_csv(
    "fips_tracts_cats.csv",
    dtype=dtype_fips_tract,
    usecols=usecols_fips_tract,
)

# Get Series of gainers and losers
# Each Series holds the tractid_fips of the rows whose flag is True.
se_gainers = df_fips_tract.loc[df_fips_tract['gainers'], 'tractid_fips']
se_losers = df_fips_tract.loc[df_fips_tract['losers'], 'tractid_fips']

In [13]:
# HACK begin

# Settings
arg_all_moves = {
    'usecols': ['period', 'fips', 'originfips'],
    'dtype': {'period': 'category', 'fips': 'float', 'originfips': 'float'},
    'na_values': "nan",
}

# Load allmoves
# NOTE: this rebinds df_all_moves to a frame holding only the three columns
# listed in usecols, read back from the CSV.
df_all_moves = pd.read_csv("data/allmoves_v2.csv", **arg_all_moves)

# Extract fips and originfips for each of the periods '0', '1' and '2'
df_period0, df_period1, df_period2 = (
    df_all_moves.loc[df_all_moves['period'] == label, ['fips', 'originfips']]
    for label in ('0', '1', '2')
)

# end HACK

Main changes:
* Removed fips leading zeros

In [14]:
# TODO: Test efficiency

# Settings
# Output files have no header and no index; tracts without any move have a
# missing count after the left merge and are written as "0".
arg_to_csv = {
    'na_rep': "0",
    'float_format': "%.0f",
    'header': False,
    'index': False,
}


def _count_moves_per_tract(se_tracts, df_moves, fips_col):
    """Count the rows of ``df_moves`` for every tract in ``se_tracts``.

    Parameters
    ----------
    se_tracts : pandas.Series
        Tract ids to report on; one output row is produced per entry.
    df_moves : pandas.DataFrame
        Moves of one period; must contain the column ``fips_col``.
    fips_col : str
        Column of ``df_moves`` to count by: ``'fips'`` for gains,
        ``'originfips'`` for losses.

    Returns
    -------
    pandas.DataFrame
        Columns ``fips_col`` and ``size``. ``size`` is missing (NaN) for
        tracts that do not occur in ``df_moves``, because the merge is a
        left join on the tract list.
    """
    df_counts = df_moves.groupby(fips_col, as_index=False).size()
    return pd.merge(
        se_tracts.rename(fips_col), df_counts, how='left', on=fips_col
    )


# Count fips
# Gains are counted by destination (fips), losses by origin (originfips).
df_gain_total0 = _count_moves_per_tract(se_gainers, df_period0, 'fips')
df_loss_total0 = _count_moves_per_tract(se_losers, df_period0, 'originfips')

df_gain_total1 = _count_moves_per_tract(se_gainers, df_period1, 'fips')
df_loss_total1 = _count_moves_per_tract(se_losers, df_period1, 'originfips')

df_gain_total2 = _count_moves_per_tract(se_gainers, df_period2, 'fips')
df_loss_total2 = _count_moves_per_tract(se_losers, df_period2, 'originfips')

# Write output files
# One gain file and one loss file per period, written in period order.
for period, (df_gain, df_loss) in enumerate(
    [
        (df_gain_total0, df_loss_total0),
        (df_gain_total1, df_loss_total1),
        (df_gain_total2, df_loss_total2),
    ]
):
    df_gain.to_csv(f"data/gaintotal_p{period}.csv", **arg_to_csv)
    df_loss.to_csv(f"data/losstotal_p{period}.csv", **arg_to_csv)