In [44]:
from glob import glob
import numpy as np
import pandas as pd
from pathlib import Path

def dont_dropna(df_areas):
    """Link every area record to the preceding record with the same index label.

    No rows are dropped, so missing ``fips`` values pass through unchanged.

    Parameters
    ----------
    df_areas : pandas.DataFrame
        Must contain ``effdate`` and ``fips`` columns. Rows sharing an index
        label are treated as one sequence, in their existing row order.

    Returns
    -------
    pandas.DataFrame
        Columns ``effdate``, ``destfips`` (the row's own ``fips``) and
        ``origfips`` (the previous ``fips`` under the same index label, or
        the string "first record" when there is no previous row).
    """
    moves = df_areas[['effdate', 'fips']].rename(columns={'fips': 'destfips'})
    # Shift within each index label so a sequence never borrows the last
    # fips of the sequence stored above it.
    previous_fips = (
        moves['destfips']
        .groupby(moves.index)
        .shift(fill_value="first record")
    )
    return moves.assign(origfips=previous_fips)

def preshift_dropna(df_areas):
    """Link area records after discarding incomplete ones.

    Rows with a missing ``effdate`` or ``fips`` are removed *before* the
    previous-record link is built, so each remaining row is linked to the
    nearest earlier complete row under the same index label.

    Parameters
    ----------
    df_areas : pandas.DataFrame
        Must contain ``effdate`` and ``fips`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``effdate``, ``destfips`` and ``origfips``; ``origfips`` is
        "first record" for the first remaining row of each index label.
    """
    complete = df_areas[['effdate', 'fips']].dropna()
    moves = complete.rename(columns={'fips': 'destfips'})
    previous_fips = (
        moves['destfips']
        .groupby(moves.index)
        .shift(fill_value="first record")
    )
    return moves.assign(origfips=previous_fips)

def postshift_dropna(df_areas):
    """Link area records, then discard every incomplete move.

    The previous-record link is built on all rows first; afterwards any row
    with a missing ``effdate``, ``destfips`` or ``origfips`` is removed. A
    record with a missing ``fips`` therefore removes both the move into it
    and the move out of it.

    Parameters
    ----------
    df_areas : pandas.DataFrame
        Must contain ``effdate`` and ``fips`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``effdate``, ``destfips`` and ``origfips`` with no missing
        values.
    """
    moves = df_areas[['effdate', 'fips']].rename(columns={'fips': 'destfips'})
    previous_fips = (
        moves['destfips']
        .groupby(moves.index)
        .shift(fill_value="first record")
    )
    # dropna origfips & destfips
    return moves.assign(origfips=previous_fips).dropna()

In [45]:
### Options ###
# Rows per chunk: passed to read_csv as `chunksize` and embedded in the
# status-file and output-folder names.
lines_per_chunk = 1_000_000
# z4type values that process_chunk keeps when filtering areas.
z4types = ('H', 'S', 'R')
# Function process_chunk calls to link filtered areas into moves
# (dont_dropna, preshift_dropna or postshift_dropna).
create_moves = dont_dropna
# A move is kept when characters [2:5] of its origfips or destfips are in this tuple.
county_codes = ("037", "059")

In [52]:
### Extract all moves ###

# Settings
# Columns to read: the un-numbered first z4type/effdate pair, the numbered
# pairs 2..10, then fips1..fips10.
usecols_all_states = ['z4type', 'effdate']
usecols_all_states += [
    f'{field}{slot}'
    for slot in range(2, 11)
    for field in ('z4type', 'effdate')
]
usecols_all_states += [f'fips{slot}' for slot in range(1, 11)]

# Column name -> dtype for read_csv. The first area slot uses the
# un-numbered 'z4type'/'effdate' names but a numbered 'fips1'.
dtype_all_states = {
    'z4type': 'category',
    'effdate': 'float',
    'fips1': 'object',
    **{
        f'{field}{slot}': kind
        for slot in range(2, 11)
        for field, kind in (
            ('z4type', 'category'),
            ('effdate', 'float'),
            ('fips', 'object'),
        )
    },
}

# One rename mapping per area slot: source column -> shared column name.
# Slot 1 is spelled out because only its fips column carries a number.
list_dict_col_names = [
    {'z4type': 'z4type', 'effdate': 'effdate', 'fips1': 'fips'},
    *(
        {f'{field}{slot}': field for field in ('z4type', 'effdate', 'fips')}
        for slot in range(2, 11)
    ),
]


# Load all_states
# With `chunksize` set, read_csv returns an iterator of DataFrames instead of
# a single frame. The iterator is consumed once, so re-run this cell before
# iterating over it again.
it_all_states = pd.read_csv(
    "data/all_states.csv",
    chunksize=lines_per_chunk,
    dtype=dtype_all_states,
    usecols=usecols_all_states,
)

In [53]:
def process_chunk(index, chunk, finished_chunks, status_file):
    """Extract the moves in one chunk of all_states and write them out per period.

    Parameters
    ----------
    index : int
        Position of the chunk; used for the skip check, the output file
        names and the status line.
    chunk : pandas.DataFrame
        Rows holding the columns named by the keys of the mappings in
        ``list_dict_col_names``.
    finished_chunks : container of int
        Indices that were already processed; such chunks are skipped.
    status_file : writable text file
        Receives the chunk index on its own line once the chunk's CSV files
        have been written.

    Returns
    -------
    str
        ``"Chunk {index} done"``, whether the chunk was processed or skipped.

    Notes
    -----
    Reads the notebook-level ``list_dict_col_names``, ``z4types``,
    ``create_moves``, ``county_codes`` and ``lines_per_chunk``.
    """
    if index in finished_chunks:
        return f"Chunk {index} done"

    # Split rows into individual areas: one frame per area slot, renamed to
    # the shared column names so the slots can be stacked
    list_df_all_areas = [
        chunk[list(col_names)].rename(columns=col_names)
        for col_names in list_dict_col_names
    ]

    # Recombine all areas + sort by effdate. The stable sort keeps the
    # left-to-right slot order among rows with an equal effdate.
    # 'z4type' is cast to object because concat keeps it categorical when
    # every slot happens to share identical categories, and assigning the
    # 'empty' marker below would then raise (it is not an existing category).
    df_all_areas = (
        pd.concat(list_df_all_areas)
            .dropna(subset=['effdate'])
            .sort_values('effdate', kind='stable')
            .astype({'z4type': object})
    )

    # All effdates that do not have a z4type in z4types
    bi_all_dropped = ~(
        df_all_areas['z4type'].isin(z4types)
            .groupby([df_all_areas.index, df_all_areas['effdate']])
            .transform('any')
    )

    # Change values so selected effdates are not removed
    df_all_areas.loc[bi_all_dropped, 'z4type'] = 'empty'
    df_all_areas.loc[bi_all_dropped, 'fips'] = ""

    # Filter by Zip+4 type
    z4types_mask = (*z4types, 'empty')

    df_filtered_areas = df_all_areas[df_all_areas['z4type'].isin(z4types_mask)]

    # Choose leftmost fips of each effdate
    # (GroupBy.first takes the first non-null value per column)
    df_filtered_areas = (
        df_filtered_areas.groupby([df_filtered_areas.index, 'effdate']).first()
    )
    df_filtered_areas = df_filtered_areas.reset_index('effdate')

    # Link previous & next areas as moves
    df_all_moves = create_moves(df_filtered_areas)

    # Filter by county code (characters 2-4 of the fips string)
    df_filtered_moves = df_all_moves[
        df_all_moves['origfips'].str[2:5].isin(county_codes) |
            df_all_moves['destfips'].str[2:5].isin(county_codes)
    ]

    # Separate moves into periods
    def extract_period(start, stop):
        """Return origfips/destfips of moves with start <= effdate < stop."""
        in_period = df_filtered_moves['effdate'].between(
            start, stop, inclusive='left'
        )
        return df_filtered_moves.loc[in_period, ['origfips', 'destfips']]

    period_outputs = (
        ((200301, 200801),
         f"data/direct/03_07_moves-{lines_per_chunk}/{index}.csv"),
        ((200801, 201301),
         f"data/direct/08_12_moves-{lines_per_chunk}/{index}.csv"),
        ((201301, 201801),
         f"data/direct/13_17_moves-{lines_per_chunk}/{index}.csv"),
    )

    # Write to files
    for (start, stop), csv_path in period_outputs:
        extract_period(start, stop).to_csv(csv_path, index=False)

    # Update status; flush so the record survives an interrupted kernel and
    # the chunk is not reprocessed on the next run
    status_file.write(f"{index}\n")
    status_file.flush()

    return f"Chunk {index} done"


# Create status file
# Read the indices of chunks finished by earlier runs. The file is created
# first when missing, so `finished_chunks` is always defined (an empty set on
# the first run) instead of being left unbound.
status_path = Path(f"direct_status-{lines_per_chunk}.txt")
status_path.touch(exist_ok=True)
with status_path.open() as f:
    # Skip blank lines so a stray newline cannot break int()
    finished_chunks = {int(line) for line in f if line.strip()}

# Create chunk folders
# One output folder per period; existing folders are left untouched.
for chunk_dir in (
    f"data/direct/03_07_moves-{lines_per_chunk}",
    f"data/direct/08_12_moves-{lines_per_chunk}",
    f"data/direct/13_17_moves-{lines_per_chunk}",
):
    Path(chunk_dir).mkdir(parents=True, exist_ok=True)

# Process all states
# The status file is opened in append mode so indices recorded by earlier
# runs are kept; process_chunk adds one line per newly finished chunk.
with open(f"direct_status-{lines_per_chunk}.txt", 'a') as status_file:
    # enumerate() numbers the chunks in the order the reader yields them
    for index, chunk in enumerate(it_all_states):
        process_chunk(index, chunk, finished_chunks, status_file)

In [55]:
### Load all moves ###

# Settings
# Read both fips columns as 'object' when loading the chunk CSVs
dtype_all_moves = dict.fromkeys(('origfips', 'destfips'), 'object')


# Read chunks
def _concat_chunk_files(pattern):
    """Read every CSV matching ``pattern`` and stack them into one frame."""
    return pd.concat(
        [
            pd.read_csv(chunk_file, dtype=dtype_all_moves)
            for chunk_file in glob(pattern)
        ]
    )


df_03_07_moves = _concat_chunk_files(
    f"data/direct/03_07_moves-{lines_per_chunk}/*.csv"
)
df_08_12_moves = _concat_chunk_files(
    f"data/direct/08_12_moves-{lines_per_chunk}/*.csv"
)
df_13_17_moves = _concat_chunk_files(
    f"data/direct/13_17_moves-{lines_per_chunk}/*.csv"
)

In [33]:
### Data analysis ###

# Settings
# Column name -> dtype for fips_tracts_cats.csv; the columns to read are
# exactly the keys of this mapping, in the same order.
dtype_fips_tract = dict(tractid_fips='object', gainers='bool', losers='bool')

usecols_fips_tract = list(dtype_fips_tract)


# Load fips_tracts_cats
df_fips_tract = pd.read_csv(
    "fips_tracts_cats.csv", dtype=dtype_fips_tract, usecols=usecols_fips_tract
)

# Get Series of gainers and losers: the tract ids whose boolean flag is set
se_gainers = df_fips_tract.loc[df_fips_tract['gainers'], 'tractid_fips']
se_losers = df_fips_tract.loc[df_fips_tract['losers'], 'tractid_fips']

In [34]:
# Compute gain & loss totals

def move_totals(df_moves):
    """Count moves into each gainer tract and out of each loser tract.

    Parameters
    ----------
    df_moves : pandas.DataFrame
        Must contain ``origfips`` and ``destfips`` columns, one row per move.

    Returns
    -------
    tuple of pandas.Series
        ``(gain_total, loss_total)``, both named ``count``. ``gain_total``
        is indexed by ``se_gainers`` and counts rows per ``destfips``;
        ``loss_total`` is indexed by ``se_losers`` and counts rows per
        ``origfips``. Tracts without any move get 0.
    """
    def count_moves(fips_column, tracts):
        # Rows per fips value, restricted to (and zero-filled for) `tracts`
        per_fips = df_moves.groupby(fips_column).size()
        return per_fips.reindex(tracts, fill_value=0).rename('count')

    return count_moves('destfips', se_gainers), count_moves('origfips', se_losers)

# One (gain total, loss total) pair of Series per period
se_03_07_gain_total, se_03_07_loss_total = move_totals(df_03_07_moves)
se_08_12_gain_total, se_08_12_loss_total = move_totals(df_08_12_moves)
se_13_17_gain_total, se_13_17_loss_total = move_totals(df_13_17_moves)

In [35]:
# Write to files
# Each period gets its own pair of files; previously all three periods were
# written to the 03_07 paths, so only the 13_17 totals survived.
se_03_07_gain_total.to_csv("data/direct/03_07_gain_total.csv")
se_03_07_loss_total.to_csv("data/direct/03_07_loss_total.csv")

se_08_12_gain_total.to_csv("data/direct/08_12_gain_total.csv")
se_08_12_loss_total.to_csv("data/direct/08_12_loss_total.csv")

se_13_17_gain_total.to_csv("data/direct/13_17_gain_total.csv")
se_13_17_loss_total.to_csv("data/direct/13_17_loss_total.csv")

Note: There are now almost no moves whose origfips equals their destfips, likely because duplicate effdates were removed.\
Note 2: Records whose fips failed to geocode are kept.

In [36]:
# Compute matrices

def matrix(df_moves):
    """Build a square origin-by-destination table of move counts.

    Parameters
    ----------
    df_moves : pandas.DataFrame
        Must contain ``origfips`` and ``destfips`` columns, one row per move.

    Returns
    -------
    pandas.DataFrame
        Rows are origins and columns are destinations. Both axes carry the
        same labels in the same order: every distinct value of ``origfips``
        followed by any ``destfips`` values not seen yet. Cells hold the
        number of matching rows, 0 where there is none.
    """
    # Shared axis labels, in order of first appearance (origins first)
    fips_labels = pd.concat(
        [df_moves['origfips'], df_moves['destfips']]
    ).drop_duplicates()

    pair_counts = df_moves.groupby(['origfips', 'destfips']).size()
    wide_counts = pair_counts.unstack(fill_value=0)

    # Put the same labels on both axes so the diagonal means "stayed put"
    return wide_counts.reindex(
        index=fips_labels, columns=fips_labels, fill_value=0
    )

# One origin-by-destination count matrix per period
df_03_07_matrix = matrix(df_03_07_moves)
df_08_12_matrix = matrix(df_08_12_moves)
df_13_17_matrix = matrix(df_13_17_moves)

In [37]:
# Write to files
for df_period_matrix, csv_path in (
    (df_03_07_matrix, "data/direct/03_07_matrix.csv"),
    (df_08_12_matrix, "data/direct/08_12_matrix.csv"),
    (df_13_17_matrix, "data/direct/13_17_matrix.csv"),
):
    df_period_matrix.to_csv(csv_path)

In [38]:
# Compute high gains & losses

def high_moves(df_matrix):
    """Total the moves out of and into every fips, ignoring same-fips moves.

    Parameters
    ----------
    df_matrix : pandas.DataFrame
        Count matrix with origins as rows and destinations as columns, both
        axes in the same order (as produced by ``matrix``). Not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_high_loss, df_high_gain)``. ``df_high_loss`` holds the row
        sums and ``df_high_gain`` the column sums of the matrix with its
        diagonal zeroed. Each frame has a ``count`` column and a categorical
        ``type`` column: 'gain' if the fips is in ``se_gainers``, else
        'loss' if it is in ``se_losers``, else 'other'.
    """
    # Zero the diagonal on an explicit writable copy. `DataFrame.values` is
    # not guaranteed to be a writable view of the frame (it is read-only
    # under pandas Copy-on-Write and a detached copy for multi-block frames),
    # so writing through it can raise or silently do nothing.
    counts = df_matrix.to_numpy(copy=True)
    np.fill_diagonal(counts, 0)
    df_zeroed_matrix = pd.DataFrame(
        counts, index=df_matrix.index, columns=df_matrix.columns
    )

    def labelled_totals(se_totals):
        """Turn per-fips totals into a frame with 'count' and 'type' columns."""
        df_totals = se_totals.to_frame(name='count')
        # Label fips based on gainers & losers; np.select picks the first
        # matching condition, so a fips in both lists is labelled 'gain'
        conditions = [
            df_totals.index.isin(se_gainers),
            df_totals.index.isin(se_losers),
        ]
        df_totals['type'] = pd.Categorical(
            np.select(conditions, ['gain', 'loss'], default='other')
        )
        return df_totals

    # Sum across rows (moves out) and columns (moves in)
    df_high_loss = labelled_totals(df_zeroed_matrix.sum(axis=1))
    df_high_gain = labelled_totals(df_zeroed_matrix.sum())

    return df_high_loss, df_high_gain

# One (moves out, moves in) pair of frames per period
df_03_07_high_loss, df_03_07_high_gain = high_moves(df_03_07_matrix)
df_08_12_high_loss, df_08_12_high_gain = high_moves(df_08_12_matrix)
df_13_17_high_loss, df_13_17_high_gain = high_moves(df_13_17_matrix)

In [39]:
# Write to files
for df_high, csv_path in (
    (df_03_07_high_loss, "data/direct/03_07_out_of_high_loss.csv"),
    (df_03_07_high_gain, "data/direct/03_07_into_high_gain.csv"),
    (df_08_12_high_loss, "data/direct/08_12_out_of_high_loss.csv"),
    (df_08_12_high_gain, "data/direct/08_12_into_high_gain.csv"),
    (df_13_17_high_loss, "data/direct/13_17_out_of_high_loss.csv"),
    (df_13_17_high_gain, "data/direct/13_17_into_high_gain.csv"),
):
    df_high.to_csv(csv_path)

In [40]:
# Compute summaries

def summary(df_high_moves):
    """Collapse per-fips counts into loser, gainer and other totals.

    Parameters
    ----------
    df_high_moves : pandas.DataFrame
        Indexed by fips, with a ``count`` column.

    Returns
    -------
    pandas.DataFrame
        Single ``count`` column with rows ``from_loss`` (fips in
        ``se_losers``), ``from_gain`` (fips in ``se_gainers``) and
        ``from_other`` (the overall total minus the other two rows).
    """
    counts = df_high_moves['count']
    loss_total = counts[df_high_moves.index.isin(se_losers)].sum()
    gain_total = counts[df_high_moves.index.isin(se_gainers)].sum()

    # Sum each of gainers, losers, and other
    df_summary = pd.DataFrame(columns=['count'])
    df_summary.loc["from_loss"] = loss_total
    df_summary.loc["from_gain"] = gain_total
    df_summary.loc["from_other"] = counts.sum() - loss_total - gain_total

    return df_summary

# Summarise the moves-out and moves-in frames of each period
df_03_07_high_loss_summary = summary(df_03_07_high_loss)
df_03_07_high_gain_summary = summary(df_03_07_high_gain)

df_08_12_high_loss_summary = summary(df_08_12_high_loss)
df_08_12_high_gain_summary = summary(df_08_12_high_gain)

df_13_17_high_loss_summary = summary(df_13_17_high_loss)
df_13_17_high_gain_summary = summary(df_13_17_high_gain)

In [41]:
# Write to files
for df_period_summary, csv_path in (
    (df_03_07_high_loss_summary, "data/direct/03_07_out_of_high_loss_summary.csv"),
    (df_03_07_high_gain_summary, "data/direct/03_07_into_high_gain_summary.csv"),
    (df_08_12_high_loss_summary, "data/direct/08_12_out_of_high_loss_summary.csv"),
    (df_08_12_high_gain_summary, "data/direct/08_12_into_high_gain_summary.csv"),
    (df_13_17_high_loss_summary, "data/direct/13_17_out_of_high_loss_summary.csv"),
    (df_13_17_high_gain_summary, "data/direct/13_17_into_high_gain_summary.csv"),
):
    df_period_summary.to_csv(csv_path)