In [4]:
from glob import glob
import numpy as np
import pandas as pd
from pathlib import Path

def dont_dropna(df_areas):
    """Pair every area record with the preceding record of the same index label.

    Only the ``effdate`` and ``fips`` columns of ``df_areas`` are used; they
    are returned as ``date_left`` and ``dest_fips``. The values of the
    previous row sharing the same index label are added as ``date_arrived``
    and ``orig_fips``. No rows are dropped, so the first row of each index
    label has missing ``date_arrived``/``orig_fips``.

    The input frame is not modified.
    """
    renames = {'effdate': 'date_left', 'fips': 'dest_fips'}
    moves = df_areas[['effdate', 'fips']].rename(columns=renames)

    # Shift inside each index label so a row never picks up the record of a
    # different label.
    previous = moves.groupby(moves.index)[['date_left', 'dest_fips']].shift()
    moves['date_arrived'] = previous['date_left']
    moves['orig_fips'] = previous['dest_fips']
    return moves

def preshift_dropna(df_areas):
    """Link consecutive area records after discarding incomplete ones.

    Rows of ``df_areas`` with a missing ``effdate`` or ``fips`` are removed
    first, so each remaining row is paired with the previous *complete* row
    of the same index label. The result has the columns ``date_left``,
    ``dest_fips``, ``date_arrived`` and ``orig_fips``; the first remaining
    row of each index label has missing ``date_arrived``/``orig_fips``.

    The input frame is not modified.
    """
    renames = {'effdate': 'date_left', 'fips': 'dest_fips'}
    # dropna before shifting
    complete = df_areas[['effdate', 'fips']].dropna()
    moves = complete.rename(columns=renames)

    previous = moves.groupby(moves.index)[['date_left', 'dest_fips']].shift()
    moves['date_arrived'] = previous['date_left']
    moves['orig_fips'] = previous['dest_fips']
    return moves

def postshift_dropna(df_areas):
    """Link consecutive area records, then keep only fully populated moves.

    Each row of ``df_areas`` (``effdate``/``fips`` columns only) is paired
    with the previous row of the same index label, producing the columns
    ``date_left``, ``dest_fips``, ``date_arrived`` and ``orig_fips``. Any
    row with a missing value in one of those four columns is then removed,
    which includes the first row of every index label.

    The input frame is not modified.
    """
    renames = {'effdate': 'date_left', 'fips': 'dest_fips'}
    moves = df_areas[['effdate', 'fips']].rename(columns=renames)

    previous = moves.groupby(moves.index)[['date_left', 'dest_fips']].shift()
    moves['date_arrived'] = previous['date_left']
    moves['orig_fips'] = previous['dest_fips']

    # dropna after shifting
    return moves.dropna()

In [6]:
### Options ###
# Number of rows read from data/all_states.csv per chunk. The value is also
# embedded in the names of the status file and of the intermediate folders.
lines_per_chunk = 1_250_000
# z4type values an area must have to be kept when chunks are filtered.
z4types = ('H', 'S', 'R')
# Function used to link consecutive areas into moves: dont_dropna,
# preshift_dropna or postshift_dropna.
create_moves = dont_dropna
# Values compared against characters 2-4 of a fips string (fips[2:5]); a move
# is kept when its origin or its destination matches one of them.
county_codes = ("037", "059")

In [9]:
### Extract all moves ###

# Settings
# The first area uses unsuffixed 'z4type'/'effdate' columns (but 'fips1');
# areas 2..10 carry their number as a suffix on all three columns.
later_areas = range(2, 11)

# Columns to read: z4type/effdate pairs first, then every fips column.
usecols_all_states = [
    'z4type',
    'effdate',
    *(
        f'{field}{n}'
        for n in later_areas
        for field in ('z4type', 'effdate')
    ),
    *(f'fips{n}' for n in range(1, 11)),
]

# Parsing dtype for each column that is read.
dtype_all_states = {'z4type': 'category', 'effdate': 'float', 'fips1': 'object'}
for n in later_areas:
    dtype_all_states.update(
        {
            f'z4type{n}': 'category',
            f'effdate{n}': 'float',
            f'fips{n}': 'object',
        }
    )

# One mapping per area, from its source column names to the common names
# 'z4type' / 'effdate' / 'fips'.
list_dict_col_names = [
    {'z4type': 'z4type', 'effdate': 'effdate', 'fips1': 'fips'},
    *(
        {
            f'z4type{n}': 'z4type',
            f'effdate{n}': 'effdate',
            f'fips{n}': 'fips',
        }
        for n in later_areas
    ),
]


# Load all_states
# Because chunksize is given, read_csv returns a reader that yields one
# DataFrame per chunk instead of loading the whole file. The reader is
# single-use: once it has been iterated, re-run this statement before
# iterating again.
it_all_states = pd.read_csv(
    "data/all_states.csv",
    chunksize=lines_per_chunk,
    usecols=usecols_all_states,
    dtype=dtype_all_states,
)

In [10]:
# Create status file
# touch() creates the file on the first run and leaves an existing file
# untouched, so the read below never has to special-case a missing file.
status_path = Path(f"inter_status-{lines_per_chunk}.txt")
status_path.touch(exist_ok=True)

# The file holds one finished chunk index per line. Blank lines are skipped
# so a stray newline cannot abort the run, and the with-block closes the
# handle even if a line fails to parse.
with status_path.open() as f:
    finished_chunks = {int(line) for line in f if line.strip()}

# Create chunk folders
# One output folder for the unfiltered-by-period moves and one per period;
# existing folders are left as they are.
chunk_folders = (
    f"data/inter/moves-{lines_per_chunk}",
    f"data/inter/08_17_moves-{lines_per_chunk}",
    f"data/inter/03_12_moves-{lines_per_chunk}",
    f"data/inter/03_17_moves-{lines_per_chunk}",
)
for chunk_folder in chunk_folders:
    Path(chunk_folder).mkdir(parents=True, exist_ok=True)

In [11]:
# Process all states

def process_chunk(index, chunk):
    """Extract moves from one chunk of all_states and write them to disk.

    Steps:
      1. Split the wide chunk (up to ten areas per row) into one long frame
         with the columns ``z4type``, ``effdate`` and ``fips``.
      2. Per (row, effdate), keep the first non-null fips among areas whose
         z4type is in ``z4types``. Effdates with no such area are kept with
         an empty-string fips.
      3. Link consecutive areas of a row into moves with ``create_moves``.
      4. Keep moves whose origin or destination matches ``county_codes``
         and write them, plus three period-specific subsets, as CSV files
         named ``{index}.csv`` in the intermediate folders.
      5. Append ``index`` to the status file.

    Reads the module-level ``finished_chunks``, ``list_dict_col_names``,
    ``z4types``, ``create_moves``, ``county_codes`` and ``lines_per_chunk``.

    Parameters
    ----------
    index : int
        Position of the chunk in the input; used for the skip check and the
        output file names.
    chunk : pandas.DataFrame
        Chunk containing every column named in ``list_dict_col_names``.

    Returns
    -------
    str
        ``"Chunk {index} done"``, both when the chunk was processed and when
        it was skipped because it was already listed as finished.
    """
    if index in finished_chunks:
        return f"Chunk {index} done"

    # Split rows into individual areas
    list_df_all_areas = [
        chunk[list(col_names)].rename(columns=col_names)
        for col_names in list_dict_col_names
    ]

    # Recombine all areas + sort by effdate.
    # The stable sort keeps the left-to-right area order within equal
    # effdates, which the "first" selection below relies on.
    # z4type is cast to object because concatenating categorical columns
    # that happen to share identical categories stays categorical, and the
    # 'empty' sentinel written below would then be rejected as an unknown
    # category.
    df_all_areas = (
        pd.concat(list_df_all_areas)
            .dropna(subset=['effdate'])
            .sort_values('effdate', kind='stable')
            .astype({'z4type': 'object'})
    )

    # All effdates that do not have a z4type in z4types
    bi_all_dropped = ~(
        df_all_areas['z4type'].isin(z4types)
            .groupby([df_all_areas.index, df_all_areas['effdate']])
            .transform('any')
    )

    # Change values so selected effdates are not removed
    df_all_areas.loc[bi_all_dropped, 'z4type'] = 'empty'
    df_all_areas.loc[bi_all_dropped, 'fips'] = ""

    # Filter by Zip+4 type
    z4types_mask = (*z4types, 'empty')

    df_filtered_areas = df_all_areas[df_all_areas['z4type'].isin(z4types_mask)]

    # Choose leftmost fips of each effdate.
    # NOTE(review): GroupBy.first() skips missing values per column, so this
    # is the leftmost *non-null* fips of the group - confirm that is intended.
    df_filtered_areas = (
        df_filtered_areas.groupby([df_filtered_areas.index, 'effdate']).first()
    )
    df_filtered_areas = df_filtered_areas.reset_index('effdate')

    # Link previous & next areas as moves
    df_all_moves = create_moves(df_filtered_areas)

    # Filter by county code.
    # NOTE(review): only characters 2-4 of the fips are compared; if the
    # leading two characters are a state code, equal county codes in other
    # states match as well - confirm against the data.
    df_filtered_moves = df_all_moves[
        df_all_moves['orig_fips'].str[2:5].isin(county_codes) |
            df_all_moves['dest_fips'].str[2:5].isin(county_codes)
    ]

    df_filtered_moves.to_csv(
        f"data/inter/moves-{lines_per_chunk}/{index}.csv", index=False
    )

    # Separate moves
    def extract(arrived_start, arrived_stop, left_start, left_stop):
        """Moves that arrived in [arrived_start, arrived_stop) and left in
        [left_start, left_stop), reduced to the two fips columns."""
        arrived_in_range = df_filtered_moves['date_arrived'].between(
            arrived_start, arrived_stop, inclusive='left'
        )
        left_in_range = df_filtered_moves['date_left'].between(
            left_start, left_stop, inclusive='left'
        )
        return df_filtered_moves[arrived_in_range & left_in_range][
            ['orig_fips', 'dest_fips']
        ]

    df_08_17_moves = extract(200801, 201301, 201301, 201801)
    df_03_12_moves = extract(200301, 200801, 200801, 201301)
    df_03_17_moves = extract(200301, 200801, 201301, 201801)

    # Write to files
    df_08_17_moves.to_csv(
        f"data/inter/08_17_moves-{lines_per_chunk}/{index}.csv", index=False
    )
    df_03_12_moves.to_csv(
        f"data/inter/03_12_moves-{lines_per_chunk}/{index}.csv", index=False
    )
    df_03_17_moves.to_csv(
        f"data/inter/03_17_moves-{lines_per_chunk}/{index}.csv", index=False
    )

    # Update status (only after every output of the chunk has been written)
    with open(f"inter_status-{lines_per_chunk}.txt", 'a') as status_file:
        status_file.write(f"{index}\n")

    return f"Chunk {index} done"


# Chunks are numbered in read order; that number is what process_chunk
# checks against the finished set and uses for the output file names.
for chunk_index, df_chunk in enumerate(it_all_states):
    process_chunk(chunk_index, df_chunk)

In [10]:
### Load all moves ###

def read_chunks(period):
    """Load every per-chunk CSV of one period into a single DataFrame.

    Parameters
    ----------
    period : str
        Folder prefix under ``data/inter`` (for example ``"08_17_moves"``).
        The module-level ``lines_per_chunk`` is appended to form the folder
        name.

    Returns
    -------
    pandas.DataFrame
        Rows of all chunk files, stacked in sorted file-name order (string
        order, so ``10.csv`` comes before ``2.csv``). ``orig_fips`` and
        ``dest_fips`` are read with object dtype. Each file's own row index
        is kept, so index labels repeat across files.

    Raises
    ------
    ValueError
        If no CSV file is found for ``period``.
    """
    # Settings
    dtype_all_moves = {
        'orig_fips': 'object',
        'dest_fips': 'object',
    }

    pattern = f"data/inter/{period}-{lines_per_chunk}/*.csv"

    # glob() returns matches in arbitrary order; sorting makes the row order
    # of the result reproducible between runs and machines.
    chunk_files = sorted(glob(pattern))
    if not chunk_files:
        # pd.concat would raise an opaque "No objects to concatenate";
        # keep the exception type but say what is actually missing.
        raise ValueError(
            f"No chunk files match {pattern!r}; run the extraction step first"
        )

    # Read files
    df_moves = pd.concat(
        [
            pd.read_csv(chunk_file, dtype=dtype_all_moves)
            for chunk_file in chunk_files
        ]
    )

    return df_moves


# Load the three periods in the same order as before.
df_08_17_moves, df_03_12_moves, df_03_17_moves = map(
    read_chunks,
    ("08_17_moves", "03_12_moves", "03_17_moves"),
)

In [11]:
### Data analysis ###

def areas_stayed_count(df_moves):
    """Count the rows of ``df_moves`` per ``orig_fips`` value.

    Returns a Series with one entry per distinct, non-missing ``orig_fips``
    whose index is named ``fips``.
    """
    return df_moves.groupby('orig_fips').size().rename_axis('fips')

# Per-area counts for each period, computed in the same order as before.
se_08_17_count, se_03_12_count, se_03_17_count = map(
    areas_stayed_count,
    (df_08_17_moves, df_03_12_moves, df_03_17_moves),
)

In [12]:
# Write each period's per-area counts to its own CSV file.
areas_stayed_outputs = (
    ("data/inter/08_17_areas_stayed.csv", se_08_17_count),
    ("data/inter/03_12_areas_stayed.csv", se_03_12_count),
    ("data/inter/03_17_areas_stayed.csv", se_03_17_count),
)
for csv_path, se_period_count in areas_stayed_outputs:
    se_period_count.to_csv(csv_path)

In [15]:
df_summary = pd.DataFrame(columns=['count'])

# NOTE(review): Series.size is the number of entries in each count series
# (one per distinct origin fips), not the sum of the counts - confirm that
# "total" is meant to be the number of areas.
summary_rows = (
    ("08_17_total", se_08_17_count),
    ("03_12_total", se_03_12_count),
    ("03_17_total", se_03_17_count),
)
for row_label, se_period in summary_rows:
    df_summary.loc[row_label] = se_period.size

In [None]:
# Persist the summary table; the row labels are written as the first column.
df_summary.to_csv("data/inter/summary.csv")