In [3]:
import csv
from datetime import datetime
import numpy as np
import pandas as pd


In [2]:
def period_assign(row):
    """Map a record's ``effdate`` to a period code.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row["effdate"]`` with a numeric value
        (presumably a YYYYMM integer -- confirm against the source data).

    Returns
    -------
    int
        2 for 201301 <= effdate < 201801,
        1 for 200801 <= effdate < 201301,
        0 for 200301 <= effdate < 200801,
        99 for anything else (including NaN, which fails every comparison).
    """
    effdate = row["effdate"]

    if 201301 <= effdate < 201801:
        return 2
    if 200801 <= effdate < 201301:
        return 1
    if 200301 <= effdate < 200801:
        return 0
    return 99


In [12]:
# Values of the "z4type" column to keep; rows with any other value are dropped.
z4types = ["S", "H", "R"]

status_file = "status_allmoves.txt"
source_file = "data/all_states.csv"
output_file = "data/allmoves_v1.csv"

# Flush to disk, report progress and checkpoint every this many source lines.
print_line_every = 100
# 1-based number of the first source line to process.
start_line = 1
process_records = 27408937
# Only used for the time-remaining estimate; the loop itself runs to end of file.
end_line = start_line + process_records

with open(status_file) as f:
    last_status_line = int(f.read())
# resume_from_status = input(
#     f"Type Yes to resume from last line processed in {status_file} which is: "
#     f"{last_status_line:,}\n"
# )
resume_from_status = "No"
if resume_from_status == "Yes":
    start_line = last_status_line + 1


def _moves_from_row(row):
    """Reshape one wide source row into a long frame, one record per address slot.

    Columns read from ``row``: 0 -> pid, 4 -> idate, 5 -> odate, and for each
    slot x in 0..9: 9*x+13 -> z4type, 9*x+14 -> effdate, 4*x+99 -> fips.
    Slots with an empty fips or a z4type outside ``z4types`` are dropped.

    Returns a DataFrame sorted by effdate with the columns
    address_observation, pid, idate, odate, z4type, effdate, fips, period,
    seentime, originfips, prev_effdate (in that order; the output file has no
    header, so the order matters).
    """
    slots = range(10)
    df = pd.DataFrame(
        {
            "address_observation": [x + 1 for x in slots],
            "pid": [row[0] for _ in slots],
            "idate": [row[4] for _ in slots],
            "odate": [row[5] for _ in slots],
            "z4type": [row[9 * x + 13] for x in slots],
            "effdate": [row[9 * x + 14] for x in slots],
            "fips": [row[4 * x + 99] for x in slots],
        }
    )

    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df[(df["fips"] != "") & df["z4type"].isin(z4types)].copy()

    df["effdate"] = pd.to_numeric(df["effdate"])
    df["idate"] = pd.to_numeric(df["idate"])
    df["odate"] = pd.to_numeric(df["odate"])
    df["period"] = df.apply(period_assign, axis=1, result_type="reduce")
    df["seentime"] = abs(df["odate"] - df["idate"])

    # TODO: Remove PO box addresses

    # After sorting by effdate, each record's origin is the previous record's fips.
    df = df.sort_values(by=["effdate"], ascending=True)
    df["originfips"] = df.fips.shift(1, fill_value="first record")
    df["prev_effdate"] = df.effdate.shift(1).astype(str)

    # NOTE(review): because of fill_value above, originfips is never missing,
    # so this drops nothing. Kept as-is; confirm whether the intent was to
    # drop each person's first record.
    df = df.dropna(axis=0, subset=["originfips"], how="any")
    return df


def _flush_batch(frames, last_line):
    """Append the buffered frames to ``output_file`` and checkpoint ``last_line``.

    ``frames`` is emptied in place. The output is opened in append mode with
    no header, so a non-resumed re-run adds to whatever the file already holds.
    """
    if frames:
        pd.concat(frames).to_csv(
            path_or_buf=output_file, mode="a", header=False, index=False
        )
        frames.clear()
    with open(status_file, "w") as writer:
        writer.write(str(last_line))


pending_frames = []
# Number of the last source line fully processed. It is what gets written to
# the status file, so that `start_line = last_status_line + 1` resumes exactly
# on the first unprocessed line.
line_count = start_line - 1
process_start = datetime.now()

with open(source_file, "r") as source_handle:
    for line_number, row in enumerate(csv.reader(source_handle), start=1):
        if line_number < start_line:
            # Already handled by an earlier run (only happens when resuming).
            continue

        pending_frames.append(_moves_from_row(row))
        line_count = line_number

        if line_count % print_line_every == 0:
            _flush_batch(pending_frames, line_count)

            process_duration = datetime.now() - process_start
            rate = print_line_every / (process_duration.total_seconds() / 60)
            estimated_completion = (end_line - line_count) / rate
            print(
                f"Finished: {line_count}, PID: {row[0]} at "
                f"{datetime.now().time()} | rate: {rate:.1f} per min. | this "
                f"batch complete: {estimated_completion:.1f} min"
            )

            process_start = datetime.now()

# The source rarely ends on a batch boundary: write the trailing partial batch
# too, otherwise up to print_line_every - 1 lines of results are lost.
if pending_frames:
    _flush_batch(pending_frames, line_count)


Finished: 100, PID: Y39398460938264 at 13:54:38.211734 | rate: 7104.7 per min. | this batch complete: 3857.9 min


In [None]:
# FIXME: convert data/allmoves.csv to data/method2/allmoves_w_header.csv
# Tract identifiers are read as text. Left to type inference, an all-digit
# column becomes integer and any leading zero is dropped before the file is
# written back out.
df = pd.read_csv(
    "data/method2/allmoves_w_header.csv",
    dtype={"fips": str, "originfips": str},
)
# TODO: Keep all moves in periods 1, 2, 3 that are also into/out of LA or Orange County
# NOTE(review): this keeps only the first row found for each pid -- verify
# that is the record the later steps are meant to analyse.
df.drop_duplicates(subset="pid", inplace=True)
df.to_csv("data/method2/allmoves_deduped.csv")


In [4]:
# Tract id -> category ("gain", "loss" or "other"), read from the categories file.
# Column 1 is used as the tract id; a "1" in column 5 marks "gain", otherwise a
# "1" in column 6 marks "loss". Every row is read, so a header row (if the file
# has one) ends up as an "other" entry.
fips_dict = {}

# The handle is deliberately not called `object`: binding that name at cell
# level would shadow the builtin for the rest of the kernel session.
# newline="" is what the csv module asks for when handing it a file object.
with open("fips_tracts_cats.csv", "r", newline="") as fips_file:
    for row in csv.reader(fips_file):
        if row[5] == "1":
            category = "gain"
        elif row[6] == "1":
            category = "loss"
        else:
            category = "other"
        fips_dict[row[1]] = category

gain_list = [k for k, v in fips_dict.items() if v == "gain"]
loss_list = [k for k, v in fips_dict.items() if v == "loss"]


In [None]:
# Read the tract identifiers as text. The category lists they are compared
# with come from csv.reader and are therefore strings; a column inferred as
# integer would never equal any of them (and would lose leading zeros).
df = pd.read_csv(
    "data/method2/allmoves_deduped.csv",
    dtype={"fips": str, "originfips": str},
)


In [None]:
# Period-0 rows are selected once and tallied per tract in a single pass,
# instead of re-filtering the whole frame for every tract and applying a
# mask built on `df` to an already-filtered subset.
period0 = df[df.period == 0]
arrivals_by_tract = period0["fips"].value_counts()
departures_by_tract = period0["originfips"].value_counts()

# Number of period-0 rows whose fips is each gain tract (0 when absent).
gain_dict = {i: int(arrivals_by_tract.get(i, 0)) for i in gain_list}

print("Loss List Time")
# Number of period-0 rows whose originfips is each loss tract (0 when absent).
loss_dict = {i: int(departures_by_tract.get(i, 0)) for i in loss_list}


# XXX: output file
with open("data/losstotal_p0.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(loss_dict.items())

# XXX: output file
with open("data/gaintotal_p0.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(gain_dict.items())


In [None]:
# Origin -> destination move counts for period 0, as a square matrix whose
# rows are origins (originfips) and whose columns are destinations (fips).
df1 = df[df.period == 0]

origins = df1.originfips.unique().tolist()
destinations = df1.fips.unique().tolist()

focus_fips = loss_list + gain_list
all_fips = origins + destinations

# `origins` and `destinations` are each duplicate-free, so a label occurs in
# all_fips once or twice. Set membership gives the same tally as list.count
# without rescanning the list for every element; dict insertion order keeps
# the first-seen ordering of the labels.
origin_set = set(origins)
destination_set = set(destinations)
my_dict = {i: (i in origin_set) + (i in destination_set) for i in all_fips}
unique_fips = list(my_dict.keys())


# Count each (origin, destination) pair once with groupby instead of
# incrementing a cell per row, then expand to the full square layout.
# Pairs that never occur are 0. rename_axis clears the axis names left by
# unstack so the CSV header is the same as for a plain unnamed frame.
# Rows with a missing origin or destination are not counted (groupby skips
# missing keys).
matrix = (
    df1.groupby(["originfips", "fips"])
    .size()
    .unstack(fill_value=0)
    .reindex(index=unique_fips, columns=unique_fips, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# XXX: output file
matrix.to_csv("data/matrix_p0.csv")
print(matrix.shape)
print(np.count_nonzero(matrix.index.duplicated("first")))


In [None]:
# Read everything as text first so the tract labels in the index keep their
# exact spelling, then turn the count columns back into numbers.
matrix = pd.read_csv("data/matrix_p0.csv", dtype=str, index_col=0, header=0)

# Convert every column the file actually contains (no dependency on a list
# built in another cell). A non-numeric cell now raises here instead of being
# silently left as text.
matrix = matrix.apply(pd.to_numeric)

# --- Moves out of loss tracts -------------------------------------------------
# Rows of `matrix` are origins, so this keeps the moves that start in a loss
# tract. .copy() makes the zeroing below act on an independent frame.
df1 = matrix[matrix.index.isin(loss_list)].copy()

# Ignore moves that end in a loss tract as well (the whole loss x loss block).
# One assignment is enough; it does not depend on any particular row.
df1.loc[df1.index, df1.index] = 0

df1totals = df1.sum(axis=1).astype(int)
# XXX: output file
df1totals.to_csv("data/out_of_high_loss_p0.csv")


# --- Moves involving gain tracts ----------------------------------------------
# NOTE(review): rows of `matrix` are origins, so this selects moves whose
# ORIGIN is a gain tract and, after the transpose, indexes them by
# destination. The output names ("into_high_gain", "from_*") suggest the
# opposite direction -- confirm which one is intended. Left unchanged here.
df2 = matrix[matrix.index.isin(gain_list)].transpose().copy()

# Zero the moves from a gain tract to itself. The tracts to visit are df2's
# columns (the gain tracts); iterating the loss frame here never touched an
# existing cell and only appended empty columns.
for tract in df2.columns:
    df2.loc[tract, tract] = 0

df2totals = df2.sum(axis=1).astype(int).to_frame(name="count")

# Label each row with its tract category; tracts missing from fips_dict are
# "other" rather than an empty value.
df2totals["type"] = [fips_dict.get(tract, "other") for tract in df2totals.index]

# XXX: output file
df2totals.to_csv("data/into_high_gain_p0.csv")


from_loss = df2totals.loc[df2totals.index.isin(loss_list), "count"].sum()
from_gain = df2totals.loc[df2totals.index.isin(gain_list), "count"].sum()
from_other = df2totals["count"].sum() - from_loss - from_gain

dict_counts = {"from_loss": from_loss, "from_gain": from_gain, "from_other": from_other}
# XXX: output file
with open("data/into_high_gain_summary_p0.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerows(dict_counts.items())
