In [1]:
import numpy as np
import pandas as pd
import sys
import os
from datetime import datetime

In [2]:
# Resolve the project root (the "roi-prediction" folder) from the current
# working directory so data paths work wherever the notebook is launched.
# os.sep is the separator os.path.abspath() actually emits on this platform;
# a linux-only check would pick "\\" on macOS/BSD, where the split below
# would then never find the project folder.
pathNav = os.sep

_cwd_parts = os.path.abspath("").split(pathNav)
if "roi-prediction" not in _cwd_parts:
    raise ValueError(
        "Cannot locate the 'roi-prediction' project folder in the current "
        "working directory: " + os.path.abspath("")
    )

# keep every path component up to and including the project folder
idx = _cwd_parts.index("roi-prediction") + 1
base_path = pathNav.join(_cwd_parts[:idx])

In [3]:
def read_dataframe_from_folder(parent_path, file_name):
    """Load a CSV stored under the project root into a DataFrame.

    Parameters
    ----------
    parent_path : str
        Folder, relative to ``base_path``, that contains the file.
    file_name : str
        Name of the CSV file inside ``parent_path``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    csv_location = pathNav.join([base_path, parent_path, file_name])
    return pd.read_csv(csv_location)

In [4]:
def write_dataframe_to_folder(df, file_name, destination_path):
    """Write ``df`` as a UTF-8, comma-separated CSV under the project root.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; the index is not written, the header row is.
    file_name : str
        Name of the CSV file to create or overwrite.
    destination_path : str
        Folder, relative to ``base_path``, to write into. It is created
        (including missing parents) when it does not exist yet.
    """
    target_dir = base_path + pathNav + destination_path

    # exist_ok=True replaces a separate os.path.exists() check, which could
    # race with another process creating the same folder in between.
    os.makedirs(target_dir, exist_ok=True)

    df.to_csv(target_dir + pathNav + file_name, sep=",", index=False, encoding="utf-8", header=True)

In [5]:
def sort_dataframe_on_column(df, target_columns, groupby_column=None):
    """Return a sorted copy of ``df`` with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sort; it is not modified.
    target_columns : label or list of labels
        Column(s) to sort by, ascending.
    groupby_column : label or list of labels, optional
        When given, rows are ordered by these column(s) first so that each
        group is contiguous, then by ``target_columns`` inside each group.

    Returns
    -------
    pandas.DataFrame
        Sorted frame with the original columns in their original order.
    """
    if pd.api.types.is_list_like(target_columns):
        sort_keys = list(target_columns)
    else:
        sort_keys = [target_columns]

    if groupby_column is not None:
        if pd.api.types.is_list_like(groupby_column):
            group_keys = list(groupby_column)
        else:
            group_keys = [groupby_column]
        # Sorting on the group keys first gives the grouped ordering directly.
        # A groupby/apply round trip followed by dropping a "level_0" column
        # fails here, because the unnamed index level is restored as "level_1".
        sort_keys = group_keys + [key for key in sort_keys if key not in group_keys]

    # drop=True discards the old index instead of materialising it as helper
    # columns that would have to be removed again afterwards.
    return df.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)

In [6]:
def pad_column_with_zeros(df, target_column, desired_length):
    """Left-pad one column with zeros, converting its values to strings.

    The column is overwritten on ``df`` itself; the same frame object is
    returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    target_column : label
        Column whose values are stringified and padded.
    desired_length : int
        Minimum width; longer values are left as they are.

    Returns
    -------
    pandas.DataFrame
        ``df`` (the same object that was passed in).
    """
    as_text = df[target_column].astype(str)
    df[target_column] = as_text.str.zfill(desired_length)
    return df

In [7]:
def clean_fips(df, target_column):
    """Strip double quotes from a column and zero-pad it to five characters.

    The column is overwritten on ``df`` itself; the same frame object is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    target_column : label
        Column to clean.

    Returns
    -------
    pandas.DataFrame
        ``df`` (the same object that was passed in).
    """
    as_text = df[target_column].astype(str)
    unquoted = as_text.str.replace('"', '')
    df[target_column] = unquoted.str.zfill(5)
    return df

In [8]:
def rename_columns(df, target_columns, new_values):
    """Return a copy of ``df`` with selected columns renamed.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    target_columns : iterable of labels
        Current column names.
    new_values : iterable of labels
        Replacement names, paired with ``target_columns`` by position.
        Pairing stops at the shorter of the two iterables.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns.
    """
    mapping = {old_name: new_name for old_name, new_name in zip(target_columns, new_values)}
    return df.rename(columns=mapping)

In [9]:
def left_join_dataframe(df_1, df_2, columns):
    """Left-join ``df_2`` onto ``df_1``.

    Parameters
    ----------
    df_1 : pandas.DataFrame
        Left frame; every one of its rows is kept.
    df_2 : pandas.DataFrame
        Right frame providing the additional columns.
    columns : label or list of labels
        Key column(s) present in both frames.

    Returns
    -------
    pandas.DataFrame
        New frame; rows of ``df_1`` without a match in ``df_2`` carry
        missing values in the columns that come from ``df_2``.
    """
    # pd.merge defaults to how="inner", which would silently drop unmatched
    # rows of df_1; a left join has to be requested explicitly.
    return pd.merge(df_1, df_2, on=columns, how="left")

In [10]:
# Load every input dataset from the "datasets_filtered" folder of the project.
_INPUT_DIR = "datasets_filtered"

df_county_gdp = read_dataframe_from_folder(_INPUT_DIR, "county_gdp.csv")
df_population = read_dataframe_from_folder(_INPUT_DIR, "demographic.csv")
df_zhvi = read_dataframe_from_folder(_INPUT_DIR, "zhvi.csv")
df_FIPS = read_dataframe_from_folder(_INPUT_DIR, "FIPS_code.csv")
df_median_income = read_dataframe_from_folder(_INPUT_DIR, "median_income.csv")
df_unemployment_rate = read_dataframe_from_folder(_INPUT_DIR, "unemployment.csv")
df_property_tax_rate = read_dataframe_from_folder(_INPUT_DIR, "property_tax.csv")
df_mobile_broadband = read_dataframe_from_folder(_INPUT_DIR, "mobile_broadband.csv")

In [11]:
# Bring geoID to a five-character, zero-padded string in each frame.
# county_gdp goes through clean_fips, which also removes double quotes.
df_county_gdp = clean_fips(df_county_gdp, "geoID")
df_FIPS, df_zhvi, df_unemployment_rate, df_mobile_broadband, df_population = (
    pad_column_with_zeros(frame, "geoID", 5)
    for frame in (df_FIPS, df_zhvi, df_unemployment_rate, df_mobile_broadband, df_population)
)

# Re-parse geoID as a number for these two frames; values that cannot be
# parsed become NaN because of errors="coerce".
df_mobile_broadband["geoID"] = pd.to_numeric(df_mobile_broadband["geoID"], errors="coerce")
df_median_income["geoID"] = pd.to_numeric(df_median_income["geoID"], errors="coerce")

# Give the generic "estimate" column a dataset-specific name.
df_population = df_population.rename(columns={"estimate": "population"})
df_median_income = df_median_income.rename(columns={"estimate": "median_income"})

In [12]:
# Reshape GDP from one column per year (2013 through 2023) into one row per
# (geoID, name, year).
gdp_year_columns = list(map(str, range(2013, 2024)))
df_county_gdp_yearly = df_county_gdp.melt(
    id_vars=["geoID", "name"],
    value_vars=gdp_year_columns,
    var_name="year",
    value_name="gdp",
)
# the melted column labels are strings; store the year as an integer
df_county_gdp_yearly = df_county_gdp_yearly.astype({"year": int})

# order rows by county, then chronologically
df_county_gdp_yearly = sort_dataframe_on_column(
    df_county_gdp_yearly,
    target_columns=["geoID", "year"],
)

In [13]:
# Reshape ZHVI from one column per date to long form, then average the
# values of each calendar year per geoID.
zhvi_date_columns = [label for label in df_zhvi.columns if label.startswith("20")]
zhvi_long = df_zhvi.melt(
    id_vars=["geoID", "RegionName"],
    value_vars=zhvi_date_columns,
    var_name="date",
    value_name="zhvi",
)
zhvi_long["year"] = pd.to_datetime(zhvi_long["date"]).dt.year

df_zhvi_yearly = (
    zhvi_long
    .groupby(["geoID", "year"])["zhvi"]
    .mean()
    .reset_index()
)

In [14]:
# Derive the calendar year of each observation, then average the
# unemployment rate per geoID and year.
observation_dates = pd.to_datetime(df_unemployment_rate["date"])
df_unemployment_rate["year"] = observation_dates.dt.year

df_unemployment_rate_yearly = (
    df_unemployment_rate
    .groupby(["geoID", "year"])["unemployment_rate"]
    .mean()
    .reset_index()
)

In [15]:
# Make the merge keys comparable across frames: geoID as a whitespace-free
# string everywhere, year as an integer in the yearly frames.
yearly_frames = (df_zhvi_yearly, df_unemployment_rate_yearly, df_county_gdp_yearly)
static_frames = (df_property_tax_rate, df_mobile_broadband, df_median_income, df_population)

for df in yearly_frames:
    df["geoID"] = df["geoID"].astype(str).str.strip()
    df["year"] = df["year"].astype(int)

# NOTE(review): a geoID column previously parsed with
# pd.to_numeric(errors="coerce") is float if any value was coerced to NaN;
# its string form would then look like "1001.0" and padding would not
# restore a five-digit code - confirm the inputs contain no such values.
for df in static_frames:
    df["geoID"] = df["geoID"].astype(str).str.strip()
    df = pad_column_with_zeros(df, "geoID", 5)

In [17]:
# Combine the per-county (non time-series) attributes, keyed on geoID.
# Left joins keep every row of the property-tax frame.
df_static = (
    df_property_tax_rate[["geoID", "tax_rate"]]
    .merge(df_mobile_broadband[["geoID", "4g_st_pct"]], on="geoID", how="left")
    .merge(df_median_income[["geoID", "median_income"]], on="geoID", how="left")
    .merge(df_population[["geoID", "population"]], on="geoID", how="left")
)

In [18]:
# Combine the yearly series on (geoID, year). Outer joins keep a county-year
# that appears in any one of the three inputs.
yearly_keys = ["geoID", "year"]
df_time_series = (
    df_zhvi_yearly
    .merge(df_unemployment_rate_yearly, on=yearly_keys, how="outer")
    .merge(df_county_gdp_yearly, on=yearly_keys, how="outer")
)

In [19]:
# Attach the yearly series to the static county attributes; the left join
# keeps every geoID of df_static.
df_all_data = df_static.merge(df_time_series, on="geoID", how="left")

In [20]:
# ROI: percentage change of ZHVI from the previous row of the same geoID,
# with gaps in ZHVI forward-filled first.
# NOTE(review): assumes rows are in chronological order within each geoID and
# that consecutive rows are one year apart - confirm.
by_county = df_all_data.groupby("geoID")

zhvi_growth = by_county["zhvi"].transform(
    lambda series: series.ffill().pct_change(fill_method=None)
)
gdp_previous_row = by_county["gdp"].shift(1)
unemployment_previous_row = by_county["unemployment_rate"].shift(1)

df_all_data["roi"] = zhvi_growth * 100

# previous row's GDP and unemployment rate within the same geoID
df_all_data["gdp_last_year"] = gdp_previous_row
df_all_data["unemployment_rate_last_year"] = unemployment_previous_row

In [21]:
# Exclude the counties listed below (noted as having no unemployment-rate
# values).
problem_counties = ['Petersburg Borough, AK*', 'LaSalle, LA', 'Doña Ana, NM']
# .copy() makes the filtered frame independent of the frame it was taken
# from, so the column assignments below are unambiguous and do not trigger
# SettingWithCopyWarning.
df_all_data = df_all_data[~df_all_data['name'].isin(problem_counties)].copy()

# Impute missing values within each geoID:
#   gdp / zhvi         -> carry the last known value forward
#   unemployment_rate  -> interpolate, then fill what is left with the
#                         group's mean
df_all_data["gdp"] = df_all_data.groupby("geoID")["gdp"].ffill()
df_all_data["zhvi"] = df_all_data.groupby("geoID")["zhvi"].ffill()
df_all_data["unemployment_rate"] = df_all_data.groupby("geoID")["unemployment_rate"].transform(lambda x: x.interpolate().fillna(x.mean()))

# rows without an ROI value are removed
df_all_data = df_all_data.dropna(subset=["roi"])

# drop rows that have neither a county name nor a GDP value
df_all_data = df_all_data.dropna(subset=["name", "gdp"], how="all").copy()

df_all_data["year"] = df_all_data["year"].astype(int)

In [22]:
# Persist the merged, cleaned dataset next to the filtered inputs.
write_dataframe_to_folder(
    df=df_all_data,
    file_name="preprocessed_dataset.csv",
    destination_path="datasets_filtered",
)