In [None]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras import regularizers
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Concatenate
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import joblib

In [None]:
# Resolve the project root so data paths work wherever the notebook is launched.
# os.sep is the separator that os.path.abspath() actually emits on this platform
# ("/" on Linux/macOS/BSD, "\\" on Windows). Keying off sys.platform == "linux"
# alone handed every other POSIX system the Windows separator, so the split below
# could never find the project folder there.
pathNav = os.sep

# Keep every path component up to and including the "roi-prediction" folder.
path_parts = os.path.abspath("").split(pathNav)
if "roi-prediction" not in path_parts:
    # Same exception type list.index() would raise, but with an actionable message.
    raise ValueError(
        "Could not locate a 'roi-prediction' directory in the working path: "
        + os.path.abspath("")
    )

idx = path_parts.index("roi-prediction") + 1
base_path = pathNav.join(path_parts[:idx])

In [None]:
def read_dataframe_from_folder(parent_path, file_name):
    """Read a CSV located under the project root into a DataFrame.

    The path is built as ``base_path``, ``parent_path`` and ``file_name``
    joined with the notebook-level ``pathNav`` separator.

    Args:
        parent_path: Folder (relative to ``base_path``) containing the file.
        file_name: Name of the CSV file inside ``parent_path``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that path.
    """
    csv_path = pathNav.join((base_path, parent_path, file_name))
    return pd.read_csv(csv_path)

In [None]:
# sort dataframe
def sort_dataframe_on_column(df, target_columns, groupby_column=None):
    """Return a sorted copy of ``df`` with a fresh 0..n-1 index.

    Args:
        df: DataFrame to sort. It is not modified.
        target_columns: Column label, or list/tuple of labels, to sort by
            (ascending).
        groupby_column: Optional column label, or list/tuple of labels. When
            given, rows are ordered by these key(s) first and by
            ``target_columns`` within each key, so each group's rows are
            contiguous.

    Returns:
        A new DataFrame with the same columns as ``df`` (in the same order)
        and a default RangeIndex.
    """
    def _as_label_list(labels):
        # Accept either a single label or a list/tuple of labels.
        return list(labels) if isinstance(labels, (list, tuple)) else [labels]

    sort_keys = _as_label_list(target_columns)
    if groupby_column is not None:
        group_keys = _as_label_list(groupby_column)
        # Group keys lead the sort; skip duplicates so no label is listed twice.
        sort_keys = group_keys + [key for key in sort_keys if key not in group_keys]

    # drop=True discards the old index directly. The previous implementation
    # materialised it as "index"/"level_0" columns and dropped them by name,
    # which raised KeyError whenever groupby_column was supplied (the leftover
    # column is "level_1" in that case).
    return df.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)

In [None]:
def pad_column_with_zeros(df, target_column, desired_length):
    """Left-pad one column's values with zeros, in place.

    The column is cast to ``str`` and each value is zero-filled on the left
    up to ``desired_length`` characters.

    Args:
        df: DataFrame whose column is overwritten (mutated in place).
        target_column: Label of the column to pad.
        desired_length: Minimum width of each resulting string.

    Returns:
        The same ``df`` object, for call chaining.
    """
    as_text = df[target_column].astype(str)
    df[target_column] = as_text.str.zfill(desired_length)
    return df

In [None]:
def plot_training_curves(history):
    """Plot training vs. validation loss and MAE side by side.

    Args:
        history: Object exposing a ``history`` mapping with the keys
            ``'loss'``, ``'val_loss'``, ``'mae'`` and ``'val_mae'``, each a
            per-epoch sequence (e.g. the return value of ``model.fit``).

    Draws a 1x2 figure (loss on the left, MAE on the right) and shows it.
    """
    recorded = history.history
    loss = recorded['loss']
    val_loss = recorded['val_loss']
    mae = recorded['mae']
    val_mae = recorded['val_mae']
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(loss) + 1)

    # (subplot position, train series, val series, train label, val label, title, y label)
    panels = (
        (1, loss, val_loss, 'Training Loss', 'Validation Loss', 'Loss Curves', 'Loss'),
        (2, mae, val_mae, 'Training MAE', 'Validation MAE', 'MAE Curves', 'MAE'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_values, val_values, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b-', label=train_label)
        plt.plot(epochs, val_values, 'r-', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Load the preprocessed dataset, order it by county and year, then zero-pad
# geoID to 5 characters (the padding step converts the column to strings).
df_raw = read_dataframe_from_folder("datasets_filtered", "preprocessed_dataset.csv")
df_sorted = sort_dataframe_on_column(df_raw, ["geoID", "year"])
df = pad_column_with_zeros(df_sorted, "geoID", 5)

# static features only for the most recent years
is_latest_year = df["year"] == 2023
df_static = df.loc[is_latest_year, ["geoID", "tax_rate", "4g_st_pct", "median_income", "population"]]

In [None]:
# window sequences
# Each training sample is: `years_of_window` consecutive rows of time-series
# features for one county, that county's static features, and the ROI of the
# row immediately after the window as the target.
time_series = ["gdp", "unemployment_rate", "zhvi", "gdp_last_year", "unemployment_rate_last_year"]
static_series = ["tax_rate", "4g_st_pct", "median_income", "population"]
target_roi_column = "roi"

years_of_window = 5
time_series_window = []
static_series_window = []
target_roi_window = []

# iterate over each county
for geo_id, county_set in df.groupby("geoID"):
    county_set = county_set.sort_values("year")

    # A county needs `years_of_window` input rows plus one target row to yield
    # a single window; with n rows it yields n - years_of_window windows.
    num_windows = len(county_set) - years_of_window
    if num_windows < 1:
        continue

    # Static features are constant per county, so look them up once here
    # instead of once per window.
    static_rows = df_static.loc[df_static["geoID"] == geo_id, static_series].values
    if len(static_rows) == 0:
        # County has no row in df_static (no 2023 record); skip it rather than
        # failing with IndexError on the [0] lookup.
        continue
    static_series_data = static_rows[0]

    # Slide the window one row at a time (e.g. rows 2014-2018 -> target 2019).
    # The previous bound, len - years_of_window - 1, stopped one step early:
    # the final window (whose target is the county's last row) was never
    # produced, and a county with exactly years_of_window + 1 rows produced none.
    # NOTE(review): rows are assumed to be consecutive years with a populated
    # "roi" in the last row — confirm against the dataset.
    for index in range(num_windows):
        time_series_data = county_set.iloc[index:index + years_of_window][time_series].values
        target_roi = county_set.iloc[index + years_of_window][target_roi_column]

        # append the result as 1 window
        time_series_window.append(time_series_data)
        static_series_window.append(static_series_data)
        target_roi_window.append(target_roi)

In [None]:
# Stack the per-window Python lists into numpy arrays for tensorflow:
# time-series inputs, static inputs, and ROI labels, in that order.
x_time_series, x_static_series, y_label_series = (
    np.array(window_list)
    for window_list in (time_series_window, static_series_window, target_roi_window)
)


In [None]:
# split datasets
# One call splits all three arrays with the same shuffled indices, so sample i
# stays aligned across time-series inputs, static inputs and labels.
(
    x_times_series_train,
    x_time_series_test,
    x_static_series_train,
    x_static_series_test,
    y_label_series_train,
    y_label_series_test,
) = train_test_split(
    *(x_time_series, x_static_series, y_label_series),
    test_size=0.2,
    random_state=30,
)

# get rid of the extreme tails data
# Labels outside [-15, 25] are clamped to the nearest bound, in both splits.
roi_clip_low, roi_clip_high = -15, 25
y_label_series_train = np.clip(y_label_series_train, roi_clip_low, roi_clip_high)
y_label_series_test = np.clip(y_label_series_test, roi_clip_low, roi_clip_high)

In [None]:
# Standardise every feature (zero mean, unit variance) so no feature dominates
# by scale. Both scalers are fitted on the training split only and then applied
# to the test split.
num_time_series_samples, num_time_series_timesteps, num_time_series_features = x_times_series_train.shape
num_static_series_samples, num_static_series_features = x_static_series_train.shape

# StandardScaler works on 2D input, so collapse (samples, timesteps) into rows
# and keep one column per time-series feature.
x_times_series_train_flat = x_times_series_train.reshape(-1, num_time_series_features)
x_time_series_test_flat = x_time_series_test.reshape(-1, num_time_series_features)

# Fit on the flattened training rows, then restore the 3D layout the LSTM expects.
time_series_std = StandardScaler()
time_series_std.fit(x_times_series_train_flat)
x_times_series_train_std = time_series_std.transform(x_times_series_train_flat).reshape(
    x_times_series_train.shape
)
x_time_series_test_std = time_series_std.transform(x_time_series_test_flat).reshape(
    x_time_series_test.shape
)

# Static features are already 2D, so no reshaping is needed.
static_series_std = StandardScaler()
static_series_std.fit(x_static_series_train)
x_static_series_train_std = static_series_std.transform(x_static_series_train)
x_static_series_test_std = static_series_std.transform(x_static_series_test)

# Persist the fitted scalers to the working directory for reuse outside this notebook.
joblib.dump(time_series_std, "time_series_std.pkl")
joblib.dump(static_series_std, "static_series_std.pkl")

In [None]:
# Histogram of the (clipped) training labels, 50 bins.
roi_fig, roi_ax = plt.subplots()
roi_ax.hist(y_label_series_train, bins=50)
roi_ax.set_title("ROI Distribution (Training)")
plt.show()

In [None]:
# Two-branch network: stacked LSTMs over the yearly window, a small dense layer
# over the static features, concatenated into a single linear output.

# Time-series branch. The first LSTM returns the full sequence so the second
# LSTM can consume it; the second returns only its final state.
time_series_input = Input(shape=(years_of_window, num_time_series_features))
lstm_layer_ouput = LSTM(
    units=128,
    activation="tanh",
    dropout=0.3,
    return_sequences=True,
)(time_series_input)
lstm = LSTM(units=64, activation="tanh", dropout=0.2)(lstm_layer_ouput)

# Static branch.
static_series_input = Input(shape=(num_static_series_features,))
dense = Dense(units=32, activation="relu")(static_series_input)

# Merge both branches and produce one unbounded output value per sample.
combined_input = Concatenate()([lstm, dense])
output = Dense(units=1, activation="linear")(combined_input)

model = Model(
    inputs=[time_series_input, static_series_input],
    outputs=output,
)
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss="mse",
    metrics=["mae"],
)

In [None]:
# Stop once validation MAE has not improved for 50 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(
    monitor="val_mae",
    patience=50,
    restore_best_weights=True,
)

# Input order must match the model's inputs: [time series, static].
train_inputs = [x_times_series_train_std, x_static_series_train_std]
# NOTE(review): the test split doubles as the validation set here, so early
# stopping selects weights against it.
validation_inputs = [x_time_series_test_std, x_static_series_test_std]

results = model.fit(
    x=train_inputs,
    y=y_label_series_train,
    validation_data=(validation_inputs, y_label_series_test),
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
)

In [None]:
# Persist the trained model to the working directory.
model_output_path = "v1_model.keras"
model.save(model_output_path)

In [None]:
# Visualise loss and MAE per epoch from the fit history.
plot_training_curves(history=results)