# Final Project

In [None]:
import os
import pickle
import time

# TF_CPP_MIN_LOG_LEVEL is only guaranteed to take effect when it is set
# before TensorFlow (or Keras, which loads it) is imported, so it is
# configured ahead of the third-party imports below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'  # Suppress TF AVX info message

import numpy as np
import pandas as pd
import tensorflow as tf
import keras

from keras.wrappers.scikit_learn import KerasRegressor
from typing import Final
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

## Data Preprocessing

In [None]:
# Build the complete dataset

# Read in the raw data and give the GDP sheet readable column names.
# `set_axis` returns a new frame by default; its `inplace` keyword was removed
# in pandas 2.0, so it is not passed.
gdp_raw = pd.read_excel('gdp.xlsx').set_axis(
    [
        'county', '2017', '2018',
        '2019', '2020', 'rank 2018',
        'percent change 2018', 'percent change 2019',
        'percent change 2020', 'rank 2020'
    ],
    axis=1
)

# Keep only the name column, the 2019 GDP and the 2020 percent change, and
# only positional rows 5 through 3221 of the sheet.
gdp_clean: pd.DataFrame = gdp_raw.drop(
    columns=[
        'rank 2018', '2017', '2018', '2020',
        'rank 2020', 'percent change 2018',
        'percent change 2019'
    ]
).iloc[5:3222]

state_names = pd.read_csv('us-counties-2020.csv')['state'].unique()
counties_df = pd.read_csv('complete.csv')

# Initialize empty columns
counties_df['2019 raw GDP'] = np.nan
counties_df['percent change 2020'] = np.nan

# Rearrange the GDP data so that it's ordered by counties.
# A row whose name cell is a known state switches the current state; every
# other row is matched against (current state, county name) in counties_df.
known_states = set(state_names)  # O(1) membership test inside the loop
curr_state = None
for name, raw_gdp, pct_change in gdp_clean.itertuples(index=False, name=None):
    if name in known_states:
        curr_state = name
        continue

    # Address the target cells by boolean mask and column label rather than
    # by hard-coded column positions, so the assignment does not depend on
    # how many columns complete.csv has and is a no-op when nothing matches.
    county_mask = (counties_df['state'] == curr_state) & (
        counties_df['county'] == name)
    counties_df.loc[county_mask, '2019 raw GDP'] = raw_gdp
    counties_df.loc[county_mask, 'percent change 2020'] = pct_change

# Datatype conversion
counties_df['2019 raw GDP'] = counties_df['2019 raw GDP'].astype('float64')
counties_df['percent change 2020'] = counties_df['percent change 2020'].astype(
    'float64')

print(f"{len(counties_df['state'].unique())} states")
print(counties_df)


In [None]:
# One hot encoding for the state names: swap the categorical 'state' column
# for one indicator column per state, then discard every row that still
# contains a missing value.
encoded_state_names = pd.get_dummies(counties_df['state'])
counties_without_state = counties_df.drop(columns=['state'])
counties_df = counties_without_state.join(encoded_state_names).dropna(
    axis='index', how='any')
print(counties_df)

## Building the model

In [None]:
# Setup some constants
# Learning rate handed to the SGD optimizer when no other rate is supplied.
DEFAULT_LEARNING_RATE: Final[float] = 0.1
# Number of epochs used when fitting the default model.
DEFAULT_EPOCHS: Final[int] = 500

In [None]:
def BuildDefaultModel(inputDim: int = 59):
    """Build and compile the baseline feed-forward regressor.

    The network has four ReLU hidden layers (30, 30, 15 and 5 units) and a
    single ReLU output unit. It is compiled with SGD at
    DEFAULT_LEARNING_RATE and mean squared error as both the loss and the
    reported metric.

    Args:
        inputDim: Number of input features per sample. Defaults to 59, the
            width that was previously hard-coded.

    Returns:
        The compiled keras.Sequential model.
    """
    SGD_optimizer: Final = tf.keras.optimizers.SGD(
        learning_rate=DEFAULT_LEARNING_RATE)
    lossFunction = tf.keras.losses.MeanSquaredError()

    model: keras.Sequential = keras.Sequential([
        # The trailing comma matters: `(59)` is just the int 59, whereas the
        # `shape` argument is documented as a tuple.
        keras.Input(shape=(inputDim,)),
        tf.keras.layers.Dense(
            units=30, activation=tf.nn.relu, name='hidden_layer_1'),
        tf.keras.layers.Dense(
            units=30, activation=tf.nn.relu, name='hidden_layer_2'),
        tf.keras.layers.Dense(
            units=15, activation=tf.nn.relu, name='hidden_layer_3'),
        tf.keras.layers.Dense(
            units=5, activation=tf.nn.relu, name='hidden_layer_4'),
        tf.keras.layers.Dense(units=1, activation=tf.nn.relu, name='output'),
    ], name='Default_COVID_Regressor')

    model.compile(loss=lossFunction, optimizer=SGD_optimizer,
                  metrics=['MeanSquaredError'])
    return model


defaultCovidRegressor = BuildDefaultModel()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
defaultCovidRegressor.summary()


In [None]:
# Build the input dataset X

# Every column except the regression target and the county label is a
# candidate feature.
input_attributes = counties_df.drop(columns=['percent change 2020', 'county'])
input_attributes = input_attributes.astype(
    {'2019 raw GDP': 'float64', 'cases': 'int32'})

# Turn the raw case and death counts into per-capita rates, then discard the
# raw counts.
raw_population = input_attributes['2020 population']
input_attributes['Positivity Rate'] = input_attributes['cases'].div(
    raw_population)
input_attributes['Death Rate'] = input_attributes['deaths'].div(
    raw_population)
input_attributes = input_attributes.drop(columns=['cases', 'deaths'])

# GDP and population are min-max scaled only after the rates are computed,
# each with its own scaler, so the rates are derived from unscaled values.
gdp_scaler = MinMaxScaler()
population_scaler = MinMaxScaler()

gdp_column = input_attributes['2019 raw GDP'].to_numpy().reshape(-1, 1)
input_attributes['2019 raw GDP'] = gdp_scaler.fit_transform(gdp_column)

population_column = input_attributes['2020 population'].to_numpy().reshape(
    -1, 1)
input_attributes['2020 population'] = population_scaler.fit_transform(
    population_column)

print(input_attributes)


In [None]:
# Min-max scale the target column into [0, 1]. The fitted scaler stays
# available as `y_scaler`, and the result is flattened back to 1-D.
y_scaler = MinMaxScaler()
target_column = counties_df['percent change 2020'].to_numpy().reshape(-1, 1)
y_vals = y_scaler.fit_transform(target_column).ravel()


In [None]:
# Train test split: hold out 10% of the rows, using a fixed seed so the same
# partition is produced on every run.
split_options = dict(test_size=0.1, random_state=44)
X_Train, X_Test, y_Train, y_Test = train_test_split(
    input_attributes, y_vals, **split_options)

print(X_Train.shape, y_Train.shape)


In [None]:
# Train the (already compiled) default model; flip the flag to enable.
train_default = False
if train_default:
    default_fit_options = dict(
        epochs=DEFAULT_EPOCHS,
        validation_data=(X_Test, y_Test),
    )

    # Wall-clock timing around the fit call only.
    startTime = time.time()
    defaultCovidRegressor.fit(X_Train, y_Train, **default_fit_options)
    endTime = time.time()

    print(f'Training took {endTime-startTime} seconds')


In [None]:

# raw continuous outputs
if train_default:
    predicted_y: np.ndarray = defaultCovidRegressor.predict(X_Test)
    # scikit-learn's signature is mean_squared_error(y_true, y_pred); pass
    # the arguments in that order, matching the k-fold evaluation.
    print(f'MSE of test set is: {mean_squared_error(y_Test, predicted_y)}')


In [None]:
def generateValidationData(
        inputDF: pd.DataFrame,
        expectedOutputs: np.ndarray,
        trainIndexes: np.ndarray,
        testIndexes: np.ndarray,
) -> tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]:
    """Slice features and targets into train and test partitions.

    Args:
        inputDF: Feature table; rows are selected by position.
        expectedOutputs: Target values, indexed the same way as the rows of
            inputDF.
        trainIndexes: Positional indexes of the training rows.
        testIndexes: Positional indexes of the test rows.

    Returns:
        A 4-tuple (X_Train, X_Test, y_Train, y_Test).
    """
    partitions = (trainIndexes, testIndexes)
    feature_parts = [inputDF.iloc[positions] for positions in partitions]
    target_parts = [expectedOutputs[positions] for positions in partitions]
    return (*feature_parts, *target_parts)


In [None]:
# Do 10-fold validation
# This is slow so skip if necessary
skip_kfold = True
if not skip_kfold:
    KFolder = KFold(n_splits=10)
    mseScores: list[float] = []
    accuracyScores: list[float] = []  # never filled; kept so the name stays defined

    for trainIndexes, testIndexes in KFolder.split(input_attributes):
        # Fold-local names keep the loop from overwriting the global
        # X_Train/X_Test/y_Train/y_Test hold-out split that later cells use.
        foldXTrain, foldXTest, foldYTrain, foldYTest = generateValidationData(
            input_attributes,
            y_vals,
            trainIndexes,
            testIndexes
        )

        # Build a fresh model for every fold. Re-fitting one shared instance
        # would start each fold from weights already trained on rows that
        # now form the test partition, which biases the fold's score.
        foldModel = BuildDefaultModel()
        foldModel.fit(
            x=foldXTrain, y=foldYTrain,
            epochs=DEFAULT_EPOCHS,
            validation_data=(foldXTest, foldYTest)
        )

        foldPredictions: np.ndarray = foldModel.predict(foldXTest)

        mseScores.append(mean_squared_error(foldYTest, foldPredictions))
        # Running report: per-fold scores so far and their mean.
        print(f'MSE Loss: {mseScores}\nAverage MSE is: {np.average(mseScores)}')


## RandomSearch Hyperparameter Tuning

In [None]:
# build generic model that accepts parameters

def BuildModel(numNodesLayer1=30,
               numNodesLayer2=30,
               numNodesLayer3=15,
               numNodesLayer4=5,
               learningRate=DEFAULT_LEARNING_RATE,
               inputDim=59):
    """Build and compile a feed-forward regressor with tunable sizes.

    The network has four ReLU hidden layers followed by a single ReLU output
    unit, and is compiled with SGD and mean squared error as both the loss
    and the reported metric.

    Args:
        numNodesLayer1: Units in the first hidden layer.
        numNodesLayer2: Units in the second hidden layer.
        numNodesLayer3: Units in the third hidden layer.
        numNodesLayer4: Units in the fourth hidden layer.
        learningRate: Learning rate for the SGD optimizer.
        inputDim: Number of input features per sample. Defaults to 59, the
            width that was previously hard-coded.

    Returns:
        The compiled keras.Sequential model.
    """
    SGD_optimizer: Final = tf.keras.optimizers.SGD(learning_rate=learningRate)
    lossFunction: Final = tf.keras.losses.MeanSquaredError()

    model: keras.Sequential = keras.Sequential(
        [
            # The trailing comma matters: `(59)` is just the int 59, whereas
            # the `shape` argument is documented as a tuple.
            keras.Input(shape=(inputDim,)),
            tf.keras.layers.Dense(
                units=numNodesLayer1, activation=tf.nn.relu, name='hidden_layer_1'),
            tf.keras.layers.Dense(
                units=numNodesLayer2, activation=tf.nn.relu, name='hidden_layer_2'),
            tf.keras.layers.Dense(
                units=numNodesLayer3, activation=tf.nn.relu, name='hidden_layer_3'),
            tf.keras.layers.Dense(
                units=numNodesLayer4, activation=tf.nn.relu, name='hidden_layer_4'),
            tf.keras.layers.Dense(
                units=1,
                activation=tf.nn.relu,
                name='output'
            ),
        ],
        name='COVID_Regressor'
    )

    model.compile(
        loss=lossFunction,
        optimizer=SGD_optimizer,
        metrics=['mse']
    )
    return model


# Wrap the builder so scikit-learn can treat the Keras model as an estimator.
wrappedCovidRegressor = KerasRegressor(build_fn=BuildModel)

# parameters passed to BuildModel(...)
# Each entry must be a flat 1-D array of candidates. Nesting the range as a
# list inside np.array([...]) builds a ragged array: on older NumPy that is a
# 2-element object array whose second "candidate" is the entire list, and on
# NumPy >= 1.24 it raises. Unpacking the range keeps 2 and 500..999 as
# individual candidates.
# NOTE(review): `nb_epoch` is the legacy Keras spelling of `epochs` - confirm
# the installed wrapper actually forwards it to fit().
param_grid = dict(
    nb_epoch=np.array([2, *range(500, 1000)]),
    learningRate=np.array([0.03, 0.05, 0.1, 0.2]),
    numNodesLayer1=np.array([13, 23, 30, 35]),
    numNodesLayer2=np.array([7, 10, 20, 30]),
    numNodesLayer3=np.array([10, 15, 17]),
    numNodesLayer4=np.array([5, 6, 7])
)

# Randomized search over the candidates above with 10-fold cross-validation,
# using all available cores.
random_search = RandomizedSearchCV(estimator=wrappedCovidRegressor,
                                   param_distributions=param_grid,
                                   n_jobs=-1,
                                   cv=10)

# Extremely slow, do not run this repeatedly
random_search_result = random_search.fit(
    X_Train,
    y_Train,
    validation_data=(X_Test, y_Test)
)


In [None]:
# Copy the winning hyperparameters so the search result itself is untouched.
tuning_results = dict(random_search_result.best_params_)
print(tuning_results)

# Pull the epoch count out of the dict; what remains can be passed straight
# to the model builder as keyword arguments.
best_num_epochs = tuning_results.pop('nb_epoch')

In [None]:
# Show the epoch count taken from the best hyperparameter set.
print(best_num_epochs)

In [None]:
# Build a model from the tuned hyperparameters and train it for the tuned
# number of epochs, validating against the held-out split.
tunedModel = BuildModel(**tuning_results)

tuned_fit_options = dict(
    epochs=best_num_epochs,
    validation_data=(X_Test, y_Test),
)

# Wall-clock timing around the fit call only.
startTime = time.time()
history = tunedModel.fit(X_Train, y_Train, **tuned_fit_options)
endTime = time.time()

print(f'Training took {endTime-startTime} seconds')


In [None]:
import matplotlib.pyplot as plt

# Plot training and validation loss per epoch on one set of axes, write the
# figure to disk, then display it.
fig, ax = plt.subplots()
for series_name in ('loss', 'val_loss'):
    ax.plot(history.history[series_name])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
fig.savefig('loss.png', dpi=150)
plt.show()

In [None]:
# Persist the tuned model and the three fitted scalers, one pickle file each.
pkl_filename = 'packaged_model.pkl'
gdp_scaler_file = 'gdp_scaler.pkl'
population_scaler_file = 'population_scaler.pkl'
y_scaler_file = 'y_scaler.pkl'

artifacts_to_save = (
    (pkl_filename, tunedModel),
    (gdp_scaler_file, gdp_scaler),
    (population_scaler_file, population_scaler),
    (y_scaler_file, y_scaler),
)

# Written in the same order as listed above.
for target_path, artifact in artifacts_to_save:
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)