In [160]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [161]:
# All processed splits live in the same directory, relative to the notebook.
dir_data = os.path.join("data", "processed")

# One CSV per split: train / validation / test.
path_train, path_val, path_test = (
    os.path.join(dir_data, file_name)
    for file_name in ("train.csv", "validation.csv", "test.csv")
)

In [162]:
# Read the three processed splits into DataFrames, in train / validation / test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (path_train, path_val, path_test)
)

In [163]:
# Columns holding the violent-crime counts of the three preceding months.
lag_cols = ["violent_lag_1", "violent_lag_2", "violent_lag_3"]

# Add the same derived columns to every split; the frames are modified in place.
for split_df in (df_train, df_val, df_test):
    # Average violent-crime count over the previous three months.
    split_df["mean_violent_prev_3mo"] = split_df[lag_cols].mean(axis=1)
    # Plain log for income, log1p for population (matches the original transforms).
    split_df["log_median_household_income"] = np.log(split_df["median_household_income"])
    split_df["log_pop_total"] = np.log1p(split_df["pop_total"])

In [164]:
# Preview the training split, now including the three engineered columns
# (mean_violent_prev_3mo, log_median_household_income, log_pop_total).
df_train

Unnamed: 0.1,Unnamed: 0,ZCTA,incident_month_timestamp,crime_Non-Violent,crime_Violent,violent_lag_1,violent_lag_2,violent_lag_3,season,month_sin,month_cos,pop_total,median_household_income,gini_index,unemployment_rate,poverty_rate,mean_violent_prev_3mo,log_median_household_income,log_pop_total
0,0,90001,2020-04-01,16.0,25.0,16.0,12.0,38.0,spring,8.660254e-01,-5.000000e-01,56403,60751.0,0.4160,0.096901,0.205254,22.000000,11.014539,10.940295
1,1,90001,2020-05-01,31.0,22.0,25.0,16.0,12.0,spring,5.000000e-01,-8.660254e-01,56403,60751.0,0.4160,0.096901,0.205254,17.666667,11.014539,10.940295
2,2,90001,2020-06-01,29.0,36.0,22.0,25.0,16.0,summer,1.224647e-16,-1.000000e+00,56403,60751.0,0.4160,0.096901,0.205254,21.000000,11.014539,10.940295
3,3,90001,2020-07-01,37.0,27.0,36.0,22.0,25.0,summer,-5.000000e-01,-8.660254e-01,56403,60751.0,0.4160,0.096901,0.205254,27.666667,11.014539,10.940295
4,4,90001,2020-08-01,24.0,29.0,27.0,36.0,22.0,summer,-8.660254e-01,-5.000000e-01,56403,60751.0,0.4160,0.096901,0.205254,28.333333,11.014539,10.940295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4018,6682,91801,2022-02-01,0.0,0.0,0.0,0.0,0.0,winter,8.660254e-01,5.000000e-01,53730,85549.0,0.4561,0.059655,0.114839,0.000000,11.356845,10.891745
4019,6683,91801,2022-03-01,0.0,0.0,0.0,0.0,0.0,spring,1.000000e+00,6.123234e-17,53730,85549.0,0.4561,0.059655,0.114839,0.000000,11.356845,10.891745
4020,6684,91801,2022-04-01,0.0,0.0,0.0,0.0,0.0,spring,8.660254e-01,-5.000000e-01,53730,85549.0,0.4561,0.059655,0.114839,0.000000,11.356845,10.891745
4021,6685,91801,2022-05-01,0.0,0.0,0.0,0.0,0.0,spring,5.000000e-01,-8.660254e-01,53730,85549.0,0.4561,0.059655,0.114839,0.000000,11.356845,10.891745


## Feature Selection and Data Prep

In [165]:
def prep_data(df, features, label, df_fit=None, seed=None):
    """Shuffle, clean, and standardize one data split for modelling.

    The rows of ``df`` are shuffled, rows with a null in *any* column are
    dropped, and the feature and label columns are standardized with scalers
    fitted on a reference frame, so every split is scaled with the same
    statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to prepare (train, validation, or test).
    features : str or list of str
        Feature column name(s). ``"month_sin"`` and ``"month_cos"`` are passed
        through unscaled; every other feature is standardized.
    label : str or list of str
        Target column name(s).
    df_fit : pandas.DataFrame, optional
        Frame the scalers are fitted on (after dropping rows with nulls).
        Defaults to the notebook-level ``df_train``, so existing
        three-argument calls behave as before.
    seed : int, optional
        Seed for the shuffle. ``None`` (default) draws from NumPy's global
        random state; pass an int for a reproducible row order.

    Returns
    -------
    X_std : array
        Transformed features, one row per kept record. Scaled columns come
        first, followed by the passthrough columns.
    Y_std : array
        Standardized label, two-dimensional (one column per label).
    """
    # Accept a bare column name as well as a list of names; StandardScaler needs
    # 2-D input, so columns are always selected with a list.
    feature_cols = [features] if isinstance(features, str) else list(features)
    label_cols = [label] if isinstance(label, str) else list(label)

    # Fall back to the notebook-level training frame so that existing
    # prep_data(df, features, label) calls keep working unchanged.
    if df_fit is None:
        df_fit = df_train
    df_fit = df_fit.dropna()

    # Shuffle by position rather than by index label, so a frame with duplicate
    # index labels cannot break the shuffle.
    if seed is None:
        order = np.random.permutation(len(df))
    else:
        order = np.random.default_rng(seed).permutation(len(df))

    # Drop records with any nulls (checked across all columns, not only the
    # selected features and label).
    df_shuffled = df.iloc[order].dropna()

    # Get features and label
    X = df_shuffled[feature_cols]
    Y = df_shuffled[label_cols]

    # The cyclical month encodings are left unscaled; everything else is standardized.
    scale_cols = [col for col in feature_cols if col not in ("month_sin", "month_cos")]

    # Fit the scalers on the reference frame only, never on the split being
    # transformed.
    X_scaler = ColumnTransformer(
        transformers=[("scaler", StandardScaler(), scale_cols)],
        remainder="passthrough",
    )
    X_scaler.fit(df_fit[feature_cols])
    Y_scaler = StandardScaler()
    Y_scaler.fit(df_fit[label_cols])

    # Scale data
    X_std = X_scaler.transform(X)
    Y_std = Y_scaler.transform(Y)

    return X_std, Y_std

In [166]:
# Current feature set: last month's violent-crime count only.
# Other candidates that were toggled here: "log_median_household_income",
# "pop_total", "month_sin", "month_cos".
features = ["violent_lag_1"]
label = ["crime_Violent"]

# Prepare the splits in train / validation / test order.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = (
    prep_data(split_df, features, label) for split_df in (df_train, df_val, df_test)
)

In [167]:
# Sanity check on the prepared training matrix: (rows kept after dropping nulls, number of features).
X_train.shape

(3780, 1)

### Linear regression

In [168]:
# Seed TensorFlow's global random generator; build_model() below sets the same seed again on every call.
tf.random.set_seed(0)

def build_model(num_features, learning_rate):
    """Build and compile a one-unit dense model, i.e. a linear regression.

    Parameters
    ----------
    num_features : int
        Number of input features (columns of the design matrix).
    learning_rate : float
        Step size for the SGD optimizer.

    Returns
    -------
    tf.keras.Sequential
        Compiled model with mean-squared-error loss; weights and bias start at 1.
    """
    # Clear Keras' global state and set the TensorFlow seed on every call.
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential([
        # Declare the input with an explicit Input object: passing `input_shape`
        # to the Dense layer makes Keras emit a UserWarning.
        tf.keras.Input(shape=(num_features,)),
        tf.keras.layers.Dense(
            units=1,  # output dim
            use_bias=True,  # use a bias (intercept) param
            kernel_initializer="ones",  # initialize params to 1
            bias_initializer="ones",  # initialize bias to 1
        ),
    ])

    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="mse")

    return model


# One input per prepared feature column.
model_lr = build_model(num_features=X_train.shape[1], learning_rate=0.01)

# Keep the returned History object so per-epoch train/validation losses stay available.
fit_model_lr = model_lr.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=5,
)

Epoch 1/5
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 607us/step - loss: 0.2725 - val_loss: 0.0668
Epoch 2/5
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 348us/step - loss: 0.0630 - val_loss: 0.0566
Epoch 3/5
[1m  1/119[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 5ms/step - loss: 0.0481

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step - loss: 0.0615 - val_loss: 0.0563
Epoch 4/5
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 349us/step - loss: 0.0615 - val_loss: 0.0563
Epoch 5/5
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 415us/step - loss: 0.0615 - val_loss: 0.0563


In [169]:
# MSE of the linear model in standardized label units.
# NOTE(review): this cell reports the test split, while the tree-model cells
# report validation; confirm which split is intended for model comparison.
mse_lr_train = model_lr.evaluate(X_train, Y_train, verbose=0)
mse_lr_test = model_lr.evaluate(X_test, Y_test, verbose=0)

print("Training (standardized) MSE:", mse_lr_train)
print("Test (standardized) MSE:", mse_lr_test)

Training (standardized) MSE: 0.06145239993929863
Test (standardized) MSE: 0.06155939772725105


### Random Forest

Random forest does slightly better when population and the month sin/cos transformations are included, at the cost of a bit more overfitting. The overfitting is minor, and if we have generalizable models with the same validation accuracy, I don't think we would care about the better training performance (open question).

In [170]:
# Random forest hyperparameters, collected in one place for easy tweaking.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    random_state=1234,
    n_jobs=-1,
)

model_rf = RandomForestRegressor(**rf_params)
# The label array is 2-D; flatten it to the 1-D target the regressor expects.
model_rf.fit(X_train, Y_train.ravel())

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [171]:
# Score the random forest on the training and validation splits (standardized label units).
Y_pred_train = model_rf.predict(X_train)
mse_train = mean_squared_error(Y_train.ravel(), Y_pred_train)

Y_pred_val = model_rf.predict(X_val)
mse_val = mean_squared_error(Y_val.ravel(), Y_pred_val)

print("Training (standardized) MSE:", mse_train)
# This value comes from the validation split, so it is labelled as such.
print("Validation (standardized) MSE:", mse_val)

Training (standardized) MSE: 0.05055689716167523
Training (standardized) MSE: 0.06573242967069454


### Gradient Boosting

In [172]:
# Gradient boosting hyperparameters.
# NOTE(review): the model is trained with absolute-error loss but scored with MSE
# in the next cell, and max_features=5 exceeds the single feature currently
# selected; confirm both are intended.
gb_params = dict(
    loss="absolute_error",
    learning_rate=0.01,
    n_estimators=100,
    max_depth=10,
    random_state=1234,
    max_features=5,
)

model_gb = GradientBoostingRegressor(**gb_params)
# The label array is 2-D; flatten it to the 1-D target the regressor expects.
model_gb.fit(X_train, Y_train.ravel())

0,1,2
,loss,'absolute_error'
,learning_rate,0.01
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,10
,min_impurity_decrease,0.0


In [173]:
# Score the gradient boosting model on the training and validation splits (standardized label units).
Y_pred_train = model_gb.predict(X_train)
mse_train = mean_squared_error(Y_train.ravel(), Y_pred_train)

Y_pred_val = model_gb.predict(X_val)
mse_val = mean_squared_error(Y_val.ravel(), Y_pred_val)

print("Training (standardized) MSE:", mse_train)
# This value comes from the validation split, so it is labelled as such.
print("Validation (standardized) MSE:", mse_val)

Training (standardized) MSE: 0.2930177259223176
Training (standardized) MSE: 0.28661148137092185
