In [None]:
using Pkg

# Packages this notebook depends on.
deps = [
    "DataFrames",
    "CSV",
    "ScikitLearn",
    "Statistics",
    "CUDA",
]
Pkg.add(deps)

In [None]:
using DataFrames, CSV, ScikitLearn, Statistics, CUDA

In [None]:
# Read the test split from disk and materialise it as a DataFrame.
test_df = CSV.File("../../data/test.csv") |> DataFrame


In [None]:
"""
    remove_outliers(df, feat, q=0.05)

Return a new `DataFrame` holding only the rows of `df` whose value in column `feat`
lies strictly inside a widened quantile band.

The band starts at the `q` and `1 - q` quantiles of the column and is extended on each
side by 1.5 times the distance between those two quantiles. `df` is not modified.

# Arguments
- `df::DataFrame`: table to filter.
- `feat`: column name, given as a string or a `Symbol`.
- `q`: lower tail probability; must satisfy `0 <= q <= 0.5`.

# Throws
- `ArgumentError` if `q` is outside `[0, 0.5]` (a larger `q` would invert the band and
  silently discard every row).
"""
function remove_outliers(df::DataFrame, feat::Union{AbstractString,Symbol}, q=0.05)
    0 <= q <= 0.5 || throw(ArgumentError("q must lie in [0, 0.5], got $q"))

    # `df[!, feat]` is the stored column itself. It is only read here, so the copies
    # made by `df[:, feat]` are not needed.
    col = df[!, feat]

    lower = quantile(col, q)
    upper = quantile(col, 1 - q)
    spread = upper - lower

    lower -= 1.5 * spread
    upper += 1.5 * spread

    # Row indexing with `:` for the columns builds a new table.
    return df[(col .> lower) .& (col .< upper), :]
end

"""
    apply_feature_engineering(df, keep_outliers)

Prepare the housing table for modelling and return the resulting `DataFrame`.

Steps, in order:
1. fill missing `total_bedrooms` with the mean of its non-missing values;
2. replace `ocean_proximity` by the integer column `ocean_proximity_enc`
   (labels absent from the encoding table become `missing`);
3. add `rooms_per_bedroom`, `rooms_per_household`, `encoded_position`
   (longitude + latitude) and `population_per_bedrooms`;
4. append `target` as a copy of `median_house_value` and drop the original column;
5. unless `keep_outliers` is `true`, pass every column through `remove_outliers`
   with `q = 0.05`.

Steps 1-4 modify `df` in place. Step 5 builds a new table, so callers should always
use the returned value rather than rely on `df` afterwards.
"""
function apply_feature_engineering(df::DataFrame, keep_outliers::Bool)
    # Plain `=` replaces the column with the freshly broadcast vector, whose element
    # type no longer includes `Missing`. With `.=` the write may go into the existing
    # column (depending on the DataFrames/Julia version), which keeps the
    # `Union{Missing, T}` element type and propagates it to the derived features.
    bedrooms_mean = mean(skipmissing(df.total_bedrooms))
    df.total_bedrooms = coalesce.(df.total_bedrooms, bedrooms_mean)

    custom_encoding = Dict(
        "ISLAND" => 4,
        "NEAR OCEAN" => 3,
        "NEAR BAY" => 2,
        "<1H OCEAN" => 1,
        "INLAND" => 0,
    )
    # `Ref` keeps the dictionary from being broadcast over.
    df.ocean_proximity_enc = get.(Ref(custom_encoding), df.ocean_proximity, missing)
    select!(df, Not(:ocean_proximity))

    df.rooms_per_bedroom = df.total_rooms ./ df.total_bedrooms
    df.rooms_per_household = df.total_rooms ./ df.households
    df.encoded_position = df.longitude .+ df.latitude
    df.population_per_bedrooms = df.population ./ df.total_bedrooms

    # Appending `target` before dropping the source column places it last.
    df.target = df.median_house_value
    select!(df, Not(:median_house_value))

    keep_outliers && return df

    # Filtering rows never changes the column set, so the names can be read once.
    filtered = df
    for name in names(df)
        filtered = remove_outliers(filtered, name, 0.05)
    end

    return filtered
end

In [None]:
"""
    rmse(y_true, y_pred)

Return the root-mean-square error between the predictions `y_pred` and the reference
values `y_true`.
"""
function rmse(y_true::AbstractVector, y_pred::AbstractVector)
    squared_errors = @. (y_pred - y_true)^2
    return sqrt(mean(squared_errors))
end

In [None]:
"""
    split_target(df)

Split `df` into features and target.

Returns `(x, y)`: `x` is the transpose of the matrix built from every column except
`:target`, so each row of `df` becomes one column of `x`; `y` is `df.target` as a
`Vector`.
"""
function split_target(df::DataFrame)
    features = Matrix(select(df, Not(:target)))
    targets = Vector(df.target)
    return transpose(features), targets
end

In [None]:
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split
using ScikitLearn: fit!, predict

@sk_import neural_network: MLPRegressor
@sk_import metrics: mean_squared_error

"""
    build_model(inputs, layers, activation_func="relu", loss_func="squared_loss",
                optimizer="adam", patience=10)

Create an unfitted scikit-learn `MLPRegressor` with early stopping enabled.

# Arguments
- `inputs::Int`: not used by this function; kept so existing call sites keep working.
- `layers::Vector{Int}`: size of each hidden layer, in order.
- `activation_func::String`: forwarded as `activation`.
- `loss_func::String`: not forwarded, because no loss option is passed to
  `MLPRegressor`. A warning is logged when a value other than the default is given.
- `optimizer::String`: forwarded as `solver`.
- `patience::Int`: forwarded as `n_iter_no_change`.

# Returns
The configured `MLPRegressor` object.
"""
function build_model(inputs::Int,
                     layers::Vector{Int},
                     activation_func::String = "relu",
                     loss_func::String = "squared_loss",
                     optimizer::String = "adam",
                     patience::Int = 10)

    # The argument is accepted for interface compatibility only; make it visible when
    # a caller expects it to have an effect.
    if loss_func != "squared_loss"
        @warn "build_model ignores loss_func; it is not passed to MLPRegressor" loss_func
    end

    model = MLPRegressor(hidden_layer_sizes=Tuple(layers),
                         activation=activation_func,
                         solver=optimizer,
                         alpha=0.0,
                         batch_size="auto",
                         learning_rate="constant",
                         learning_rate_init=0.001,
                         max_iter=200,
                         shuffle=true,
                         tol=1e-4,
                         verbose=false,
                         warm_start=false,
                         momentum=0.9,
                         nesterovs_momentum=true,
                         early_stopping=true,
                         validation_fraction=0.1,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-8,
                         n_iter_no_change=patience)

    return model
end