# Training

In [333]:
using Pkg
deps = ["DataFrames", "CSV", "ScikitLearn", "Statistics", "Flux", "ProgressMeter", "FluxTraining"]
Pkg.add(deps)

In [203]:
using DataFrames, CSV, ScikitLearn, Statistics, Flux

In [156]:
test_df = DataFrame(CSV.File("../../data/test.csv"))

Row,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64?,Float64,Float64,Float64,String15,Float64
1,-118.36,34.06,39.0,2810.0,670.0,1109.0,624.0,3.25,<1H OCEAN,355000.0
2,-119.78,36.78,37.0,2185.0,455.0,1143.0,438.0,1.9784,INLAND,70700.0
3,-122.42,37.73,46.0,1819.0,411.0,1534.0,406.0,4.0132,NEAR BAY,229400.0
4,-122.28,37.81,52.0,340.0,97.0,200.0,87.0,1.5208,NEAR BAY,112500.0
5,-118.13,33.82,37.0,1530.0,290.0,711.0,283.0,5.1795,<1H OCEAN,225400.0
6,-118.16,34.15,17.0,821.0,163.0,229.0,164.0,7.3715,<1H OCEAN,263000.0
7,-120.44,34.91,12.0,3189.0,463.0,1200.0,442.0,5.299,<1H OCEAN,226800.0
8,-122.48,38.31,29.0,2375.0,560.0,1124.0,502.0,2.3276,<1H OCEAN,166200.0
9,-117.64,34.08,35.0,1254.0,241.0,729.0,253.0,3.495,INLAND,118000.0
10,-118.16,34.04,45.0,332.0,70.0,302.0,60.0,3.1895,<1H OCEAN,156300.0


In [197]:
function remove_outliers( df::DataFrame, feat::String, q=0.05 )
    Qi = quantile(df[:,feat], q)
    Qf = quantile(df[:,feat], 1-q)
    IQR = Qf-Qi

    Qi -= 1.5*IQR
    Qf += 1.5*IQR
    
    return df[ (df[:,feat] .> Qi) .& (df[:,feat] .< Qf) , : ]
end

function apply_feature_engineering(df::DataFrame, keep_outliers::Bool)
    df.total_bedrooms .= coalesce.(df.total_bedrooms, mean(skipmissing(df.total_bedrooms)))
    
    custom_encoding = Dict("ISLAND" => 0, "NEAR OCEAN" => 1, "NEAR BAY" => 2, "<1H OCEAN" => 3, "INLAND" => 4)
    df.ocean_proximity_enc = get.(Ref(custom_encoding), df.ocean_proximity, missing)
    select!(df, Not(:ocean_proximity))
    
    df.rooms_per_bedroom = df.total_rooms ./ df.total_bedrooms
    df.rooms_per_household = df.total_rooms ./ df.households
    df.encoded_position = df.longitude + df.latitude
    df.population_per_bedrooms = df.population ./ df.total_bedrooms
    df.target = df.median_house_value
    select!(df, Not(:median_house_value))
    
    if !keep_outliers
        for name in names(df)
            df = remove_outliers(df, name, 0.05)
        end
    end
    
    return df
end

apply_feature_engineering (generic function with 1 method)

In [None]:
function rmse(y_true::AbstractVector, y_pred::AbstractVector)
    return sqrt(mean((y_pred .- y_true) .^ 2))
end

In [166]:
size(test_df)

(4128, 10)

In [None]:
size(apply_feature_engineering(copy(test_df), false))

In [337]:
function split_target(df::DataFrame)
    x = transpose(Matrix(select(df, Not(:target))))
    y = Vector(df.target)
    return x, y
end

split_target (generic function with 1 method)

In [341]:
x_train, y_train = split_target(apply_feature_engineering(copy(test_df), false))

([-118.36 -119.78 … -118.1 -117.38; 34.06 36.78 … 34.09 33.99; … ; -84.3 -83.0 … -84.00999999999999 -83.38999999999999; 1.655223880597015 2.512087912087912 … 2.868512110726644 2.7259036144578315], [355000.0, 70700.0, 229400.0, 112500.0, 225400.0, 263000.0, 226800.0, 166200.0, 118000.0, 156300.0  …  315500.0, 195700.0, 171700.0, 229600.0, 132700.0, 68200.0, 225000.0, 350000.0, 227300.0, 141700.0])

In [343]:
size(y_train)

(4068,)

In [272]:
data = [(x_train[i, :], y_train[i]) for i in 1:size(x_train, 1)]
size(data)
data[1]

([-118.36, 34.06, 39.0, 2810.0, 670.0, 1109.0, 624.0, 3.25, 3.0, 4.1940298507462686, 4.503205128205129, -84.3, 1.655223880597015], 355000.0)

In [314]:
y_train = reshape(y_train, :, 1)
size(y_train)

(4068, 1)

UndefVarError: UndefVarError: `@epochs` not defined

In [363]:
using Pkg
Pkg.add("FluxTraining")
using Flux: @epochs

UndefVarError: UndefVarError: `@epochs` not defined

In [409]:
function build_model(inputs::Int,
    layers::Vector{Int},
    layers_per_dropout::Int=0,
    dropout_rate::Float64=0.0,
    activation_func::Function=Flux.relu
)
    layer_vec = Vector{Any}()
    push!(layer_vec, Flux.Dense(inputs => layers[1], activation_func))

    count = 1
    for i in 2:length(layers)
        push!(layer_vec, Flux.Dense(layers[i-1] => layers[i], activation_func))
        count += 1
        if layers_per_dropout > 0 && count % layers_per_dropout == 0
            push!(layer_vec, Flux.Dropout(dropout_rate))
            count = 0
        end
    end
    push!(layer_vec, Flux.Dense(last(layers) => 1))
    model = Flux.Chain(layer_vec)
    return model
end

function simple_train_model(train_df::DataFrame,
    layers::Vector{Int},
    layers_per_dropout::Int=0,
    dropout_rate::Float64=0.0,
    activation_func::Function=Flux.relu,
    loss_func::Function=Flux.mse,
    optimizer=Flux.ADAM
)

    x_train, y_train = split_target(apply_feature_engineering(train_df, false))
    println("building model")
    model = build_model(size(x_train, 1), layers, layers_per_dropout, dropout_rate, activation_func)
    opt = optimizer()
    y_train = reshape(y_train, 1, :)
    loss(x, y) = loss_func(model(x), y)
    evalcb() = println("RMSE: ", sqrt(mean((model(x_train) .- y_train) .^ 2)))
    for epoch in 1:1000
        Flux.train!(model, [(x_train, y_train)], opt) do m, x, y
            y_hat = model(x)
            loss_func(y_hat, y)
        end
    end
    # Flux.train!(loss, Flux.params(model), [(x_train, y_train)], opt, cb=evalcb)
    # optim = Flux.setup(optimizer(), model)
    # println("training model")
    # loader = Flux.DataLoader((data=x_train, label=y_train), batchsize=64);
    # losses = []
    # for epoch in 1:100
    #     for (x, y) in loader
    #         loss, grads = Flux.withgradient(model) do m
    #             # Evaluate model and loss inside gradient context:
    #             y_hat = m(x)
    #             Flux.crossentropy(y_hat, y)
    #         end
    #         Flux.update!(optim, model, grads[1])
    #         push!(losses, loss)  # logging, outside gradient context
    #     end
    # end

    println("done")
    return model
end

simple_train_model (generic function with 6 methods)

In [414]:
model = simple_train_model(copy(test_df), [64, 32, 16], 2, 0.2)

Chain([
  Array(
    Dense(13 => 64, relu),              [90m# 896 parameters[39m
    Dense(64 => 32, relu),              [90m# 2_080 parameters[39m
    Dropout(0.2),
    Dense(32 => 16, relu),              [90m# 528 parameters[39m
    Dense(16 => 1),                     [90m# 17 parameters[39m
  ),
])[90m                  # Total: 8 arrays, [39m3_521 parameters, 14.230 KiB.

In [415]:
# Assuming x_test is a DataFrame containing the test features and y_test is a vector containing the test targets
# Preprocess the test data similar to how you preprocessed the training data
x_test, y_test = split_target(apply_feature_engineering(copy(test_df), false))

# Evaluate the model on the test data
predictions = model(x_test)

# Compute evaluation metrics
rmse = sqrt(mean((predictions .- y_test) .^ 2))
println("RMSE on test data: ", rmse)

In [416]:
rmse

233486.56675017593

In [413]:
233611.7484814511 > 233580.9707921928

true

(4068, 15)