In [1]:
# installing dependencies, remove if already installed
using Pkg
deps = ["DataFrames", "CSV", "ScikitLearn", "Statistics", "CUDA"]
Pkg.add(deps)

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.10/Manifest.toml`


In [2]:
using DataFrames, CSV, ScikitLearn, Statistics, CUDA, Random
@sk_import neural_network: MLPRegressor
@sk_import metrics: mean_squared_error
@sk_import preprocessing: StandardScaler

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y -c anaconda conda` in root environment


Channels:
 - anaconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRunning `conda install -y -c conda-forge 'libstdcxx-ng>=3.4,<13.0'` in root environment


Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



PyObject <class 'sklearn.preprocessing._data.StandardScaler'>

In [3]:
Random.seed!(0)

TaskLocalRNG()

In [4]:
# initialize datasets
train_df = DataFrame(CSV.File("/tf/notebooks/data/train.csv"))
val_df = DataFrame(CSV.File("/tf/notebooks/data/validation.csv"))
train_df = vcat(train_df, val_df) # merge train and validation data, this is because the split is done by sklearn
test_df = DataFrame(CSV.File("/tf/notebooks/data/test.csv"))

Row,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64?,Float64,Float64,Float64,String15,Float64
1,-118.36,34.06,39.0,2810.0,670.0,1109.0,624.0,3.25,<1H OCEAN,355000.0
2,-119.78,36.78,37.0,2185.0,455.0,1143.0,438.0,1.9784,INLAND,70700.0
3,-122.42,37.73,46.0,1819.0,411.0,1534.0,406.0,4.0132,NEAR BAY,229400.0
4,-122.28,37.81,52.0,340.0,97.0,200.0,87.0,1.5208,NEAR BAY,112500.0
5,-118.13,33.82,37.0,1530.0,290.0,711.0,283.0,5.1795,<1H OCEAN,225400.0
6,-118.16,34.15,17.0,821.0,163.0,229.0,164.0,7.3715,<1H OCEAN,263000.0
7,-120.44,34.91,12.0,3189.0,463.0,1200.0,442.0,5.299,<1H OCEAN,226800.0
8,-122.48,38.31,29.0,2375.0,560.0,1124.0,502.0,2.3276,<1H OCEAN,166200.0
9,-117.64,34.08,35.0,1254.0,241.0,729.0,253.0,3.495,INLAND,118000.0
10,-118.16,34.04,45.0,332.0,70.0,302.0,60.0,3.1895,<1H OCEAN,156300.0


In [5]:
#remove outliers values, q represents the quantile where the data outside them will be removed
function remove_outliers( df::DataFrame, feat::String, q=0.05 )
    Qi = quantile(df[:,feat], q)
    Qf = quantile(df[:,feat], 1-q)
    IQR = Qf-Qi

    Qi -= 1.5*IQR
    Qf += 1.5*IQR
    
    return df[ (df[:,feat] .> Qi) .& (df[:,feat] .< Qf) , : ]
end

function apply_feature_engineering(df::DataFrame, keep_outliers::Bool)
    df.total_bedrooms .= coalesce.(df.total_bedrooms, mean(skipmissing(df.total_bedrooms)))
    
    #encode ocean_proximity column
    custom_encoding = Dict("ISLAND" => 4, "NEAR OCEAN" => 3, "NEAR BAY" => 2, "<1H OCEAN" => 1, "INLAND" => 0)
    df.ocean_proximity_enc = get.(Ref(custom_encoding), df.ocean_proximity, missing)
    select!(df, Not(:ocean_proximity))
    
    df.rooms_per_bedroom = df.total_rooms ./ df.total_bedrooms
    df.rooms_per_household = df.total_rooms ./ df.households
    df.encoded_position = df.longitude + df.latitude
    df.population_per_bedrooms = df.population ./ df.total_bedrooms
    df.target = df.median_house_value
    select!(df, Not(:median_house_value))
    
    if !keep_outliers
        for name in names(df)
            df = remove_outliers(df, name, 0.05)
        end
    end
    
    return df
end

apply_feature_engineering (generic function with 1 method)

In [6]:
function rmse(y_true::AbstractVector, y_pred::AbstractVector)
    return sqrt(mean((y_pred .- y_true) .^ 2))
end

rmse (generic function with 1 method)

In [7]:
# split the dataset into features and target
function split_target(df::DataFrame)
    x = Matrix(select(df, Not(:target)))
    y = Vector(df.target)
    return x, y
end

split_target (generic function with 1 method)

In [8]:
x_train, y_train = split_target(apply_feature_engineering(train_df, false))
x_test, y_test = split_target(apply_feature_engineering(test_df, false))

([-118.36 34.06 … -84.3 1.655223880597015; -119.78 36.78 … -83.0 2.512087912087912; … ; -118.1 34.09 … -84.00999999999999 2.868512110726644; -117.38 33.99 … -83.38999999999999 2.7259036144578315], [355000.0, 70700.0, 229400.0, 112500.0, 225400.0, 263000.0, 226800.0, 166200.0, 118000.0, 156300.0  …  315500.0, 195700.0, 171700.0, 229600.0, 132700.0, 68200.0, 225000.0, 350000.0, 227300.0, 141700.0])

In [9]:
using ScikitLearn

function build_model(layers::Vector{Int},
                     activation_func::String = "relu",
                     loss_func::String = "squared_loss",
                     optimizer::String = "adam",
                     patience::Int = 20)
    
    model = MLPRegressor(hidden_layer_sizes=tuple(layers...),
                         activation=activation_func,
                         solver=optimizer,
                         alpha=0.01, # applied alpha regularization to replace dropout
                         batch_size=16,
                         learning_rate="constant",
                         learning_rate_init=0.001,
                         max_iter=1500,
                         shuffle=true,
                         tol=1e-4,
                         early_stopping=true,
                         validation_fraction=0.25,
                         n_iter_no_change=patience,
                         random_state=0)
    
    return model
end

build_model (generic function with 5 methods)

In [10]:
model = build_model([64, 64, 64, 64, 64, 64])

In [11]:
pipe = ScikitLearn.Pipelines.Pipeline([("scaler", StandardScaler()), ("model", model)])

ScikitLearn.Skcore.Pipeline(Tuple{Any, Any}[("scaler", PyObject StandardScaler()), ("model", PyObject MLPRegressor(alpha=0.01, batch_size=16, early_stopping=True,
             hidden_layer_sizes=(64, 64, 64, 64, 64, 64), max_iter=1500,
             n_iter_no_change=20, random_state=0, validation_fraction=0.25))], Any[PyObject StandardScaler(), PyObject MLPRegressor(alpha=0.01, batch_size=16, early_stopping=True,
             hidden_layer_sizes=(64, 64, 64, 64, 64, 64), max_iter=1500,
             n_iter_no_change=20, random_state=0, validation_fraction=0.25)])

In [12]:
@time fit!(pipe, x_test, y_test)

 21.014252 seconds (212.00 k allocations: 14.795 MiB, 1.10% compilation time: 8% of which was recompilation)


ScikitLearn.Skcore.Pipeline(Tuple{Any, Any}[("scaler", PyObject StandardScaler()), ("model", PyObject MLPRegressor(alpha=0.01, batch_size=16, early_stopping=True,
             hidden_layer_sizes=(64, 64, 64, 64, 64, 64), max_iter=1500,
             n_iter_no_change=20, random_state=0, validation_fraction=0.25))], Any[PyObject StandardScaler(), PyObject MLPRegressor(alpha=0.01, batch_size=16, early_stopping=True,
             hidden_layer_sizes=(64, 64, 64, 64, 64, 64), max_iter=1500,
             n_iter_no_change=20, random_state=0, validation_fraction=0.25)])

In [13]:
# X_test_transformed = transform(pipe, x_test)
y_pred = predict(pipe, x_test)
mse = rmse(y_test, y_pred)
println("Mean Squared Error on Test Set: ", mse)

Mean Squared Error on Test Set: 49454.88049096558


In [14]:
model