# Training

In [1]:
using Pkg
deps = ["DataFrames", "CSV", "ScikitLearn", "Statistics"]
Pkg.add(deps)

[32m[1m  Installing[22m[39m known registries into `C:\Users\tcivi\.julia`
[32m[1m    Updating[22m[39m registry at `C:\Users\tcivi\.julia\registries\General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m IrrationalConstants ───────── v0.2.2
[32m[1m   Installed[22m[39m ScikitLearnBase ───────────── v0.5.0
[32m[1m   Installed[22m[39m Conda ─────────────────────── v1.10.0
[32m[1m   Installed[22m[39m Crayons ───────────────────── v4.1.1
[32m[1m   Installed[22m[39m JSON ──────────────────────── v0.21.4
[32m[1m   Installed[22m[39m Preferences ───────────────── v1.4.3
[32m[1m   Installed[22m[39m IterTools ─────────────────── v1.10.0
[32m[1m   Installed[22m[39m TableTraits ───────────────── v1.0.1
[32m[1m   Installed[22m[39m Tables ────────────────────── v1.11.1
[32m[1m   Installed[22m[39m Parsers ───────────────────── v2.8.1
[32m[1m   Installed[22m[39m PyCall ────────────────────── v1.96.4
[32m[1m   In

In [3]:
using DataFrames, CSV, ScikitLearn, Statistics

In [156]:
# Read the CSV at ../../data/test.csv and materialise it as the DataFrame `test_df`.
test_df = DataFrame(CSV.File("../../data/test.csv"))

Row,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64?,Float64,Float64,Float64,String15,Float64
1,-118.36,34.06,39.0,2810.0,670.0,1109.0,624.0,3.25,<1H OCEAN,355000.0
2,-119.78,36.78,37.0,2185.0,455.0,1143.0,438.0,1.9784,INLAND,70700.0
3,-122.42,37.73,46.0,1819.0,411.0,1534.0,406.0,4.0132,NEAR BAY,229400.0
4,-122.28,37.81,52.0,340.0,97.0,200.0,87.0,1.5208,NEAR BAY,112500.0
5,-118.13,33.82,37.0,1530.0,290.0,711.0,283.0,5.1795,<1H OCEAN,225400.0
6,-118.16,34.15,17.0,821.0,163.0,229.0,164.0,7.3715,<1H OCEAN,263000.0
7,-120.44,34.91,12.0,3189.0,463.0,1200.0,442.0,5.299,<1H OCEAN,226800.0
8,-122.48,38.31,29.0,2375.0,560.0,1124.0,502.0,2.3276,<1H OCEAN,166200.0
9,-117.64,34.08,35.0,1254.0,241.0,729.0,253.0,3.495,INLAND,118000.0
10,-118.16,34.04,45.0,332.0,70.0,302.0,60.0,3.1895,<1H OCEAN,156300.0


In [197]:
"""
    remove_outliers(df::DataFrame, feat, q=0.05)

Return a new `DataFrame` containing only the rows of `df` whose value in column
`feat` lies within Tukey-style fences built from the `q` and `1 - q` quantiles.

The spread is `quantile(col, 1 - q) - quantile(col, q)`. The lower fence sits
`1.5` spreads below the lower quantile and the upper fence `1.5` spreads above
the upper quantile. Values lying exactly on a fence are kept, so a column whose
two quantiles coincide (zero spread, e.g. a constant column) keeps its rows
instead of losing all of them.

# Arguments
- `df`: table to filter; it is not modified.
- `feat`: name of the column to inspect (`String` or `Symbol`).
- `q`: lower quantile level, in `[0, 0.5]`; the upper level is `1 - q`.

# Returns
A new `DataFrame` with every column of `df` and only the retained rows.

# Throws
- `ArgumentError` if `q` is outside `[0, 0.5]` (the fences would be inverted).
"""
function remove_outliers(df::DataFrame, feat::Union{AbstractString,Symbol}, q=0.05)
    0 <= q <= 0.5 ||
        throw(ArgumentError("q must lie in [0, 0.5] so that q <= 1 - q, got $q"))

    # `!` indexing fetches the stored column without copying it; it is only read here.
    col = df[!, feat]

    lower = quantile(col, q)
    upper = quantile(col, 1 - q)
    spread = upper - lower

    lower_fence = lower - 1.5 * spread
    upper_fence = upper + 1.5 * spread

    # Inclusive bounds: a value sitting exactly on a fence is not an outlier.
    inside = (col .>= lower_fence) .& (col .<= upper_fence)
    return df[inside, :]
end

"""
    apply_feature_engineering(df::DataFrame, keep_outliers::Bool)

Derive the modelling features for a housing table and return the resulting table.

`df` itself is modified (columns are added, replaced and dropped), so pass a
copy when the original must stay intact. The steps are:

1. Fill missing `total_bedrooms` entries with the mean of the non-missing ones.
2. Map `ocean_proximity` labels to integer codes in `ocean_proximity_enc`
   (labels absent from the code table become `missing`) and drop the text column.
3. Add `rooms_per_bedroom`, `rooms_per_household`, `encoded_position`
   (longitude + latitude) and `population_per_bedrooms`.
4. Move `median_house_value` into a trailing `target` column.
5. When `keep_outliers` is `false`, filter the table with `remove_outliers`
   (level `0.05`) once per column, each pass working on the previous result.

# Returns
`df` itself when `keep_outliers` is `true`; otherwise the filtered table
produced by the successive `remove_outliers` calls.
"""
function apply_feature_engineering(df::DataFrame, keep_outliers::Bool)
    # Impute missing bedroom counts with the column mean.
    bedrooms_mean = mean(skipmissing(df.total_bedrooms))
    df.total_bedrooms .= coalesce.(df.total_bedrooms, bedrooms_mean)

    # Ordinal codes for the proximity labels; unknown labels map to `missing`.
    proximity_codes = Dict(
        "ISLAND" => 0,
        "NEAR OCEAN" => 1,
        "NEAR BAY" => 2,
        "<1H OCEAN" => 3,
        "INLAND" => 4,
    )
    df.ocean_proximity_enc = get.(Ref(proximity_codes), df.ocean_proximity, missing)
    select!(df, Not(:ocean_proximity))

    # Ratio and position features.
    rooms = df.total_rooms
    bedrooms = df.total_bedrooms
    df.rooms_per_bedroom = rooms ./ bedrooms
    df.rooms_per_household = rooms ./ df.households
    df.encoded_position = df.longitude + df.latitude
    df.population_per_bedrooms = df.population ./ bedrooms

    # Re-add the label under the name `target` so it ends up as the last column.
    df.target = df.median_house_value
    select!(df, Not(:median_house_value))

    keep_outliers && return df

    # The column list is fixed up front; every pass filters the previous result.
    trimmed = df
    for column in names(df)
        trimmed = remove_outliers(trimmed, column, 0.05)
    end
    return trimmed
end

apply_feature_engineering (generic function with 1 method)

In [166]:
# Show the (rows, columns) dimensions of the table as loaded.
size(test_df)

(4128, 10)

In [None]:
# Dimensions after feature engineering with outlier removal (keep_outliers = false).
# A copy is passed because apply_feature_engineering modifies its argument.
size(apply_feature_engineering(copy(test_df), false))

In [198]:
"""
    split_target(df::DataFrame)

Separate the `target` column of `df` from the remaining columns.

# Returns
A tuple `(features, labels)`: `features` is a new `DataFrame` with every column
except `target`, and `labels` is the `target` column of `df`.
"""
function split_target(df::DataFrame)
    features = df[:, Not(:target)]
    labels = df[!, :target]
    return (features, labels)
end

split_target (generic function with 1 method)

In [199]:
# Build the feature table and label vector from a copy of test_df, with outliers removed.
# NOTE(review): these are named *_train but derive from test_df (loaded from test.csv) —
# confirm whether the training split was intended here.
x_train, y_train = split_target(apply_feature_engineering(copy(test_df), false))

(4068×13 DataFrame
  Row │ longitude  latitude  housing_median_age  total_rooms  total_bedrooms   ⋯
      │ Float64    Float64   Float64             Float64      Float64          ⋯
──────┼─────────────────────────────────────────────────────────────────────────
    1 │   -118.36     34.06                39.0       2810.0           670.0   ⋯
    2 │   -119.78     36.78                37.0       2185.0           455.0
    3 │   -122.42     37.73                46.0       1819.0           411.0
    4 │   -122.28     37.81                52.0        340.0            97.0
    5 │   -118.13     33.82                37.0       1530.0           290.0   ⋯
    6 │   -118.16     34.15                17.0        821.0           163.0
    7 │   -120.44     34.91                12.0       3189.0           463.0
    8 │   -122.48     38.31                29.0       2375.0           56

In [200]:
# Display the engineered feature table.
x_train

Row,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_enc,rooms_per_bedroom,rooms_per_household,encoded_position,population_per_bedrooms
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Int64,Float64,Float64,Float64,Float64
1,-118.36,34.06,39.0,2810.0,670.0,1109.0,624.0,3.25,3,4.19403,4.50321,-84.3,1.65522
2,-119.78,36.78,37.0,2185.0,455.0,1143.0,438.0,1.9784,4,4.8022,4.98858,-83.0,2.51209
3,-122.42,37.73,46.0,1819.0,411.0,1534.0,406.0,4.0132,2,4.42579,4.4803,-84.69,3.73236
4,-122.28,37.81,52.0,340.0,97.0,200.0,87.0,1.5208,2,3.50515,3.90805,-84.47,2.06186
5,-118.13,33.82,37.0,1530.0,290.0,711.0,283.0,5.1795,3,5.27586,5.40636,-84.31,2.45172
6,-118.16,34.15,17.0,821.0,163.0,229.0,164.0,7.3715,3,5.03681,5.0061,-84.01,1.40491
7,-120.44,34.91,12.0,3189.0,463.0,1200.0,442.0,5.299,3,6.88769,7.21493,-85.53,2.59179
8,-122.48,38.31,29.0,2375.0,560.0,1124.0,502.0,2.3276,3,4.24107,4.73108,-84.17,2.00714
9,-117.64,34.08,35.0,1254.0,241.0,729.0,253.0,3.495,4,5.20332,4.95652,-83.56,3.0249
10,-118.16,34.04,45.0,332.0,70.0,302.0,60.0,3.1895,3,4.74286,5.53333,-84.12,4.31429


(4068, 15)