In [1]:
using Pkg; Pkg.activate(dirname(pwd()))

[32m[1m  Activating[22m[39m project at `/media/yuehhua/Workbench/workspace/machine-learning.jl`


# LASSO regression

## Using Lasso

In [2]:
using Lasso, RDatasets, MLDataUtils

### Load data

In [3]:
# Load the "Boston" table from the "MASS" collection provided by RDatasets.
boston = RDatasets.dataset("MASS", "Boston")
# Show the first 6 rows to check the columns and their element types.
first(boston, 6)

Unnamed: 0_level_0,Crim,Zn,Indus,Chas,NOx,Rm,Age,Dis,Rad,Tax
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Float64,Float64,Float64,Float64,Int64,Int64
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222


### Training/testing set

In [4]:
# Shuffle the row indices of `boston`, then split them 80/20 into training and
# testing index sets. The range is passed directly; no intermediate Vector is needed.
indices = MLDataUtils.shuffleobs(1:nrow(boston))
train_ind, test_ind = MLDataUtils.splitobs(indices, at=0.8);

In [5]:
# Select the training and testing rows (all columns) using the index sets
# produced by the shuffle/split cell above.
train = boston[train_ind, :]
test = boston[test_ind, :];

### Model

In [6]:
# Fit a LassoModel that regresses MedV on the 13 remaining Boston columns,
# using only the training rows.
model = fit(
    LassoModel,
    @formula(
        MedV ~ Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax +
               PTRatio + Black + LStat
    ),
    train,
)

StatsModels.TableRegressionModel{LassoModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, MinAICc}, Matrix{Float64}}

MedV ~ Crim + Zn + Indus + Chas + NOx + Rm + Age + Dis + Rad + Tax + PTRatio + Black + LStat

Coefficients:
LassoModel using MinAICc(2) segment of the regularization path.

Coefficients:
────────────────
        Estimate
────────────────
x1    39.4967
x2    -0.120073
x3     0.0318178
x4     0.0
x5     2.17024
x6   -18.5145
x7     3.54879
x8     0.0080528
x9    -1.49308
x10    0.274844
x11   -0.0102754
x12   -1.04853
x13    0.0104902
x14   -0.540704
────────────────


### Prediction

In [7]:
# Called without a data argument; the result below has 405 elements, which matches
# the size of the training split rather than the held-out rows.
# NOTE(review): `test` from the split above is never used in this section —
# presumably predictions on `test` were intended; confirm.
predict(model)

405-element Vector{Float64}:
 32.53857849099273
 23.632595222983763
 11.599903659620871
 25.825700064649414
 35.11601723769182
 20.539997019611256
 28.07197312375719
 19.011570139768313
 11.213263980769621
 28.125833117922525
  4.348740136165993
 17.67074336228826
  8.803637009288598
  ⋮
 16.018228443528656
 14.27605492208346
 22.137998431086817
 22.092603148758755
 43.33041129962012
 23.222972972375217
 22.103324349037766
 21.34457228949575
 21.800164723228875
 20.9988348178448
 18.32003841343808
 22.93331889451278

## Using MLJ

In [8]:
using MLJ



### Casting scientific types

In [9]:
# Split the table into the target `y` (the column named :MedV) and the feature
# table `X` (every remaining column, selected by the always-true predicate).
y, X = unpack(boston, ==(:MedV), colname -> true);
# Show the first 6 feature rows.
first(X, 6)

Unnamed: 0_level_0,Crim,Zn,Indus,Chas,NOx,Rm,Age,Dis,Rad,Tax
Unnamed: 0_level_1,Float64,Float64,Float64,Int64,Float64,Float64,Float64,Float64,Int64,Int64
1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296
2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242
3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242
4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222
5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222
6,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222


In [10]:
# Print the first 6 rows together with each column's machine type and scientific type.
pretty(first(X, 6))

┌────────────┬────────────┬────────────┬───────┬────────────┬────────────┬────────────┬────────────┬───────┬───────┬────────────┬────────────┬────────────┐
│ Crim       │ Zn         │ Indus      │ Chas  │ NOx        │ Rm         │ Age        │ Dis        │ Rad   │ Tax   │ PTRatio    │ Black      │ LStat      │
│ Float64    │ Float64    │ Float64    │ Int64 │ Float64    │ Float64    │ Float64    │ Float64    │ Int64 │ Int64 │ Float64    │ Float64    │ Float64    │
│ Continuous │ Continuous │ Continuous │ Count │ Continuous │ Continuous │ Continuous │ Continuous │ Count │ Count │ Continuous │ Continuous │ Continuous │
├────────────┼────────────┼───────────

In [11]:
# Coerce the discrete (Count) columns to Continuous so that every feature shares
# the same scientific type.
X = coerce(X, autotype(X, rules=(:discrete_to_continuous,)))
# Explicit alternative, kept for reference; presumably equivalent here because
# Chas, Rad and Tax are the only Count columns in the table printed above:
# X = coerce(X, :Chas => MLJ.Continuous, :Rad => MLJ.Continuous, :Tax => MLJ.Continuous)
first(X, 6)

Unnamed: 0_level_0,Crim,Zn,Indus,Chas,NOx,Rm,Age,Dis,Rad
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0
2,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0
3,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0
4,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0
5,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0
6,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0


### Training/testing set

In [12]:
# Shuffle the indices of `y` and split them 70/30 into training and testing index sets.
# NOTE(review): this rebinds `train` and `test`, which held DataFrames in the Lasso
# section, to index vectors. No `rng` is passed, so the split is presumably not
# reproducible between sessions — confirm whether a fixed seed is wanted.
train, test = partition(eachindex(y), 0.7, shuffle=true)

([61, 434, 196, 315, 322, 10, 407, 336, 24, 408  …  229, 107, 262, 247, 157, 92, 380, 212, 137, 404], [280, 169, 159, 395, 52, 361, 45, 353, 329, 468  …  383, 301, 436, 253, 127, 55, 410, 429, 454, 46])

### Model

In [13]:
# Load the LassoRegressor model type from the MLJLinearModels package and bind it
# to a name of the same spelling for use in the cells below.
LassoRegressor = @load LassoRegressor pkg=MLJLinearModels

import MLJLinearModels ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/yuehhua/.julia/packages/MLJModels/lDzCR/src/loading.jl:168


MLJLinearModels.LassoRegressor

In [14]:
# Bind a LassoRegressor with default hyperparameters to the features `X` and the
# target `y`; the machine is only constructed here, not trained.
lassoreg = machine(LassoRegressor(), X, y)

Machine trained 0 times; caches data
  model: LassoRegressor(lambda = 1.0, …)
  args: 
    1:	Source @094 ⏎ `Table{AbstractVector{ScientificTypesBase.Continuous}}`
    2:	Source @329 ⏎ `AbstractVector{ScientificTypesBase.Continuous}`


### Training

In [15]:
# Train the machine in place, restricted to the rows listed in `train`.
fit!(lassoreg, rows=train)

┌ Info: Training machine(LassoRegressor(lambda = 1.0, …), …).
└ @ MLJBase /home/yuehhua/.julia/packages/MLJBase/rQDaq/src/machines.jl:487
┌ Info: Solver: MLJLinearModels.ProxGrad
│   accel: Bool true
│   max_iter: Int64 1000
│   tol: Float64 0.0001
│   max_inner: Int64 100
│   beta: Float64 0.8
└ @ MLJLinearModels /home/yuehhua/.julia/packages/MLJLinearModels/2qDvV/src/mlj/interface.jl:39
└ @ MLJLinearModels /home/yuehhua/.julia/packages/MLJLinearModels/2qDvV/src/fit/proxgrad.jl:64


Machine trained 1 time; caches data
  model: LassoRegressor(lambda = 1.0, …)
  args: 
    1:	Source @094 ⏎ `Table{AbstractVector{ScientificTypesBase.Continuous}}`
    2:	Source @329 ⏎ `AbstractVector{ScientificTypesBase.Continuous}`


### Prediction

In [16]:
# Predict the target for the held-out rows listed in `test`.
# `predict` is module-qualified, presumably to disambiguate it from the `predict`
# used with the Lasso model earlier in the notebook — confirm.
ŷ = MLJ.predict(lassoreg, rows=test)

152-element Vector{Float64}:
 26.671513996023723
 21.849543382812108
 26.719503131733372
 21.113372171122823
 25.453576986637902
 27.581244325383782
 21.442855372178006
 26.661752007886495
 19.079380413973173
 16.209645767340657
 22.20591871549242
 19.971842349485957
 29.221298312923555
  ⋮
 24.526264579777322
 28.64089236343882
 15.625917351324317
 31.821782410154835
 10.326721716255493
 26.959361639146003
 11.29194736033635
 26.385215681335982
 15.36622312377272
  9.285411475142029
 23.283538726186602
 19.941354688097885

### Evaluation

In [17]:
# Score the predictions against the true targets of the held-out rows using `rms`.
rms(ŷ, y[test])

6.596606691416945

### View model parameters

In [18]:
# Destructure the fitted parameters positionally into the per-feature coefficients
# and the intercept (assumes `fitted_params` yields them in that order — the
# printed `coefs` below is consistent with this).
coefs, intercept = fitted_params(lassoreg)
# Display the coefficient pairs (feature name => value).
coefs

13-element Vector{Pair{Symbol, Float64}}:
    :Crim => -0.029105646243984793
      :Zn => 0.09599850660966505
   :Indus => -0.03422582081303135
    :Chas => 0.0
     :NOx => 0.0
      :Rm => 1.5935837354983762
     :Age => 0.08982877459047135
     :Dis => -0.0
     :Rad => 0.006334338148471298
     :Tax => -0.00030736211458398004
 :PTRatio => 0.3570978625154244
   :Black => 0.023515008397306626
   :LStat => -0.7621587437933852