In [1]:
# Activate the Julia project located in the parent of the current working
# directory, so the notebook uses that project's package environment.
using Pkg; Pkg.activate(dirname(pwd()))

[32m[1m  Activating[22m[39m project at `/media/yuehhua/Workbench/workspace/machine-learning.jl`


# Linear Regression

## Using GLM

In [2]:
using GLM
using RDatasets
using MLDataUtils

### Load data

In [3]:
# Load the `mtcars` table from the "datasets" collection of RDatasets and
# preview its first six rows.
data = RDatasets.dataset("datasets", "mtcars")
first(data, 6)

Unnamed: 0_level_0,Model,MPG,Cyl,Disp,HP,DRat,WT,QSec,VS
Unnamed: 0_level_1,String31,Float64,Int64,Float64,Int64,Float64,Float64,Float64,Int64
1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0
2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0
3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1
4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1
5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0
6,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1


### Training/testing set

In [4]:
# Shuffle the row numbers of `data`, then split them 80 % / 20 % into training
# and testing index sets. The trailing `;` suppresses the cell output.
indices = MLDataUtils.shuffleobs(collect(1:nrow(data)))
train_ind, test_ind = MLDataUtils.splitobs(indices, at = 0.8);

In [5]:
# Select the training and testing rows (all columns) using the index sets
# `train_ind` and `test_ind`. `test` is the last expression, so it is the value
# the cell displays.
train = data[train_ind, :]
test = data[test_ind, :]

Unnamed: 0_level_0,Model,MPG,Cyl,Disp,HP,DRat,WT,QSec,VS
Unnamed: 0_level_1,String31,Float64,Int64,Float64,Int64,Float64,Float64,Float64,Int64
1,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1
2,Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1
3,Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0
4,Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1
5,Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0
6,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1


### Model

In [6]:
# Fit a linear model on the training rows: MPG is the response and the ten
# listed columns are the predictors (an intercept term is added implicitly).
ols = GLM.lm(@formula(MPG ~ Cyl + Disp + HP + DRat + WT + QSec + VS + AM + Gear + Carb), train)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

MPG ~ 1 + Cyl + Disp + HP + DRat + WT + QSec + VS + AM + Gear + Carb

Coefficients:
─────────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)    Lower 95%   Upper 95%
─────────────────────────────────────────────────────────────────────────────
(Intercept)  10.6222     20.7373      0.51    0.6160  -33.5783     54.8227
Cyl          -0.208672    1.13668    -0.18    0.8568   -2.63145     2.2141
Disp          0.0128722   0.0204969   0.63    0.5394   -0.030816    0.0565604
HP           -0.0209727   0.0226596  -0.93    0.3693   -0.0692703   0.027325
DRat          1.41684     1.804       0.79    0.4445   -2.42831     5.26198
WT           -3.44418     1.9818     -1.74    0.1027   -7.66829     0.779925
QSec          0.868452    0.79119     1.10    

### Prediction

In [7]:
# Predict MPG for the held-out rows in `test` using the fitted model.
predict(ols, test)

6-element Vector{Union{Missing, Float64}}:
 26.19426495836303
 27.250267839860552
 15.586357988718015
 27.671820294485585
 15.761529639900427
 25.575118409077295

### Validation

In [8]:
# Coefficient of determination (R²) of the fitted model.
# NOTE(review): `r²` receives only `ols`, which was fitted on `train`, so this
# is an in-sample figure; the held-out `test` rows are not used here.
GLM.r²(ols)

0.8687666958860596

## Using MLJ

In [9]:
using MLJ



### Casting scientific types

In [10]:
# Drop the first column (`Model`) and split the rest into the target `y` (the
# `MPG` column) and the feature table `X` (every other column; the predicate
# `colname -> true` accepts all of them). Then preview the first six rows of `X`.
y, X = unpack(data[!, 2:end], ==(:MPG), colname -> true);
first(X, 6)

Unnamed: 0_level_0,Cyl,Disp,HP,DRat,WT,QSec,VS,AM,Gear,Carb
Unnamed: 0_level_1,Int64,Float64,Int64,Float64,Float64,Float64,Int64,Int64,Int64,Int64
1,6,160.0,110,3.9,2.62,16.46,0,1,4,4
2,6,160.0,110,3.9,2.875,17.02,0,1,4,4
3,4,108.0,93,3.85,2.32,18.61,1,1,4,1
4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
5,8,360.0,175,3.15,3.44,17.02,0,0,3,2
6,6,225.0,105,2.76,3.46,20.22,1,0,3,1


In [11]:
# Show the same preview through `pretty`, which adds a header row with each
# column's scientific type (Count / Continuous).
first(X, 6) |> pretty

┌───────┬────────────┬───────┬────────────┬────────────┬────────────┬───────┬───────┬───────┬───────┐
│ Cyl   │ Disp       │ HP    │ DRat       │ WT         │ QSec       │ VS    │ AM    │ Gear  │ Carb  │
│ Int64 │ Float64    │ Int64 │ Float64    │ Float64    │ Float64    │ Int64 │ Int64 │ Int64 │ Int64 │
│ Count │ Continuous │ Count │ Continuous │ Continuous │ Continuous │ Count │ Count │ Count │ Count │
├───────┼────────────┼───────┼────────────┼────────────┼────────────┼───────┼───────┼───────┼───────┤
│ 6.0   │ 160.0      │ 110.0 │ 3.9        │ 2.62       │ 16.46      │ 0.0   │ 1.0   │ 4.0   │ 4.0   │
│ 6.0   │ 160.0      │ 110.0 │ 3.9        │ 2.875      │ 17.02      │ 0.0   │ 1.0   │ 4.0   │ 4.0   │
│ 4.0   │ 108.0      │ 93.

In [12]:
# Convert every `Count` (integer-typed) feature to `Continuous` with a single
# type-based rule instead of naming each column. In this table the `Count`
# columns are exactly Cyl, HP, VS, AM, Gear and Carb, so the result is the same
# as listing them individually, and the rule keeps working if columns change.
X = coerce(X, Count => Continuous)
first(X, 6)

Unnamed: 0_level_0,Cyl,Disp,HP,DRat,WT,QSec,VS,AM,Gear
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,6.0,160.0,110.0,3.9,2.62,16.46,0.0,1.0,4.0
2,6.0,160.0,110.0,3.9,2.875,17.02,0.0,1.0,4.0
3,4.0,108.0,93.0,3.85,2.32,18.61,1.0,1.0,4.0
4,6.0,258.0,110.0,3.08,3.215,19.44,1.0,0.0,3.0
5,8.0,360.0,175.0,3.15,3.44,17.02,0.0,0.0,3.0
6,6.0,225.0,105.0,2.76,3.46,20.22,1.0,0.0,3.0


### Training/testing set

In [13]:
# Randomly split the row indices of `y` into 70 % training and 30 % testing
# index vectors.
# NOTE(review): this rebinds `train` and `test` -- DataFrames in the GLM
# section -- to index vectors, so the earlier GLM cells that take `train`/`test`
# must not be re-run after this one without re-creating the DataFrames.
# No `rng` is passed; presumably the split differs between runs -- confirm.
train, test = partition(eachindex(y), 0.7, shuffle=true)

([5, 23, 4, 31, 16, 17, 22, 2, 13, 9  …  11, 6, 15, 24, 28, 7, 8, 30, 25, 19], [27, 3, 10, 18, 21, 14, 12, 1, 20, 32])

### Model

In [14]:
# Load the `LinearRegressor` model type backed by the GLM package via MLJ's
# `@load` macro and bind it to a name in this session.
LinearRegressor = @load LinearRegressor pkg=GLM

import MLJGLMInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /home/yuehhua/.julia/packages/MLJModels/lDzCR/src/loading.jl:168


MLJGLMInterface.LinearRegressor

In [15]:
# Wrap a default-configured `LinearRegressor` together with the features `X`
# and the target `y` in a machine; nothing is trained at this point.
linreg = machine(LinearRegressor(), X, y)

Machine trained 0 times; caches data
  model: LinearRegressor(fit_intercept = true, …)
  args: 
    1:	Source @257 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @220 ⏎ `AbstractVector{Continuous}`


### Training

In [16]:
# Train the machine using only the rows indexed by `train`.
fit!(linreg, rows=train)

┌ Info: Training machine(LinearRegressor(fit_intercept = true, …), …).
└ @ MLJBase /home/yuehhua/.julia/packages/MLJBase/rQDaq/src/machines.jl:487


Machine trained 1 time; caches data
  model: LinearRegressor(fit_intercept = true, …)
  args: 
    1:	Source @257 ⏎ `Table{AbstractVector{Continuous}}`
    2:	Source @220 ⏎ `AbstractVector{Continuous}`


### Prediction

In [17]:
# Predict for the rows indexed by `test`; `predict_mean` yields plain numeric
# point predictions (a `Vector{Float64}` here).
ŷ = predict_mean(linreg, rows=test)

10-element Vector{Float64}:
 28.62153054334838
 27.39392492338811
 17.936991120926617
 27.412901314079846
 26.327386600712686
 14.459662783350481
 13.674051839325948
 22.329880848527843
 28.376232566857635
 25.77403532271075

### Evaluation

In [18]:
# Root-mean-squared error between the predictions and the observed target
# values of the `test` rows.
rms(ŷ, y[test])

3.703218087231499