## Tune parameters for final models

## Code setup

In [1]:
# Load IPython's autoreload extension so the `%autoreload` magic is available
# in the following cells.
%load_ext autoreload

In [2]:
# Mode 2: reload all imported modules before executing each cell, so edits to
# project code are picked up without restarting the kernel.
%autoreload 2
# Reload the Kedro project; per the log output below, this defines the
# `context`, `session` and `catalog` global variables.
%reload_kedro

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

from augury.ml_estimators import BasicEstimator
from augury.sklearn.metrics import match_accuracy_scorer
from augury.sklearn.model_selection import year_cv_split
from augury.ml_data import MLData
from augury.settings import CV_YEAR_RANGE, SEED

# Number of parameter settings sampled by the randomized search (passed as
# `n_iter` to RandomizedSearchCV below).
# The value is pretty arbitrary, but a CV of the ConfidenceEstimator takes
# about 2 mins, so 50 iterations would run for a bit under 2 hrs for that model.
N_ITER = 50

# Seed numpy's global random number generator with SEED from augury.settings.
np.random.seed(SEED)

2022-03-14 09:32:04,179 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-03-14 09:32:04,283 - kedro.config.config - INFO - Config from path `/app/conf/local` will override the following existing top-level config keys: disable_existing_loggers, formatters, handlers, loggers, root, version
2022-03-14 09:32:04,345 - root - INFO - ** Kedro project augury
2022-03-14 09:32:04,352 - root - INFO - Defined global variable `context`, `session` and `catalog`


  from scipy.sparse.base import spmatrix
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
2022-03-14 09:32:04.971862: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-14 09:32:04.971984: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  from pandas import MultiIndex, Int64Index


In [3]:
# Build the data container used for tuning. `train_year_range` is a
# one-element tuple holding the largest value in CV_YEAR_RANGE — presumably an
# upper bound on the training seasons; TODO confirm against MLData.
data = MLData(train_year_range=(max(CV_YEAR_RANGE),))
# Display the underlying data frame as a quick sanity check.
data.data

2022-03-14 09:32:09,971 - kedro.io.data_catalog - INFO - Loading data from `full_data` (JSONDataSet)...


Unnamed: 0,Unnamed: 1,Unnamed: 2,team,oppo_team,round_type,venue,prev_match_oppo_team,oppo_prev_match_oppo_team,date,team_goals,team_behinds,score,...,oppo_rolling_prev_match_time_on_ground_skew,oppo_rolling_prev_match_time_on_ground_std,oppo_last_year_brownlow_votes_sum,oppo_last_year_brownlow_votes_max,oppo_last_year_brownlow_votes_min,oppo_last_year_brownlow_votes_skew,oppo_last_year_brownlow_votes_std,oppo_cum_matches_played,oppo_rolling_prev_match_goals_plus_rolling_prev_match_behinds,oppo_rolling_prev_match_goals_divided_by_rolling_prev_match_goals_plus_rolling_prev_match_behinds
Adelaide,1991,1,Adelaide,Hawthorn,Regular,Football Park,0,Melbourne,1991-03-22 03:40:00+00:00,24,11,155,...,0.0,0.0,72,15,0,1.565197,4.070433,80,1,0
Adelaide,1991,2,Adelaide,Carlton,Regular,Football Park,Hawthorn,Fitzroy,1991-03-31 03:40:00+00:00,12,9,81,...,0.0,0.0,51,16,0,2.449132,3.913203,60,1,0
Adelaide,1991,3,Adelaide,Sydney,Regular,S.C.G.,Carlton,Hawthorn,1991-04-07 03:10:00+00:00,19,18,132,...,0.0,0.0,33,7,0,1.403576,2.433862,92,1,0
Adelaide,1991,4,Adelaide,Essendon,Regular,Windy Hill,Sydney,North Melbourne,1991-04-13 03:10:00+00:00,6,11,47,...,0.0,0.0,71,13,0,1.262708,4.524495,69,1,0
Adelaide,1991,5,Adelaide,West Coast,Regular,Subiaco,Essendon,North Melbourne,1991-04-21 05:10:00+00:00,9,11,65,...,0.0,0.0,48,9,0,0.913203,3.218368,48,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Western Bulldogs,2022,19,Western Bulldogs,Melbourne,Regular,Docklands,St Kilda,Port Adelaide,2022-07-22 02:00:00+00:00,0,0,0,...,0.0,0.0,0,0,0,0.000000,0.000000,0,0,0
Western Bulldogs,2022,20,Western Bulldogs,Geelong,Regular,Kardinia Park,Melbourne,Port Adelaide,2022-07-29 02:00:00+00:00,0,0,0,...,0.0,0.0,0,0,0,0.000000,0.000000,0,0,0
Western Bulldogs,2022,21,Western Bulldogs,Fremantle,Regular,Docklands,Geelong,Melbourne,2022-08-05 02:00:00+00:00,0,0,0,...,0.0,0.0,0,0,0,0.000000,0.000000,0,0,0
Western Bulldogs,2022,22,Western Bulldogs,GWS,Regular,Docklands,Fremantle,Essendon,2022-08-12 02:00:00+00:00,0,0,0,...,0.0,0.0,0,0,0,0.000000,0.000000,0,0,0


In [4]:
# Unpack the training data into its two parts. X_train is used below to build
# the CV splits passed to the randomized search.
X_train, y_train = data.train_data

  X_train = self._X.loc[(slice(None), train_year_range, slice(None)), :]


## Tune margin estimator

In [5]:
# Estimator to tune in this section, created with its default parameters.
basic_estimator = BasicEstimator()

# List the parameter names that belong to the ridge step, to see which keys
# are available for the search space.
[name for name in basic_estimator.get_params().keys() if 'ridge__' in name]

['pipeline__ridge__alpha',
 'pipeline__ridge__copy_X',
 'pipeline__ridge__fit_intercept',
 'pipeline__ridge__max_iter',
 'pipeline__ridge__normalize',
 'pipeline__ridge__positive',
 'pipeline__ridge__random_state',
 'pipeline__ridge__solver',
 'pipeline__ridge__tol']

In [6]:
# Distributions to sample candidate parameters from. scipy's
# `uniform(loc, scale)` is uniform on [loc, loc + scale], so the threshold is
# drawn from [0.01, 0.11] and the ridge alpha from [0.0, 1.0].
# NOTE(review): if the threshold was meant to stop at 0.1, scale should be 0.09.
BASIC_PARAM_GRID = dict(
    pipeline__pipeline__correlationselector__threshold=stats.uniform(loc=0.01, scale=0.1),
    pipeline__ridge__alpha=stats.uniform(loc=0.0, scale=1.0),
)

# Randomized search that samples N_ITER candidates from the distributions
# above and scores each with `match_accuracy_scorer` on the folds produced by
# `year_cv_split`. `error_score='raise'` makes a failing fit raise immediately
# rather than being recorded as a bad score; `n_jobs=-1` uses all processors.
basic_search = RandomizedSearchCV(
    estimator=basic_estimator,
    param_distributions=BASIC_PARAM_GRID,
    n_iter=N_ITER,
    scoring=match_accuracy_scorer,
    cv=year_cv_split(X_train, CV_YEAR_RANGE),
    error_score='raise',
    random_state=SEED,
    n_jobs=-1,
    verbose=5,
)

In [7]:
# Fit on the X_train / y_train pair unpacked earlier rather than reading
# `data.train_data` a second time: the CV splits given to the search were
# built from this same X_train, and reusing it avoids re-slicing the data
# (and re-triggering the indexing warning that slicing emits).
basic_search.fit(X_train, y_train)

  X_train = self._X.loc[(slice(None), train_year_range, slice(None)), :]


Fitting 5 folds for each of 50 candidates, totalling 250 fits


2022-03-14 09:33:40.855744: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-14 09:33:40.858077: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-03-14 09:33:40.871149: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-14 09:33:40.871227: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-03-14 09:33:41.175941: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or dire

[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.047454011884736254, pipeline__ridge__alpha=0.9507143064099162;, score=0.700 total time=   9.9s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.047454011884736254, pipeline__ridge__alpha=0.9507143064099162;, score=0.685 total time=   6.8s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.0831993941811405, pipeline__ridge__alpha=0.5986584841970366;, score=0.667 total time=   6.7s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.025601864044243652, pipeline__ridge__alpha=0.15599452033620265;, score=0.638 total time=   6.2s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.015808361216819947, pipeline__ridge__alpha=0.8661761457749352;, score=0.734 total time=   6.3s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.07011150117432088, pipeline__ridge__alpha=0.7080725777960455;, score=0.662 total time=   5.9s
[CV 1/5] END pipeline__pipeline__cor

[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.09631034258755936, pipeline__ridge__alpha=0.6232981268275579;, score=0.654 total time=   6.1s[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.047454011884736254, pipeline__ridge__alpha=0.9507143064099162;, score=0.662 total time=  10.3s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.0831993941811405, pipeline__ridge__alpha=0.5986584841970366;, score=0.700 total time=   6.3s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.0831993941811405, pipeline__ridge__alpha=0.5986584841970366;, score=0.647 total time=   6.2s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.025601864044243652, pipeline__ridge__alpha=0.15599452033620265;, score=0.725 total time=   6.1s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.015808361216819947, pipeline__ridge__alpha=0.8661761457749352;, score=0.652 total time=   6.4s
[CV 1/5] END pipeline__pipeline__correl

[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.04308980248526492, pipeline__ridge__alpha=0.06355835028602363;, score=0.725 total time=   6.0s[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.047454011884736254, pipeline__ridge__alpha=0.9507143064099162;, score=0.729 total time=  13.5s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.0831993941811405, pipeline__ridge__alpha=0.5986584841970366;, score=0.652 total time=   6.2s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.025601864044243652, pipeline__ridge__alpha=0.15599452033620265;, score=0.700 total time=   5.8s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.025601864044243652, pipeline__ridge__alpha=0.15599452033620265;, score=0.685 total time=   6.4s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.015808361216819947, pipeline__ridge__alpha=0.8661761457749352;, score=0.633 total time=   6.1s
[CV 3/5] END pipeline__pipeline__co

[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.04308980248526492, pipeline__ridge__alpha=0.06355835028602363;, score=0.671 total time=   6.2s[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.047454011884736254, pipeline__ridge__alpha=0.9507143064099162;, score=0.643 total time=  14.4s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.0831993941811405, pipeline__ridge__alpha=0.5986584841970366;, score=0.710 total time=   6.3s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.025601864044243652, pipeline__ridge__alpha=0.15599452033620265;, score=0.647 total time=   5.8s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.015808361216819947, pipeline__ridge__alpha=0.8661761457749352;, score=0.705 total time=   6.2s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.015808361216819947, pipeline__ridge__alpha=0.8661761457749352;, score=0.704 total time=   6.2s
[CV 4/5] END pipeline__pipeline__cor

RandomizedSearchCV(cv=[(array([ True,  True,  True, ..., False, False, False]),
                        array([False, False, False, ..., False, False, False])),
                       (array([ True,  True,  True, ..., False, False, False]),
                        array([False, False, False, ..., False, False, False])),
                       (array([ True,  True,  True, ..., False, False, False]),
                        array([False, False, False, ..., False, False, False])),
                       (array([ True,  True,  True, ..., False, Fal...
                                                                      Ridge(alpha=0.06355835028602363))])),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'pipeline__pipeline__correlationselector__threshold': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fcc4a3e2f40>,
                                        'pipeline__ridge__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fcb6998

In [8]:
# Inspect the raw search results: a dict of arrays with one entry per sampled
# candidate (e.g. mean/std fit times).
basic_search.cv_results_

{'mean_fit_time': array([10.51297765,  6.26004639,  6.00278234,  6.16099505,  5.92352595,
         6.03583083,  5.88890529,  5.97077799,  5.9710701 ,  5.85645952,
         5.89537158,  6.00545549,  6.05515909,  5.96007419,  5.92226524,
         6.87671752,  6.38012271,  6.23187513,  5.97944994,  5.89173775,
         5.95140295,  6.2204433 ,  5.89404006,  5.91430392,  5.9788991 ,
         7.06319513,  6.15450006,  6.00561929,  6.09392581,  6.14034176,
         6.03749809,  6.19113417,  6.17461734,  6.18298707,  6.27769985,
         6.74928975,  6.23625908,  6.04388857,  6.31668139,  6.33990884,
         5.93282728,  6.24924965,  6.55846367,  6.60591245,  6.1962204 ,
         6.87475429,  6.36322846,  6.03172455,  5.99520493,  6.08148861]),
 'std_fit_time': array([2.92670982, 0.17709065, 0.22244269, 0.10067294, 0.06707153,
        0.10200915, 0.12197619, 0.18437406, 0.14342229, 0.11955188,
        0.1351988 , 0.07387074, 0.12459894, 0.17753143, 0.20085854,
        0.75386314, 0.45202312,

In [9]:
# Tabulate the search results with the best-ranked candidates first, keeping
# only the mean test score and the sampled parameter columns, ordered
# alphabetically by column name.
basic_results = (
    pd.DataFrame(basic_search.cv_results_)
    .sort_values(by='rank_test_score')
    .filter(regex='mean_test_score|param_', axis='columns')
    .sort_index(axis='columns')
)

# Show the 20 best candidates.
basic_results.head(n=20)

Unnamed: 0,mean_test_score,param_pipeline__pipeline__correlationselector__threshold,param_pipeline__ridge__alpha
41,0.694525,0.04309,0.063558
42,0.693559,0.041098,0.325183
23,0.693559,0.041171,0.520068
11,0.693559,0.039214,0.366362
8,0.692593,0.040424,0.524756
18,0.692593,0.040461,0.097672
28,0.689801,0.018849,0.195983
39,0.689694,0.045847,0.115869
32,0.685722,0.038093,0.542696
3,0.685668,0.015808,0.866176


In [10]:
# Save the ranked scores and sampled parameters as a JSON list with one record
# per candidate. The path is relative, so the file lands in the current
# working directory.
basic_results.to_json('1.0-ridge-param-scores.json', indent=2, orient='records')

In [11]:
# First row of the ranked table, i.e. the best-scoring candidate, as a dict.
basic_results.iloc[0].to_dict()

{'mean_test_score': 0.6945249597423511,
 'param_pipeline__pipeline__correlationselector__threshold': 0.04308980248526492,
 'param_pipeline__ridge__alpha': 0.06355835028602363}


[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.041098232171566225, pipeline__ridge__alpha=0.32518332202674705;, score=0.720 total time=   6.1s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.041098232171566225, pipeline__ridge__alpha=0.32518332202674705;, score=0.685 total time=   7.3s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.0829606178338064, pipeline__ridge__alpha=0.6375574713552131;, score=0.647 total time=   6.9s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.09872127425763265, pipeline__ridge__alpha=0.4722149251619493;, score=0.691 total time=   6.2s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.02195942459383017, pipeline__ridge__alpha=0.713244787222995;, score=0.667 total time=   6.3s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.08607850486168975, pipeline__ridge__alpha=0.5612771975694962;, score=0.700 total time=   7.2s
[CV 5/5] END pipeline__pipeline__corr

  import imp
  from pandas import MultiIndex, Int64Index
  import imp
  from pandas import MultiIndex, Int64Index



[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.041098232171566225, pipeline__ridge__alpha=0.32518332202674705;, score=0.671 total time=   6.1s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.0829606178338064, pipeline__ridge__alpha=0.6375574713552131;, score=0.700 total time=   6.8s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.0829606178338064, pipeline__ridge__alpha=0.6375574713552131;, score=0.667 total time=   6.7s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.09872127425763265, pipeline__ridge__alpha=0.4722149251619493;, score=0.633 total time=   6.3s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.02195942459383017, pipeline__ridge__alpha=0.713244787222995;, score=0.705 total time=   7.3s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.08607850486168975, pipeline__ridge__alpha=0.5612771975694962;, score=0.657 total time=   6.1s
[CV 1/5] END pipeline__pipeline__correla

  import imp
  from pandas import MultiIndex, Int64Index



[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.04308980248526492, pipeline__ridge__alpha=0.06355835028602363;, score=0.685 total time=   6.6s
[CV 4/5] END pipeline__pipeline__correlationselector__threshold=0.041098232171566225, pipeline__ridge__alpha=0.32518332202674705;, score=0.652 total time=   7.1s
[CV 3/5] END pipeline__pipeline__correlationselector__threshold=0.0829606178338064, pipeline__ridge__alpha=0.6375574713552131;, score=0.710 total time=   6.3s
[CV 2/5] END pipeline__pipeline__correlationselector__threshold=0.09872127425763265, pipeline__ridge__alpha=0.4722149251619493;, score=0.657 total time=   6.5s
[CV 1/5] END pipeline__pipeline__correlationselector__threshold=0.02195942459383017, pipeline__ridge__alpha=0.713244787222995;, score=0.705 total time=   6.0s
[CV 5/5] END pipeline__pipeline__correlationselector__threshold=0.02195942459383017, pipeline__ridge__alpha=0.713244787222995;, score=0.691 total time=   7.6s
[CV 4/5] END pipeline__pipeline__correl

  import imp
  from pandas import MultiIndex, Int64Index
