### Metrics-2: Fit, Score, Save Basic ML Models
Here we fit a Lasso and RandomForest on the topleft and center point tasks, guided by work in random-invest-5.ipynb where we looked for paramater, etc.

In [32]:
import os, sys
import copy as copyroot
import pandas as pd
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from fastai2.basics import *
from fastai2.vision.all import *
%load_ext autoreload
%autoreload 2

from module.mnist_helpers import build_df, eda_fig_1, build_dls
from module.mnist_models import FeatsNet
from module.mnist_metrics import metrics_df

import torch
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Setup

In [33]:
dls_tl = build_dls(target='topleft')
dls_cr = build_dls(target='center')

In [34]:
path = untar_data(URLs.MNIST_TINY)
df = build_df(path)

In [35]:
y_names = ['scalar_pxsum', 'point_topleft_x', 'point_topleft_y',
          'point_center_x', 'point_center_y']

df2 = pd.DataFrame([])
for i, row in df.iterrows():    
    img_np = np.array(Image.open(str(path) + row['fn'])).flatten()
    df2 = pd.concat((df2, pd.Series(img_np)), axis=1)
    
df2 = df2.T
df2.reset_index(inplace=True, drop=True)

df2_y = df[y_names]
df2 = pd.concat((df2, df2_y), axis=1)

df2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,779,780,781,782,783,scalar_pxsum,point_topleft_x,point_topleft_y,point_center_x,point_center_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,35867,11,5,15,14
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,29717,9,4,14,13


In [36]:
y_cols = y_names
x_cols = [col for col in df2.columns if col not in y_cols]

X = df2[x_cols]
Y = df2[y_cols]

y_tlx, y_tly, y_crx, y_cry = [Y.iloc[:,i] for i in range(1,5)]

Xt, Xv, yt_tlx, yv_tlx = train_test_split(X,  y_tlx, random_state=0)
_,  _,  yt_tly, yv_tly = train_test_split(X,  y_tly, random_state=0)
_,  _,  yt_crx, yv_crx = train_test_split(X,  y_crx, random_state=0)
_,  _,  yt_cry, yv_cry = train_test_split(X,  y_cry, random_state=0)

### Build Metrics function & Transform Points

In [37]:
point_t = dls_cr.transform[1][1]
scale_t = dls_tl.after_item

point_t, scale_t

(TensorPoint.create: (object,object) -> create ,
 Pipeline: PointScaler -> ToTensor)

In [38]:
def my_scale(x): return scale_t(point_t(x))

In [39]:
def calc_dist(pred, actual):
    dist     = ((pred - actual)**2).sum(1)**0.5
    baseline = ((actual - actual.mean(0))**2).sum(1)**0.5
    
    dist_avg    = dist.mean().item()
    dist_r2     = 1 - (dist.sum() / baseline.sum()).item()
    sqdist_avg  = (dist**2).mean().item()
    sqdist_r2   = 1 - ((dist**2).sum() / (baseline**2).sum()).item()

    return (dist_avg, dist_r2, sqdist_avg, sqdist_r2)

In [40]:
def calc(preds, actuals):    
    data = []
    for pred, actual in zip(preds, actuals):
        pred = my_scale(pred)
        actual = my_scale(actual)
        data.append((
            r2_score(actual, pred),
            mse(actual, pred).item(),
            mae(actual, pred).item(),
            *calc_dist(pred, actual),
        ))
    return data

In [41]:
def sk_metrics_df(preds_v, preds_t, s_model, s_details):
    
    target = ('topleft', 'center')
    actuals = ([yv_tlx.tolist(), yv_tly.tolist()],
               [yv_crx.tolist(), yv_cry.tolist()])
    preds =   (preds_v[:2], 
               preds_v[2:])

    val_v = calc(preds, actuals)

    target = ('topleft', 'center')
    actuals = ([yt_tlx.tolist(), yt_tly.tolist()],
               [yt_crx.tolist(), yt_cry.tolist()])
    preds =   (preds_t[:2], 
               preds_t[2:])

    val_t = calc(preds, actuals)

    cols = ['r2', 'mse', 'mae', 'dist_avg', 'dist_r2',
           'sqdist_avg', 'sqdist_r2']
    
    df_t = pd.DataFrame(val_t, columns=cols)
    df_t['target'] = target
    df_t['split'] = 'train'

    df_v = pd.DataFrame(val_v, columns=cols)
    df_v['target'] = target
    df_v['split'] = 'valid'

    df = pd.concat((df_t, df_v))
    
    df['model']   = s_model
    df['details'] = s_details
    
    start_cols = ['model', 'details','target', 'split']

    col_order = (start_cols + [col for col in df.columns 
                                if col not in start_cols])
    df = df.loc[:,col_order]
    df.reset_index(inplace=True, drop=True)
    
    return df

### Fit Models

In [42]:
yts = (yt_tlx, yt_tly, yt_crx, yt_cry)
yvs = (yv_tlx, yv_tly, yv_crx, yv_cry)

In [43]:
preds_v, preds_t = [],[]
for yt, yv in zip(yts, yvs):
    model = Lasso(alpha=1.0)
    model.fit(Xt, yt)
    preds_t.append(model.predict(Xt))
    preds_v.append(model.predict(Xv))

In [44]:
df_lasso = sk_metrics_df(preds_v, preds_t, 'Lasso', 'alpha=1')
df_lasso

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,Lasso,alpha=1,topleft,train,0.909307,0.005253,0.046936,0.074546,0.750168,0.010506,0.91077
1,Lasso,alpha=1,center,train,0.920351,0.001006,0.023437,0.037478,0.7401,0.002012,0.920277
2,Lasso,alpha=1,topleft,valid,0.69245,0.017655,0.081952,0.130459,0.549808,0.03531,0.691744
3,Lasso,alpha=1,center,valid,0.842115,0.001982,0.033595,0.053305,0.629211,0.003964,0.842354


In [45]:
preds_v, preds_t = [],[]
for yt, yv in zip(yts, yvs):
    model = RandomForestRegressor()
    model.fit(Xt, yt)
    preds_t.append(model.predict(Xt))
    preds_v.append(model.predict(Xv))

In [46]:
df_rf = sk_metrics_df(preds_v, preds_t, 'RF', 'default params')
df_rf

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,RF,default params,topleft,train,0.972602,0.001585,0.018339,0.030422,0.898043,0.003169,0.973085
1,RF,default params,center,train,0.983154,0.000213,0.009108,0.01502,0.895842,0.000425,0.983143
2,RF,default params,topleft,valid,0.834843,0.009526,0.051479,0.084943,0.706876,0.019053,0.833668
3,RF,default params,center,valid,0.871394,0.00162,0.026063,0.041923,0.708388,0.003241,0.871118


In [48]:
dir_fn = 'assets/metrics-dfs/'

df_lasso.to_csv (dir_fn + 'metrics2-df-lasso.csv', index=False)
df_rf.to_csv    (dir_fn + 'metrics2-df-rf.csv', index=False)