### Metrics-4: Basic ML Models on FeatsNet features
RandomForest and Lasso (and, in the later sections, plain LinearRegression and Ridge) trained on additional features generated by `FeatsNet.build_feats()`.

Compare perf boost against Metrics-2.ipynb where only raw pixel values are available. Compare difference in perf against research-customnn-module-6b.ipynb to see these same features fit on a shallow NN.

Ultimately, we find that there are perfect features for the topleft task in the pts11, pts12 and pts22 series (e.g. `pts22_5` equals the x coordinate and `pts11_0` equals the y coordinate on every row).

Even with these perfect features, when we have too many features (all the pixel values), linear models are unable to find the optimal model. We see this in the first model-fit section. However, RFs are able to fit close to perfectly.

In the final section we strip out the pixel values from the features in X leaving only the pts features, then we can fit an optimal model using even vanilla linear regression. Note: in this situation, Ridge becomes stronger than Lasso.

In [42]:
import os, sys
import copy as copyroot
import pandas as pd
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from fastai2.basics import *
from fastai2.vision.all import *
%load_ext autoreload
%autoreload 2

from module.mnist_helpers import build_df, eda_fig_1, build_dls
from module.mnist_models import FeatsNet
from module.mnist_metrics import metrics_df

import torch
# Ask cuDNN for deterministic kernels and switch its autotuner off: with
# benchmark=True cuDNN may select a different algorithm from run to run,
# which works against the deterministic setting.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Setup

In [43]:
# DataLoaders for the two point-regression targets. In this notebook they are
# only used further down to borrow their point-creation / scaling transforms.
dls_tl = build_dls(target='topleft')
dls_cr = build_dls(target='center')

In [44]:
# `path` is what untar_data returns for MNIST_TINY; `df` is the table that
# build_df derives from it. Later cells read df's 'fn' column (image file
# name, appended to `path`) and its scalar/point target columns.
path = untar_data(URLs.MNIST_TINY)
df = build_df(path)

In [45]:
# df2 - pixel values: one row per image, one column per flattened pixel.
# Collect every flattened image first and build the frame once; calling
# pd.concat inside the loop re-copies all previously added data on each
# iteration.
pixel_rows = []
for fn in df['fn']:
    # The context manager closes the file as soon as the pixels are copied out.
    with Image.open(str(path) + fn) as img:
        pixel_rows.append(np.array(img).flatten())

# np.stack keeps the images' dtype and fails loudly if image sizes differ.
df2 = pd.DataFrame(np.stack(pixel_rows))

In [46]:
# df3 - pts features: one row per image, one column per FeatsNet feature.

feats = ['pts11','pts12','pts22']
featsnet = FeatsNet(feats = feats)

# Column labels, 8 + 32 + 32 = 72 in total, i.e. 2*(4 + 16 + 16).
cols =  []
cols += [f'pts11_{i}' for i in range(8)]
cols += [f'pts12_{i}' for i in range(32)]
cols += [f'pts22_{i}' for i in range(32)]

# Gather all feature rows, then build the frame once (pd.concat inside the
# loop re-copies everything added so far on every iteration).
feat_rows = []
for fn in df['fn']:
    # The context manager closes the file as soon as the pixels are copied out.
    with Image.open(str(path) + fn) as img:
        img_np = np.array(img)
    # build_feats is fed a batch of one image; the leading dimension is
    # squeezed off again before converting to a plain list.
    feat_rows.append(
        featsnet.build_feats(torch.tensor(img_np).unsqueeze(0))
                .squeeze(0).tolist()
    )

# Passing `columns` makes pandas raise if a row does not have len(cols) values.
df3 = pd.DataFrame(feat_rows, columns=cols)

In [47]:
# Target columns copied over from the image table.
y_names = [
    'scalar_pxsum',
    'point_topleft_x', 'point_topleft_y',
    'point_center_x', 'point_center_y',
]

# Widen df2: pixel columns, then pts features, then targets.
# NOTE: df2 is rebound to its own widened version, so running this cell twice
# appends the feature/target columns a second time. Rebuild df2 first.
df2_y = df.loc[:, y_names]
df2 = pd.concat([df2, df3, df2_y], axis=1)

df2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,pts22_27,pts22_28,pts22_29,pts22_30,pts22_31,scalar_pxsum,point_topleft_x,point_topleft_y,point_center_x,point_center_y
0,0,0,0,0,0,0,0,0,0,0,...,0.428571,0.214286,0.392857,0.214286,0.214286,35867,11,5,15,14
1,0,0,0,0,0,0,0,0,0,0,...,0.607143,0.642857,0.321429,0.642857,0.464286,29717,9,4,14,13


In [48]:
# Everything that is not a target column is a feature column.
y_cols = y_names
x_cols = [name for name in df2.columns if name not in y_cols]

X, Y = df2[x_cols], df2[y_cols]

# Positions 1..4 of Y are the four point coordinates (position 0 is scalar_pxsum).
y_tlx, y_tly, y_crx, y_cry = (Y.iloc[:, pos] for pos in range(1, 5))

# One call splits X and all four targets with the same shuffled rows; the
# split depends only on the number of rows and random_state.
(Xt, Xv,
 yt_tlx, yv_tlx,
 yt_tly, yv_tly,
 yt_crx, yv_crx,
 yt_cry, yv_cry) = train_test_split(X, y_tlx, y_tly, y_crx, y_cry,
                                    random_state=0)

### Helper Funcs

In [49]:
# Borrow the transforms the DataLoaders apply to point targets so sklearn
# predictions can be put through the same steps (see my_scale).
# NOTE(review): point_t is taken from dls_cr and scale_t from dls_tl;
# presumably the two DataLoaders carry equivalent transforms - confirm.
point_t = dls_cr.transform[1][1]
scale_t = dls_tl.after_item

point_t, scale_t

(TensorPoint.create: (object,object) -> create ,
 Pipeline: PointScaler -> ToTensor)

In [50]:
def my_scale(x):
    """Convert `x` with `point_t`, then pass the result through `scale_t`."""
    as_points = point_t(x)
    return scale_t(as_points)

In [51]:
def calc_dist(pred, actual):
    """Distance-based error metrics for 2-D tensors of predictions and targets.

    Each row's error is the Euclidean distance between `pred` and `actual`;
    the baseline is each row's distance from `actual` to the column-wise mean
    of `actual`.

    Returns a 4-tuple of Python floats:
        (mean distance,
         1 - sum(distance) / sum(baseline distance),
         mean squared distance,
         1 - sum(squared distance) / sum(squared baseline distance))
    """
    err      = ((pred - actual)**2).sum(1)**0.5
    spread   = ((actual - actual.mean(0))**2).sum(1)**0.5
    err_sq, spread_sq = err**2, spread**2

    return (
        err.mean().item(),
        1 - (err.sum() / spread.sum()).item(),
        err_sq.mean().item(),
        1 - (err_sq.sum() / spread_sq.sum()).item(),
    )

In [52]:
def calc(preds, actuals):
    """Score each (pred, actual) pair; return one metrics tuple per pair.

    Both members of a pair are passed through `my_scale` first. Each tuple is
    (r2, mse, mae, dist_avg, dist_r2, sqdist_avg, sqdist_r2), the last four
    coming from `calc_dist`.

    NOTE(review): the caller in this notebook passes every prediction/actual
    as a two-item [xs, ys] list; confirm that `my_scale` pairs these up as
    (x, y) points before relying on the distance metrics.
    """
    def _score(pred, actual):
        scaled_pred = my_scale(pred)
        scaled_actual = my_scale(actual)
        return (
            r2_score(scaled_actual, scaled_pred),
            mse(scaled_actual, scaled_pred).item(),
            mae(scaled_actual, scaled_pred).item(),
            *calc_dist(scaled_pred, scaled_actual),
        )

    return [_score(pred, actual) for pred, actual in zip(preds, actuals)]

In [53]:
def sk_metrics_df(preds_v, preds_t, s_model, s_details):
    """Build a metrics table for one model's predictions.

    preds_v / preds_t: sequences of four prediction arrays for the valid and
        train splits, ordered topleft-x, topleft-y, center-x, center-y.
    s_model / s_details: labels written into the 'model' and 'details' columns.

    The actual values are read from the notebook-level yt_* / yv_* series.
    Returns four rows (topleft/center for train, then for valid) with the
    label columns first, followed by the metric columns.
    """
    targets = ('topleft', 'center')
    metric_cols = ['r2', 'mse', 'mae', 'dist_avg', 'dist_r2',
                   'sqdist_avg', 'sqdist_r2']

    # (split label, predictions, matching actual series), train rows first.
    splits = (
        ('train', preds_t, (yt_tlx, yt_tly, yt_crx, yt_cry)),
        ('valid', preds_v, (yv_tlx, yv_tly, yv_crx, yv_cry)),
    )

    frames = []
    for split_name, split_preds, (tlx, tly, crx, cry) in splits:
        actuals = ([tlx.tolist(), tly.tolist()],
                   [crx.tolist(), cry.tolist()])
        preds = (split_preds[:2], split_preds[2:])

        frame = pd.DataFrame(calc(preds, actuals), columns=metric_cols)
        frame['target'] = targets
        frame['split'] = split_name
        frames.append(frame)

    out = pd.concat(frames)
    out['model'] = s_model
    out['details'] = s_details

    # Label columns first, metrics after, in their existing order.
    lead_cols = ['model', 'details', 'target', 'split']
    ordered = lead_cols + [c for c in out.columns if c not in lead_cols]

    return out.loc[:, ordered].reset_index(drop=True)

### Fit Models
These won't come out so well because there are too many features (only two of the 900ish features are necessary to fit both x and y for topleft).

See section below for reducing features to get a perfect fit.

In [13]:
# Train / valid targets in a fixed order: topleft x, topleft y, center x, center y.
yts = (yt_tlx, yt_tly, yt_crx, yt_cry)
yvs = (yv_tlx, yv_tly, yv_crx, yv_cry)

In [14]:
# One Lasso per coordinate target, fitted on every column of Xt. Predictions
# are collected in the same order as `yts`; `yv` is unpacked but not used.
preds_t, preds_v = [], []
for yt, yv in zip(yts, yvs):
    model = Lasso(alpha=1.0).fit(Xt, yt)
    preds_t += [model.predict(Xt)]
    preds_v += [model.predict(Xv)]

In [15]:
# [yvs[i][:3] for i in range(4)]
# [preds_v[i][:3] for i in range(4)]

In [16]:
# Metrics table for the Lasso predictions collected in the previous cell.
df_lasso = sk_metrics_df(preds_v, preds_t, 'Lasso Feats', 'alpha=1; feats 11,12,22')
df_lasso

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,Lasso Feats,"alpha=1; feats 11,12,22",topleft,train,0.909307,0.005253,0.046936,0.074546,0.750168,0.010506,0.91077
1,Lasso Feats,"alpha=1; feats 11,12,22",center,train,0.920351,0.001006,0.023437,0.037478,0.7401,0.002012,0.920277
2,Lasso Feats,"alpha=1; feats 11,12,22",topleft,valid,0.69245,0.017655,0.081952,0.130459,0.549808,0.03531,0.691744
3,Lasso Feats,"alpha=1; feats 11,12,22",center,valid,0.842115,0.001982,0.033595,0.053305,0.629211,0.003964,0.842354


In [17]:
# One random forest per coordinate target, fitted on every column of Xt.
# random_state is fixed so the recorded metrics can be reproduced; without
# it each run grows different trees and yields slightly different numbers.
preds_v, preds_t = [],[]
for yt, yv in zip(yts, yvs):
    model = RandomForestRegressor(random_state=0)
    model.fit(Xt, yt)
    preds_t.append(model.predict(Xt))
    preds_v.append(model.predict(Xv))

In [18]:
# Metrics table for the random-forest predictions collected in the previous cell.
df_rf = sk_metrics_df(preds_v, preds_t, 'RF Feats', 'feats 11,12,22')
df_rf

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,RF Feats,"feats 11,12,22",topleft,train,0.999867,8e-06,0.000385,0.000756,0.997466,1.6e-05,0.99986
1,RF Feats,"feats 11,12,22",center,train,0.997096,3.7e-05,0.002768,0.004838,0.966448,7.3e-05,0.997096
2,RF Feats,"feats 11,12,22",topleft,valid,0.999743,1.5e-05,0.00062,0.00124,0.995721,3.1e-05,0.999734
3,RF Feats,"feats 11,12,22",center,valid,0.977186,0.000281,0.008345,0.014313,0.900437,0.000562,0.977648


In [19]:
# Persist the RF and Lasso tables together. A dedicated name is used instead
# of `df`, which the setup cells use for the image table from build_df();
# rebinding it here would break those cells if they are re-run afterwards.
df_rflasso = pd.concat((df_rf, df_lasso))

# to_csv does not create missing directories.
os.makedirs('assets/metrics-dfs', exist_ok=True)
df_rflasso.to_csv('assets/metrics-dfs/metrics4-rflasso.csv', index=False)

### Investigate hand match
Is there a particular feature that works perfectly? Yes, many (for topleft). e.g.:

`pts22_5` matches x | `pts11_0` matches y



So why don't we find these with our models?

Maybe Vanilla Linear Regression would work well here?



In [20]:
pts_cols = [c for c in df2.columns if 'pts' in str(c)]

def restore_int(x):
    """Multiply by 28 and round to zero decimals (works on scalars and frames)."""
    return round(x*28,0)

# Apply to the whole frame in one vectorised step instead of the deprecated,
# element-by-element DataFrame.applymap followed by a per-column astype loop.
x = restore_int(df2.loc[:,pts_cols]).astype('int')

print(x.shape)
x[:2]

(709, 72)


Unnamed: 0,pts11_0,pts11_1,pts11_2,pts11_3,pts11_4,pts11_5,pts11_6,pts11_7,pts12_0,pts12_1,...,pts22_22,pts22_23,pts22_24,pts22_25,pts22_26,pts22_27,pts22_28,pts22_29,pts22_30,pts22_31
0,5,5,5,22,24,5,24,22,5,21,...,20,6,6,21,6,12,6,11,6,6
1,4,7,4,19,23,7,23,19,4,14,...,5,13,18,14,18,17,18,9,18,13


In [21]:
# Select the columns whose name contains 'topleft' (str() because the pixel
# columns have integer names).
tl_cols = [c for c in df2.columns if 'topleft' in str(c)]
y = df2.loc[:, tl_cols]
y[:2]

Unnamed: 0,point_topleft_x,point_topleft_y
0,11,5
1,9,4


In [22]:
# For every pts column, count the rows where it equals the true top-left x,
# then show the three columns with the most matches.
(x.eq(y.loc[:,'point_topleft_x'], axis=0)
  .sum()
  .to_frame(0)
  .sort_values(by=0, ascending=False)[:3])

Unnamed: 0,0
pts22_5,709
pts22_29,709
pts22_21,709


In [23]:
# For every pts column, count the rows where it equals the true top-left y,
# then show the three columns with the most matches.
(x.eq(y.loc[:,'point_topleft_y'], axis=0)
  .sum()
  .to_frame(0)
  .sort_values(by=0, ascending=False)[:3])

Unnamed: 0,0
pts11_0,709
pts12_4,709
pts11_2,709


In [24]:
# Verify: put one candidate x feature and one candidate y feature next to the
# true top-left coordinates so the match can be checked by eye.
pd.concat((x.loc[:,['pts12_13', 'pts11_0',]] , y), axis=1)

Unnamed: 0,pts12_13,pts11_0,point_topleft_x,point_topleft_y
0,11,5,11,5
1,9,4,9,4
2,12,3,12,3
3,13,4,13,4
4,5,4,5,4
...,...,...,...,...
704,17,7,17,7
705,5,7,5,7
706,9,7,9,7
707,3,8,3,8


## Try fitting the models perfectly
Below, we're using the pts features and basic ML models to achieve the perfect fit we know we're capable of. This will sometimes involve tuning the model params and restricting the features we input.

Also, we've brought in vanilla LR and Ridge regression to join Lasso and RF.

In [25]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [26]:
# Same target tuples as in the "Fit Models" section, repeated here.
# Order: topleft x, topleft y, center x, center y.
yts = (yt_tlx, yt_tly, yt_crx, yt_cry)
yvs = (yv_tlx, yv_tly, yv_crx, yv_cry)

In [27]:
# target = topleft_x
yt = yts[0]
yv = yvs[0]

##### First Try

In [28]:
# Plain least squares on every column of Xt; shows (train score, valid score).
model = LinearRegression().fit(Xt, yt)
(model.score(Xt, yt), model.score(Xv, yv))

(1.0, -1.4373947906617475)

In [29]:
# Ridge with default settings on every column of Xt; shows (train score, valid score).
model = Ridge().fit(Xt, yt)
(model.score(Xt, yt), model.score(Xv, yv))

(0.9989736105991184, -17.24535466155887)

In [30]:
# Lasso with default settings on every column of Xt; shows (train score, valid score).
model = Lasso().fit(Xt, yt)
(model.score(Xt, yt), model.score(Xv, yv))

(0.8512759976961152, 0.4455662968177406)

##### Reduce Feature Set
It works!

In [31]:
# Restrict the design matrix to two pts22 columns; 'pts22_5' is one of the
# columns found above to equal topleft x on every row.
cols2 = ['pts22_5', 'pts22_4']
Xt2, Xv2 = Xt.loc[:,cols2], Xv.loc[:,cols2]

In [32]:
# Plain least squares on the reduced feature set; shows (train score, valid score).
model = LinearRegression().fit(Xt2, yt)
(model.score(Xt2, yt), model.score(Xv2, yv))

(0.9999999999999952, 0.9999999999999949)

##### Is the problem collinearity?

In [33]:
# Widen the feature set to every pts22 column (str() because the pixel
# columns have integer names).
cols2 = [c for c in Xt.columns if 'pts22' in str(c)]
Xt2, Xv2 = Xt.loc[:,cols2], Xv.loc[:,cols2]

In [34]:
# Fit each linear model on the current Xt2 and print "train_score valid_score".
# Output order: LinearRegression, Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge.
# `model` is left bound to the Ridge instance once the loop finishes.
for model in (LinearRegression(), Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge()):
    model.fit(Xt2, yt)
    print(model.score(Xt2, yt), model.score(Xv2, yv))

0.9999999999999953 0.9999999999999949
0.0 -8.716758684901293e-05
0.8120363818888715 0.812019999853139
0.9988914267023197 0.9987893886177783


##### Push it further...

In [35]:
# Every pts column (all series), no pixel columns.
cols2 = [name for name in Xt.columns if 'pts' in str(name)]
Xt2 = Xt.loc[:,cols2]
Xv2 = Xv.loc[:,cols2]

# Fit each linear model and print "train_score valid_score".
# Output order: LinearRegression, Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge.
# `model` is left bound to the Ridge instance once the loop finishes.
for model in (LinearRegression(), Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge()):
    model.fit(Xt2, yt)
    print(model.score(Xt2, yt), model.score(Xv2, yv))

0.9999999999999953 0.9999999999999949
0.0 -8.716758684901293e-05
0.8120363818888715 0.812019999853139
0.9994463534516745 0.9993844365959731


##### Push all the way
OK, back to bad performance when we include the pixel values.

In [36]:
# Every column of Xt: pixel values as well as pts features.
cols2 = list(Xt.columns)
Xt2 = Xt.loc[:,cols2]
Xv2 = Xv.loc[:,cols2]

# Fit each linear model and print "train_score valid_score".
# Output order: LinearRegression, Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge.
# `model` is left bound to the Ridge instance once the loop finishes.
for model in (LinearRegression(), Lasso(alpha=1.0), Lasso(alpha=0.2), Ridge()):
    model.fit(Xt2, yt)
    print(model.score(Xt2, yt), model.score(Xv2, yv))

1.0 -1.4373947906617475
0.8512759976961152 0.4455662968177406
0.9428191913232364 0.1128159503875733
0.9989736105991184 -17.24535466155887


  positive)


In [38]:
# Re-prints the scores of whichever `model` the previous cell left behind;
# this cell depends on that leftover variable and on the current Xt2 / Xv2.
print(model.score(Xt2, yt), model.score(Xv2, yv))

0.9989736105991184 -17.24535466155887


### Record the Metrics
These models (except Lasso) approach a perfect fit. We call this the Feats2 series.

Unlike the examples above, we fit for both x,y points, and both tasks

In [54]:
# Feats2 design matrix: every column whose name contains 'pts' (this selects
# all pts series present in Xt), with the pixel columns dropped.
cols2 = [c for c in Xt.columns if 'pts' in str(c)]
Xt2, Xv2 = Xt.loc[:,cols2], Xv.loc[:,cols2]

# Targets in a fixed order: topleft x, topleft y, center x, center y.
yts = (yt_tlx, yt_tly, yt_crx, yt_cry)
yvs = (yv_tlx, yv_tly, yv_crx, yv_cry)

In [62]:
# Accumulates one metrics table per model; the cells below append to it, so
# re-run this cell before re-running them to avoid duplicate entries.
df_list = []

In [63]:
# Lasso (alpha=1.0) on the pts-only features, one fit per coordinate target.
# `yv` is unpacked but not used inside the loop.
preds_t, preds_v = [], []
for yt, yv in zip(yts, yvs):
    model = Lasso(alpha=1.0).fit(Xt2, yt)
    preds_t += [model.predict(Xt2)]
    preds_v += [model.predict(Xv2)]

# NOTE(review): Xt2 is built from every column containing 'pts', which also
# selects the pts11 columns; confirm the 'feats 12,22' label is intended.
df = sk_metrics_df(preds_v, preds_t,
                   'Lasso Feats2', 'alpha=1.0 | feats 12,22')
display(df)
df_list.append(df)

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,Lasso Feats2,"alpha=1.0 | feats 12,22",topleft,train,0.364474,0.037349,0.143431,0.223391,0.25133,0.074697,0.365597
1,Lasso Feats2,"alpha=1.0 | feats 12,22",center,train,0.484503,0.006504,0.067685,0.102772,0.287295,0.013009,0.484405
2,Lasso Feats2,"alpha=1.0 | feats 12,22",topleft,valid,0.385445,0.035222,0.136562,0.215193,0.257405,0.070444,0.385015
3,Lasso Feats2,"alpha=1.0 | feats 12,22",center,valid,0.474663,0.006579,0.068966,0.104684,0.271823,0.013157,0.476715


In [64]:
# Plain least squares on the pts-only features, one fit per coordinate target.
# `yv` is unpacked but not used inside the loop.
preds_t, preds_v = [], []
for yt, yv in zip(yts, yvs):
    model = LinearRegression().fit(Xt2, yt)
    preds_t += [model.predict(Xt2)]
    preds_v += [model.predict(Xv2)]

# NOTE(review): Xt2 is built from every column containing 'pts', which also
# selects the pts11 columns; confirm the 'feats 12,22' label is intended.
df = sk_metrics_df(preds_v, preds_t, 'LR Feats2', 'feats 12,22')
display(df)
df_list.append(df)

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,LR Feats2,"feats 12,22",topleft,train,1.0,6.690609e-18,1.122498e-10,2.244996e-10,1.0,1.338122e-17,1.0
1,LR Feats2,"feats 12,22",center,train,0.992927,8.911899e-05,0.00540624,0.008682944,0.939786,0.000178238,0.992936
2,LR Feats2,"feats 12,22",topleft,valid,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,LR Feats2,"feats 12,22",center,valid,0.990422,0.0001186172,0.006249477,0.01028523,0.928457,0.0002372344,0.990565


In [65]:
# Ridge (default settings) on the pts-only features, one fit per coordinate target.
# `yv` is unpacked but not used inside the loop.
preds_t, preds_v = [], []
for yt, yv in zip(yts, yvs):
    model = Ridge().fit(Xt2, yt)
    preds_t += [model.predict(Xt2)]
    preds_v += [model.predict(Xv2)]

# NOTE(review): Xt2 is built from every column containing 'pts', which also
# selects the pts11 columns; confirm the 'feats 12,22' label is intended.
df = sk_metrics_df(preds_v, preds_t, 'Ridge Feats2', 'feats 12,22')
display(df)
df_list.append(df)

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,Ridge Feats2,"feats 12,22",topleft,train,0.997573,0.000141,0.00751,0.012034,0.959669,0.000281,0.99761
1,Ridge Feats2,"feats 12,22",center,train,0.985899,0.000178,0.010044,0.01598,0.889182,0.000355,0.985916
2,Ridge Feats2,"feats 12,22",topleft,valid,0.997938,0.000119,0.007239,0.011497,0.960327,0.000238,0.997926
3,Ridge Feats2,"feats 12,22",center,valid,0.982112,0.000223,0.011125,0.017545,0.877961,0.000446,0.982263


In [66]:
# Random forest on the pts-only features, one fit per coordinate target.
# random_state is fixed so the saved metrics can be reproduced; without it
# each run grows different trees and yields slightly different numbers.
preds_v, preds_t = [],[]
for yt, yv in zip(yts, yvs):
    model = RandomForestRegressor(random_state=0)
    model.fit(Xt2, yt)
    preds_t.append(model.predict(Xt2))
    preds_v.append(model.predict(Xv2))

# NOTE(review): Xt2 is built from every column containing 'pts', which also
# selects the pts11 columns; confirm the 'feats 12,22' label is intended.
df = sk_metrics_df(preds_v, preds_t, 'RF Feats2', 'feats 12,22')
display(df)
df_list.append(df)

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,RF Feats2,"feats 12,22",topleft,train,0.999913,6e-06,0.000251,0.0005,0.998323,1.1e-05,0.999906
1,RF Feats2,"feats 12,22",center,train,0.998491,1.9e-05,0.00161,0.00297,0.979401,3.8e-05,0.998491
2,RF Feats2,"feats 12,22",topleft,valid,0.999737,1.6e-05,0.000373,0.000746,0.997424,3.2e-05,0.999724
3,RF Feats2,"feats 12,22",center,valid,0.989042,0.000136,0.004524,0.008365,0.941816,0.000272,0.989186


In [67]:
# Stack the per-model tables. The index is not reset, so each model's rows
# keep their own 0..3 labels (the CSV export later drops the index).
df = pd.concat(df_list)

In [68]:
# Display the combined Feats2 metrics table.
df

Unnamed: 0,model,details,target,split,r2,mse,mae,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,Lasso Feats2,"alpha=1.0 | feats 12,22",topleft,train,0.364474,0.03734861,0.1434309,0.2233909,0.25133,0.07469724,0.365597
1,Lasso Feats2,"alpha=1.0 | feats 12,22",center,train,0.484503,0.006504499,0.06768465,0.1027723,0.287295,0.013009,0.484405
2,Lasso Feats2,"alpha=1.0 | feats 12,22",topleft,valid,0.385445,0.03522212,0.136562,0.2151932,0.257405,0.07044423,0.385015
3,Lasso Feats2,"alpha=1.0 | feats 12,22",center,valid,0.474663,0.006578582,0.06896646,0.1046843,0.271823,0.01315717,0.476715
0,LR Feats2,"feats 12,22",topleft,train,1.0,6.690609e-18,1.122498e-10,2.244996e-10,1.0,1.338122e-17,1.0
1,LR Feats2,"feats 12,22",center,train,0.992927,8.911899e-05,0.00540624,0.008682944,0.939786,0.000178238,0.992936
2,LR Feats2,"feats 12,22",topleft,valid,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,LR Feats2,"feats 12,22",center,valid,0.990422,0.0001186172,0.006249477,0.01028523,0.928457,0.0002372344,0.990565
0,Ridge Feats2,"feats 12,22",topleft,train,0.997573,0.0001406817,0.007509705,0.01203411,0.959669,0.0002813634,0.99761
1,Ridge Feats2,"feats 12,22",center,train,0.985899,0.0001776833,0.01004444,0.01597998,0.889182,0.0003553666,0.985916


In [71]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('assets/metrics-dfs', exist_ok=True)
df.to_csv('assets/metrics-dfs/metrics4-perfectml-feats2.csv', index=False)