### Metrics-6b: Variations on fastai.Tabular
Study the change in perf for a variety of alterations to the modelling with fastai.Tabular

Variations:
 - concise X vs. all pts X vs. all pix + pts X
 - normalized x vs unnormalized
 - normalized y vs unnormalized
 - topleft vs center
 - number of fit epochs

In [1]:
import os, sys
import copy as copyroot
import pandas as pd
from IPython.display import display
from matplotlib import pyplot as plt

from fastai2.basics import *
from fastai2.tabular.all import *

from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.metrics import r2_score

%load_ext autoreload
%autoreload 2

from module.mnist_helpers import build_df, build_tabular_df, build_dls
from module.mnist_metrics import metrics_df

In [35]:
# Fetch the MNIST_TINY dataset through fastai's untar_data and keep its path.
path = untar_data(URLs.MNIST_TINY)
# Build the feature table X and the target table Y with the project helper.
# NOTE(review): the column layout of X and Y is defined in module.mnist_helpers.
X, Y = build_tabular_df(path)

In [36]:
# Build the project's 'topleft' DataLoaders only to borrow two of its transforms.
dls_tl = build_dls(target='topleft')

# NOTE(review): presumably transform[1][1] is the point-conversion step of the
# target pipeline and after_item rescales coordinates — confirm in build_dls.
point_t = dls_tl.transform[1][1]
scale_t = dls_tl.after_item
# Apply the point transform first, then the item-level scaling.
def my_scale(x): return scale_t(point_t(x))

# Rebuild Y from two transformed column pairs (positions 1-2 and 3-4), labelled
# with the original names from position 1 onwards; position 0 is not carried over.
# NOTE: this overwrites Y, so the cell is not idempotent — re-run the
# build_tabular_df cell before executing this one a second time.
Y = pd.DataFrame(
        torch.cat( ( my_scale(Y.iloc[:,1:3]), 
                     my_scale(Y.iloc[:,3:5]) ), axis=1),
        columns = Y.columns[1:])

In [38]:
# Target column pairs for the two regression tasks compared in this notebook:
# the top-left point and the center point (x and y coordinate of each).
y_tl_cols = ['point_topleft_x', 'point_topleft_y']
y_cr_cols = ['point_center_x',  'point_center_y']

In [39]:
# Three nested choices of input features, from smallest to largest.

# Hand-picked concise subset of the 'pts' features.
restrict_cols = [
    "pts22_5", "pts22_29", "pts22_21",
    "pts11_0", "pts12_4", "pts11_2",
]

# Every column of X whose name contains 'pts'.
pts_cols = [name for name in X.columns if 'pts' in str(name)]

# Every column of X.
all_cols = list(X.columns)

In [40]:
def grid(d):
    """Expand a dict of value lists into every parameter combination.

    Args:
        d: mapping of parameter name -> iterable of candidate values.

    Returns:
        A list of independent dicts, one per combination (the Cartesian
        product of all value lists). The first parameter in `d` varies
        fastest, the last one slowest. An empty mapping yields `[{}]`; a
        parameter with no candidate values yields `[]`, since no complete
        combination exists.
    """
    combos = [{}]
    for param_name, values in d.items():
        # Extend every partial combination with every value of this parameter.
        # `value` is the outer loop so earlier parameters keep cycling fastest.
        combos = [
            {**combo, param_name: value}
            for value in values
            for combo in combos
        ]
    return combos

In [47]:
# Candidate values for every knob of the experiment; grid() expands them into
# one run per combination.
#   target_ind : index into [topleft, center] target columns
#   x_sz_enum  : index into [restrict, pts, all] feature sets
#   x_norm     : whether the Normalize proc is applied to the inputs
#   y_norm     : whether the targets are standardised before training
#   epochs     : number of epochs passed to learn.fit
d_params = {'target_ind':[0,1],
            'x_sz_enum': [0,1,2],
            'x_norm':    [True, False],
            'y_norm':    [True, False],
            'epochs':    [10,40],
           }

# quick debug mode
# (same grid with a single short epoch setting; uncomment to smoke-test the loop)
# d_params = {'target_ind':[0,1],
#             'x_sz_enum': [0,1,2],
#             'x_norm':    [True, False],
#             'y_norm':    [True, False],
#             'epochs':    [3],
#            }

In [48]:
# Expand the candidate lists into one dict per run and report the number of runs.
params = grid(d_params)
print(len(params))
# params[:3]

48


In [None]:
# Train one tabular learner per parameter combination and collect the metrics
# table returned for each run in `ledger`.
t0 = time.time()
ledger = []

for param in grid(d_params):

    epochs = param['epochs']

    # Decode the integer codes into the target and feature column lists.
    y_cols = (y_tl_cols, y_cr_cols)[param['target_ind']]
    x_cols = (restrict_cols, pts_cols, all_cols)[param['x_sz_enum']]

    # One frame holding the chosen features followed by the chosen targets.
    data = pd.concat((X[x_cols], Y[y_cols]), axis=1)

    # x_norm switches the Normalize proc on for the inputs.
    procs = [Normalize] if param['x_norm'] else []

    # y_norm standardises the targets up front; the fitted scaler (or None when
    # the targets are left untouched) is handed to metrics_df further down.
    ss = None
    if param['y_norm']:
        ss = StandardScaler().fit(data[y_cols])
        data[y_cols] = ss.transform(data[y_cols])

    dls = TabularDataLoaders.from_df(
        data,
        path='.',
        y_names=y_cols,
        procs=procs,
    )
    learn = tabular_learner(dls)

    # Fit inside the learner's no_logging context.
    with learn.no_logging():
        learn.fit(epochs)

    # Human-readable description of this run; parsed back into columns later,
    # so the 'key=value ' layout and ordering must stay as they are.
    s_details = (
        f"x={['restrict', 'pts', 'all'][param['x_sz_enum']]} "
        f"y_norm={param['y_norm']} "
        f"x_norm={param['x_norm']} "
        f"epochs={param['epochs']} "
    )
    s_target = ("topleft", "center")[param["target_ind"]]

    metrics = metrics_df(
        learn,
        s_model="fastTab1.2",
        s_details=s_details,
        s_target=s_target,
        y_scaler=ss,
    )
    ledger.append(metrics)

print(f"{round(time.time() - t0, 0)} secs")

#### Join Tables

In [53]:
# Stack the per-run metric tables into one frame. Each table carries its own
# 0/1 row labels, so without ignore_index every row in the result would be
# labelled 0 or 1; a fresh RangeIndex keeps rows uniquely addressable.
df = pd.concat(ledger, ignore_index=True)
print(df.shape)
df.head(2)

(96, 11)


Unnamed: 0,model,details,target,split,mse,mae,r2,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,fastTab1.2,x=restrict y_norm=True x_norm=True epochs=10,topleft,valid,0.000275,0.010506,0.995243,0.018657,0.920853,0.000549,0.992777
1,fastTab1.2,x=restrict y_norm=True x_norm=True epochs=10,topleft,train,0.000276,0.010818,0.994929,0.019155,0.916686,0.000552,0.992422


#### Create New Cols from details

In [62]:
# Split the free-text 'details' string back into one column per setting.
# The i-th space-separated token of 'details' is 'key=value'; feats[i] is the
# column name that receives its value (kept as a string).
feats = ['x_cols','y_norm', 'x_norm', 'epochs']

def _detail_value(details, position):
    """Return the value part of the `position`-th 'key=value' token."""
    token = details.split(' ')[position]
    return token.split('=')[1]

for i, name in enumerate(feats):
    df[name] = df['details'].apply(_detail_value, position=i)

In [90]:
# Column order: the three identifying columns, then the parsed settings, then
# everything else (split + metrics).
# The settings are named through `feats` rather than taken as "the last four
# columns", so re-running this cell after the reorder leaves the order intact
# instead of moving metric columns forward.
cols = list(df.columns[:3]) + list(feats)
cols += [e for e in df.columns if e not in cols]
df = df[cols]

#### Pivot on `x_cols`
The `all` feature set still gives problematic fits (see the MSE values below), so these runs are discarded from the rest of the analysis.

In [77]:
# Validation rows of the topleft task only.
tmp = df.loc[df['target'].eq('topleft') & df['split'].eq('valid')]

In [81]:
# Min / mean / max validation MSE per feature set.
# Select 'mse' before aggregating: aggregating every column first also tries to
# take the mean of the string columns, which raises TypeError on pandas >= 2.0.
tmp.groupby('x_cols')['mse'].agg(['min', 'mean', 'max']).round(4)

Unnamed: 0_level_0,min,mean,max
x_cols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
all,1951.0884,1.243754e+20,7.619093e+20
pts,0.0003,0.0016,0.0034
restrict,0.0001,0.0009,0.0022


In [82]:
# Validation rows of the topleft task, leaving out the runs on the 'all' feature set.
tmp = df.loc[
    df['target'].eq('topleft')
    & df['split'].eq('valid')
    & df['x_cols'].ne('all')
]

In [84]:
# One min / mean / max MSE table per experiment setting.
# 'mse' is selected before aggregating so the string columns are never
# aggregated (that raises TypeError on pandas >= 2.0).
for feat in feats:
    display(tmp.groupby(feat)['mse'].agg(['min', 'mean', 'max']).round(4))

Unnamed: 0_level_0,min,mean,max
x_cols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pts,0.0003,0.0016,0.0034
restrict,0.0001,0.0009,0.0022


Unnamed: 0_level_0,min,mean,max
y_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.0007,0.0022,0.0034
True,0.0001,0.0003,0.0005


Unnamed: 0_level_0,min,mean,max
x_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.0001,0.0011,0.0026
True,0.0002,0.0013,0.0034


Unnamed: 0_level_0,min,mean,max
epochs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,0.0001,0.0011,0.0029
40,0.0002,0.0013,0.0034


Surprising: epochs=10 outperforms epochs=40, and x_norm hurts us.

Turning on y_norm has the biggest effect.

In [85]:
# One min / mean / max R^2 table per experiment setting.
# 'r2' is selected before aggregating so the string columns are never
# aggregated (that raises TypeError on pandas >= 2.0).
for feat in feats:
    display(tmp.groupby(feat)['r2'].agg(['min', 'mean', 'max']).round(4))

Unnamed: 0_level_0,min,mean,max
x_cols,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pts,0.8149,0.9207,0.9932
restrict,0.821,0.9471,0.9965


Unnamed: 0_level_0,min,mean,max
y_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.8149,0.8745,0.9428
True,0.9889,0.9932,0.9965


Unnamed: 0_level_0,min,mean,max
x_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.821,0.9319,0.9945
True,0.8149,0.9358,0.9965


Unnamed: 0_level_0,min,mean,max
epochs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,0.8149,0.939,0.9952
40,0.821,0.9287,0.9965


We also see that going from x_cols restrict -> pts has a negative effect, but not much.

In [97]:
# Validation rows of both tasks, leaving out the runs on the 'all' feature set.
tmp = df[ (  (df['split']  == 'valid')
           & (df['x_cols'] != 'all')    )]

# Min / mean / max R^2 per (task, setting value).
# 'r2' is selected before aggregating so the string columns are never
# aggregated (that raises TypeError on pandas >= 2.0).
for feat in feats:
    display(tmp.groupby(['target', feat])['r2'].agg(['min', 'mean', 'max']).round(4))

Unnamed: 0_level_0,Unnamed: 1_level_0,min,mean,max
target,x_cols,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
center,pts,0.6683,0.8858,0.9724
center,restrict,0.2343,0.4347,0.5509
topleft,pts,0.8149,0.9207,0.9932
topleft,restrict,0.821,0.9471,0.9965


Unnamed: 0_level_0,Unnamed: 1_level_0,min,mean,max
target,y_norm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
center,False,0.2343,0.588,0.8663
center,True,0.4525,0.7325,0.9724
topleft,False,0.8149,0.8745,0.9428
topleft,True,0.9889,0.9932,0.9965


Unnamed: 0_level_0,Unnamed: 1_level_0,min,mean,max
target,x_norm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
center,False,0.2343,0.6623,0.9724
center,True,0.3234,0.6582,0.9718
topleft,False,0.821,0.9319,0.9945
topleft,True,0.8149,0.9358,0.9965


Unnamed: 0_level_0,Unnamed: 1_level_0,min,mean,max
target,epochs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
center,10,0.4525,0.6637,0.9724
center,40,0.2343,0.6568,0.9718
topleft,10,0.8149,0.939,0.9952
topleft,40,0.821,0.9287,0.9965


For the center task, it's actually better to have all of the pts features in x_cols (which makes sense because there's no perfect predictor feature). So feature availability is at a premium on the center task, followed by y_norm'ing. In this case, x_norm'ing does sometimes help, especially with bad fits.

#### Display Topleft + Valid

In [92]:
# Topleft validation runs, best (lowest MSE) first.
tmp = df.loc[df['target'].eq('topleft') & df['split'].eq('valid')]
tmp.sort_values('mse').round(4)

Unnamed: 0,model,details,target,x_cols,y_norm,x_norm,epochs,split,mse,mae,r2,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,fastTab1.2,x=restrict y_norm=True x_norm=False epochs=10,topleft,restrict,True,False,10,valid,0.0001,0.0077,0.9945,0.0123,0.946,0.0002,0.9974
0,fastTab1.2,x=restrict y_norm=True x_norm=True epochs=40,topleft,restrict,True,True,40,valid,0.0002,0.0085,0.9965,0.0143,0.9404,0.0003,0.9959
0,fastTab1.2,x=restrict y_norm=True x_norm=True epochs=10,topleft,restrict,True,True,10,valid,0.0003,0.0105,0.9952,0.0187,0.9209,0.0005,0.9928
0,fastTab1.2,x=pts y_norm=True x_norm=True epochs=10,topleft,pts,True,True,10,valid,0.0003,0.011,0.9932,0.0187,0.9217,0.0006,0.9927
0,fastTab1.2,x=pts y_norm=True x_norm=False epochs=40,topleft,pts,True,False,40,valid,0.0003,0.0124,0.9927,0.0204,0.9203,0.0006,0.9937
0,fastTab1.2,x=pts y_norm=True x_norm=True epochs=40,topleft,pts,True,True,40,valid,0.0003,0.0118,0.9921,0.0197,0.9175,0.0006,0.9924
0,fastTab1.2,x=restrict y_norm=True x_norm=False epochs=40,topleft,restrict,True,False,40,valid,0.0004,0.0123,0.9929,0.0215,0.9067,0.0007,0.9901
0,fastTab1.2,x=pts y_norm=True x_norm=False epochs=10,topleft,pts,True,False,10,valid,0.0005,0.0146,0.9889,0.0243,0.9081,0.0009,0.9903
0,fastTab1.2,x=restrict y_norm=False x_norm=False epochs=10,topleft,restrict,False,False,10,valid,0.0007,0.0199,0.9428,0.0313,0.8634,0.0013,0.9817
0,fastTab1.2,x=restrict y_norm=False x_norm=True epochs=40,topleft,restrict,False,True,40,valid,0.0014,0.0287,0.8963,0.0441,0.8142,0.0029,0.9622


#### Display Center + Valid

In [95]:
# Center validation runs, best (lowest MSE) first.
tmp = df.loc[df['target'].eq('center') & df['split'].eq('valid')]
tmp.sort_values('mse').round(4)

Unnamed: 0,model,details,target,x_cols,y_norm,x_norm,epochs,split,mse,mae,r2,dist_avg,dist_r2,sqdist_avg,sqdist_r2
0,fastTab1.2,x=pts y_norm=True x_norm=True epochs=40,center,pts,True,True,40,valid,0.0001,0.0087,0.9718,0.0144,0.8628,0.0003,0.9797
0,fastTab1.2,x=pts y_norm=True x_norm=True epochs=10,center,pts,True,True,10,valid,0.0001,0.0089,0.9669,0.0147,0.8519,0.0003,0.9753
0,fastTab1.2,x=pts y_norm=True x_norm=False epochs=10,center,pts,True,False,10,valid,0.0002,0.0089,0.9724,0.0144,0.8683,0.0003,0.9781
0,fastTab1.2,x=pts y_norm=True x_norm=False epochs=40,center,pts,True,False,40,valid,0.0002,0.0084,0.9691,0.0137,0.8704,0.0003,0.9771
0,fastTab1.2,x=pts y_norm=False x_norm=True epochs=40,center,pts,False,True,40,valid,0.0008,0.0229,0.8663,0.0356,0.6779,0.0017,0.8859
0,fastTab1.2,x=pts y_norm=False x_norm=False epochs=40,center,pts,False,False,40,valid,0.0009,0.0229,0.8582,0.0359,0.6653,0.0017,0.8737
0,fastTab1.2,x=pts y_norm=False x_norm=False epochs=10,center,pts,False,False,10,valid,0.0013,0.0271,0.8138,0.0431,0.6084,0.0026,0.8256
0,fastTab1.2,x=restrict y_norm=True x_norm=False epochs=10,center,restrict,True,False,10,valid,0.002,0.0294,0.4954,0.0515,0.4741,0.004,0.6541
0,fastTab1.2,x=pts y_norm=False x_norm=True epochs=10,center,pts,False,True,10,valid,0.0021,0.0331,0.6683,0.0531,0.5111,0.0042,0.7043
0,fastTab1.2,x=restrict y_norm=True x_norm=True epochs=40,center,restrict,True,True,40,valid,0.0021,0.0322,0.5509,0.0536,0.4772,0.0043,0.6756


#### Save

In [94]:
# Persist the full results table. The output folder is created first so that a
# missing 'assets' directory cannot fail the save after the long training run.
os.makedirs('assets', exist_ok=True)
df.to_csv('assets/metrics6b-fasttab-grid.csv', index=False)

#### Plots
Maybe with log scaling?