# Super Learner

## Preliminary experiments

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sb
from numpy.linalg import inv

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [3]:
from pmlb import fetch_data, regression_dataset_names

print(regression_dataset_names)

['1027_ESL', '1028_SWD', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '1191_BNG_pbc', '1193_BNG_lowbwt', '1196_BNG_pharynx', '1199_BNG_echoMonths', '1201_BNG_breastTumor', '1203_BNG_pwLinear', '1595_poker', '192_vineyard', '195_auto_price', '197_cpu_act', '201_pol', '207_autoPrice', '210_cloud', '215_2dplanes', '218_house_8L', '225_puma8NH', '227_cpu_small', '228_elusage', '229_pwLinear', '230_machine_cpu', '294_satellite_image', '344_mv', '4544_GeographicalOriginalofMusic', '485_analcatdata_vehicle', '503_wind', '505_tecator', '519_vinnie', '522_pm10', '523_analcatdata_neavote', '527_analcatdata_election2000', '529_pollen', '537_houses', '542_pollution', '547_no2', '556_analcatdata_apnea2', '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '562_cpu_small', '564_fried', '573_cpu_act', '574_house_16H', '579_fri_c0_250_5', '581_fri_c3_500_25', '582_fri_c1_500_25', '583_fri_c1_1000_50', '584_fri_c4_500_25', '586_fri_c3_1000_25', '588_fri_c4_1000_100', '589_fri_c2_1000

In [60]:
list_dataset = []

for dataset in regression_dataset_names:
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    if X.shape[0] >= 1000:
        print(dataset, X.shape)
        list_dataset.append(dataset)

1028_SWD (1000, 10)
1029_LEV (1000, 4)
1030_ERA (1000, 4)
1191_BNG_pbc (1000000, 18)
1193_BNG_lowbwt (31104, 9)
1196_BNG_pharynx (1000000, 10)
1199_BNG_echoMonths (17496, 9)
1201_BNG_breastTumor (116640, 9)
1203_BNG_pwLinear (177147, 10)
1595_poker (1025010, 10)
197_cpu_act (8192, 21)
201_pol (15000, 48)
215_2dplanes (40768, 10)
218_house_8L (22784, 8)
225_puma8NH (8192, 8)
227_cpu_small (8192, 12)
294_satellite_image (6435, 36)
344_mv (40768, 10)
4544_GeographicalOriginalofMusic (1059, 117)
503_wind (6574, 14)
529_pollen (3848, 4)
537_houses (20640, 8)
562_cpu_small (8192, 12)
564_fried (40768, 10)
573_cpu_act (8192, 21)
574_house_16H (22784, 16)
583_fri_c1_1000_50 (1000, 50)
586_fri_c3_1000_25 (1000, 25)
588_fri_c4_1000_100 (1000, 100)
589_fri_c2_1000_25 (1000, 25)
590_fri_c0_1000_50 (1000, 50)
592_fri_c4_1000_25 (1000, 25)
593_fri_c1_1000_10 (1000, 10)
595_fri_c0_1000_10 (1000, 10)
598_fri_c0_1000_25 (1000, 25)
599_fri_c2_1000_5 (1000, 5)
606_fri_c2_1000_10 (1000, 10)
607_fri_c4_100

In [5]:
len(list_dataset)

46

In [6]:
X, y = fetch_data(list_dataset[3], return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [7]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 5)]
max_depth = [int(x) for x in np.linspace(3, 30, num = 5)]
max_depth.append(None)
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth}

In [8]:
print(random_grid)

{'n_estimators': [10, 57, 105, 152, 200], 'max_depth': [3, 9, 16, 23, 30, None]}


In [9]:
#rf = RandomForestRegressor()
#rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2,
#                           random_state=42, n_jobs = 2)
#rf_cv.fit(train_X, train_y)

In [10]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
def other_scores(train_X, test_X, train_y, test_y):
    rf = RandomForestRegressor(n_estimators=10, max_depth=15, n_jobs=10)
    ridge  = RidgeCV(cv=5, alphas=alphas)
    lasso = ElasticNetCV(cv=5, random_state=0, l1_ratio=1)
    dt = DecisionTreeRegressor(min_samples_leaf=10)
    
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    
    rf.fit(train_X, train_y)
    lasso.fit(train_X, train_y)
    ridge.fit(train_X, train_y)
    dt.fit(train_X, train_y)
    scores = [x.score(test_X, test_y) for x in [rf, ridge, lasso, dt]]
    return scores

## Conditionally interpretable super learner

In [11]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

groups = random_assignments(train_X, K=6)
groups.head()

Unnamed: 0,index,group
0,0,5
1,1,0
2,2,3
3,3,3
4,4,2


In [12]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
class BaseModel:
    def __init__(self, model_type):
        self.model_type = model_type
        self.model = self.create_model()
        if model_type not in range(1,7):
            print("model_type should be in the interval [1, 6]")
    
    def create_model(self):
        method_name = 'model_' + str(self.model_type)
        method = getattr(self, method_name, lambda: "nothing")
        return method()
    
    def model_1(self):
        return RidgeCV(cv=5, alphas=alphas)
    
    def model_2(self):
        return ElasticNetCV(cv=5, random_state=0, l1_ratio=0.5)

    def model_3(self):
        return ElasticNetCV(cv=5, random_state=0, l1_ratio=1)

    def model_4(self):
        return DecisionTreeRegressor(max_depth=1)
    
    def model_5(self):
        return DecisionTreeRegressor(max_depth=3)
    
    def model_6(self):
        return DecisionTreeRegressor(max_depth=5)

In [13]:
r = BaseModel(6)
type(r.model)

sklearn.tree.tree.DecisionTreeRegressor

In [14]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
def fit_K_models(train_X, train_y, groups, model_types, K=6):
    models = []
    for k in range(K):
        ind = groups[groups["group"] == k].index.values
        X = train_X[ind]
        y = train_y[ind]
        if len(ind) > 10:
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X, y)
            models.append(base_model)
    return models

In [15]:
def compute_K_model_loss(train_X, train_y, models):
    L = []
    for i in range(len(models)):
        loss = (models[i].model.predict(train_X) - train_y)**2
        L.append(loss)
    L = np.array(L)
    return L

In [16]:
def compute_weights(L, K):
    JI_K = inv(np.ones((K, K)) - np.identity(K))
    W = []
    for i in range(L.shape[1]):
        w_i = np.matmul(JI_K, L[:,i])
        W.append(w_i)
    return np.array(W)

In [17]:
def create_extended_dataset(train_X, train_y, models):
    K = len(models)
    N = train_X.shape[0]
    L = compute_K_model_loss(train_X, train_y, models)
    W = compute_weights(L, K)
    X_ext = []
    y_ext = []
    w_ext = []
    for i in range(K):
        X_ext.append(train_X.copy())
        y_ext.append(i*np.ones(N))
        w_ext.append(W[:, i])
    X_ext = np.concatenate(X_ext, axis=0)
    y_ext = np.concatenate(y_ext, axis=0)
    w_ext = np.concatenate(w_ext, axis=0)
    return X_ext, y_ext, w_ext

## Neural Network oracle

In [18]:
def create_model(D_in, K, H=512):
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        nn.BatchNorm1d(H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, K))
    return model

In [19]:
def softmax_loss(beta, f_hat, y, w):
    y_hat = np.exp(beta*f_hat)
    den = (np.exp(beta*f_hat)).sum(axis=1)
    y_hat = np.array([y_hat[i]/den[i] for i in range(len(den))])
    loss = w*((y * (1- y_hat)).sum(axis=1))
    return loss.mean()

In [20]:
beta = torch.tensor(1).float()
f_hat = torch.tensor([[1, 2, 4], [1, 2, 3]]).float()
y = torch.tensor([[0, 0, 1], [0, 0, 1]]).float()
w = torch.tensor([1, 1]).float()
#sofmax_loss(beta, f_hat, y, w)

In [21]:
f_hat = beta*f_hat
y_hat = F.softmax(f_hat, dim=1)
loss = (y*(1-y_hat)).sum(dim=1)
(w*loss).mean()

tensor(0.2455)

In [22]:
def bounded_loss(beta, y_hat, y , w):
    y_hat = beta*y_hat
    y_hat = F.softmax(y_hat, dim=1)
    loss = (y*(1-y_hat)).sum(dim=1)
    return (w*loss).mean()

In [23]:
def train_model(model, train_dl, K, learning_rate = 0.01, epochs=100):
    beta = 1
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    KK = epochs//10
    for t in range(epochs):
        total_loss = 0
        total = 0
        for x, y, w in train_dl:
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)
       
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
        if t % KK == 0: print(total_loss/total)

In [24]:
def reasign_points(train_X, model):
    x = torch.tensor(train_X).float()
    y_hat = model(x.cuda())
    _, pred = torch.max(y_hat, 1)
    data = {'index': range(len(train_X)), 'group': pred.cpu().numpy()  }
    return pd.DataFrame(data) 

In [25]:
def relabel_groups(groups, models):
    unique_models = groups.group.unique()
    old2new = {x:i for i,x in enumerate(unique_models)}
    ratios = []
    model_types = [models[i].model_type for i in unique_models]
    groups.group = np.array([old2new[x] for x in groups.group.values])
    return groups, model_types

In [26]:
from sklearn.metrics import r2_score

In [27]:
def compute_loss(X, y, oracle, models):
    x = torch.tensor(X).float()
    y = torch.tensor(y).float()
    y_hat = oracle(x.cuda())
    _, ass = torch.max(y_hat, 1)
    preds = []
    ys = []
    for i in range(len(models)):
        xx = x[ass==i]
        yy = y[ass==i]
        if len(xx) > 0:
            pred = models[i].model.predict(xx.cpu().numpy())
            preds.append(pred)
            ys.append(yy.cpu().numpy())
    preds = np.hstack(preds)
    ys = np.hstack(ys)
    r2 = r2_score(ys, preds)
    res = (ys - preds)**2
    return res.mean(), r2

In [45]:
def compute_single_loss(X, y, model):
    pred = model.model.predict(X)
    r2 = r2_score(y, pred)
    res = (y - pred)**2
    return res.mean(), r2

## Loop

In [28]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

In [29]:
class OracleDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.w[idx]

In [39]:
# difficult problems 5, 7, 22
dataset = list_dataset[1]
print(dataset)
X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

1029_LEV


In [40]:
print(train_X.shape)

(750, 4)


In [58]:
K = 6
groups = random_assignments(train_X, K)

batch_size = 100000

In [59]:
f = open('out2.log', 'w+')
batch_size = 100000
# number of iterations depends on the number of training points
N = train_X.shape[0]
N_iter = int(10000/np.log(N)**2)
print("Number of training points %d, number iterations %d" % (N, N_iter))

best_train_r2 = None
best_K = None
best_test_r2 = None
model_types = range(1,7)
for i in range(20):
    print("iter", i)
    models = fit_K_models(train_X, train_y, groups, model_types, K)
    K = len(models)
    print("K is ", K)
    if K == 1:
        models[0].model.fit(train_X, train_y)
        train_loss, train_r2 = compute_single_loss(train_X, train_y, models[0])
        test_loss, test_r2 = compute_single_loss(test_X, test_y, models[0])
        if train_r2 >= best_train_r2:
            best_train_r2 = train_r2
            best_test_r2 = test_r2
            best_K = K
        break
    
    print("models")
    X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models)
    print("extended")
    train_ds = OracleDataset(X_ext, y_ext, w_ext)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    model = create_model(train_X.shape[1], K, H=100).cuda()
    train_model(model, train_dl, K, 0.01, N_iter)
    groups = reasign_points(train_X, model)
    if len(groups.group.unique()) < K:
        K = len(groups.group.unique()) 
        groups, model_types = relabel_groups(groups, models)
    train_loss, train_r2 = compute_loss(train_X, train_y, model, models)
    if best_train_r2 == None:
        best_train_r2 = train_r2
        
    test_loss, test_r2 = compute_loss(test_X, test_y, model, models)
    if train_r2 >= best_train_r2:
        best_train_r2 = train_r2
        best_test_r2 = test_r2
        best_K = K
    print("loss", train_loss, test_loss)
    print("R^2", train_r2, test_r2)
    print("best test_r2", best_test_r2)

        
scores = other_scores(train_X, test_X, train_y, test_y)
model_str = ["RF", "Ridge", "Lasso", "Cart"]
score_str = ["%s %.4f" % (s, score) for s,score in zip(model_str, scores)]
score_str = " ".join(score_str)
results = "dataset %s K %d ISL %.4f %s"  %(dataset, best_K, best_test_r2, score_str)
print(results)
f.write(results)
f.write('\n')
f.flush()

Number of training points 750, number iterations 228
iter 0
K is  6
models
extended
0.08806014060974121




0.06872911751270294
0.0670401081442833
0.06642203778028488
0.06605097651481628
0.06574933975934982
0.06559977680444717
0.06550410389900208
0.06543835252523422
0.06538630276918411
0.06534914672374725
loss 0.39144440109478273 0.34945769247905784
R^2 0.569525943295055 0.6169906943454344
best test_r2 0.6169906943454344
iter 1




K is  3
models
extended
0.17133112251758575
0.1394306868314743
0.1329386681318283
0.1306695193052292
0.12985892593860626
0.1294344812631607
0.12912632524967194
0.12884938716888428
0.1285523623228073
0.12838436663150787
0.12828843295574188
loss 0.38413886145061715 0.36181266627290587
R^2 0.577559894676777 0.6034495131495283
best test_r2 0.6034495131495283
iter 2
K is  3
models
extended
0.166066974401474
0.13813632726669312
0.13406679034233093
0.131466343998909
0.13088732957839966
0.1305171102285385
0.13021598756313324
0.12998031079769135
0.1298103630542755
0.12929728627204895
0.12903793156147003
loss 0.3860956175894295 0.3748673487245655
R^2 0.5754080367099725 0.5891414439069244
best test_r2 0.6034495131495283
iter 3
K is  3
models
extended
0.16999515891075134
0.13558118045330048
0.13117189705371857
0.13009963929653168
0.12944909930229187
0.12872549891471863
0.1282172054052353
0.12778575718402863
0.12737184762954712
0.12723951041698456
0.12716607749462128
loss 0.38096610758640675 0.3759

In [57]:
model_types

[3]

In [None]:
print(list_dataset[0], best_K, test_r2)

In [None]:
other_scores(train_X, test_X, train_y, test_y)

In [None]:
ratios

In [None]:
train_X.shape

In [None]:
train_X[1,:]