# Super Learner

## Preliminary experiments

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sb
from numpy.linalg import inv

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [4]:
from pmlb import fetch_data, regression_dataset_names

print(regression_dataset_names)

['1027_ESL', '1028_SWD', '1029_LEV', '1030_ERA', '1089_USCrime', '1096_FacultySalaries', '1191_BNG_pbc', '1193_BNG_lowbwt', '1196_BNG_pharynx', '1199_BNG_echoMonths', '1201_BNG_breastTumor', '1203_BNG_pwLinear', '1595_poker', '192_vineyard', '195_auto_price', '197_cpu_act', '201_pol', '207_autoPrice', '210_cloud', '215_2dplanes', '218_house_8L', '225_puma8NH', '227_cpu_small', '228_elusage', '229_pwLinear', '230_machine_cpu', '294_satellite_image', '344_mv', '4544_GeographicalOriginalofMusic', '485_analcatdata_vehicle', '503_wind', '505_tecator', '519_vinnie', '522_pm10', '523_analcatdata_neavote', '527_analcatdata_election2000', '529_pollen', '537_houses', '542_pollution', '547_no2', '556_analcatdata_apnea2', '557_analcatdata_apnea1', '560_bodyfat', '561_cpu', '562_cpu_small', '564_fried', '573_cpu_act', '574_house_16H', '579_fri_c0_250_5', '581_fri_c3_500_25', '582_fri_c1_500_25', '583_fri_c1_1000_50', '584_fri_c4_500_25', '586_fri_c3_1000_25', '588_fri_c4_1000_100', '589_fri_c2_1000

In [5]:
list_dataset = []

for dataset in regression_dataset_names:
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    if X.shape[0] >= 1000:
        list_dataset.append(dataset)

In [6]:
len(list_dataset)

46

In [7]:
X, y = fetch_data(list_dataset[3], return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [8]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 200, num = 5)]
max_depth = [int(x) for x in np.linspace(3, 30, num = 5)]
max_depth.append(None)
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth}

In [9]:
print(random_grid)

{'n_estimators': [10, 57, 105, 152, 200], 'max_depth': [3, 9, 16, 23, 30, None]}


In [11]:
#rf = RandomForestRegressor()
#rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2,
                           random_state=42, n_jobs = 2)
#rf_cv.fit(train_X, train_y)

IndentationError: unexpected indent (<ipython-input-11-fd670b308a54>, line 3)

In [13]:
def other_scores(train_X, test_X, train_y, test_y):
    rf_reg = RandomForestRegressor(n_estimators=10, max_depth=15, n_jobs=10)
    ridge_reg = RidgeCV(alphas=alphas)
    lasso_reg = LassoCV(cv=5, random_state=0)

    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    
    rf_reg.fit(train_X, train_y)
    lasso_reg.fit(train_X, train_y)
    ridge_reg.fit(train_X, train_y)

    return rf_reg.score(test_X, test_y), ridge_reg.score(test_X, test_y), lasso_reg.score(test_X, test_y)

## Conditionally interpretable super learner

In [34]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

groups = random_assignments(train_X, K=6)
groups.head()

Unnamed: 0,index,group
0,0,4
1,1,0
2,2,4
3,3,2
4,4,1


In [35]:
def fit_K_models(train_X, train_y, groups, alpha, K=6):
    models = []
    for k in range(K):
        ind = groups[groups["group"] == k].index.values
        X = train_X[ind]
        y = train_y[ind]
        ridge_reg = Ridge(alpha=alpha)
        ridge_reg.fit(X, y)
        models.append(ridge_reg)
    return models

In [36]:
def compute_K_model_loss(train_X, train_y, models):
    L = []
    for i in range(len(models)):
        loss = (models[i].predict(train_X) - train_y)**2
        L.append(loss)
    L = np.array(L)
    return L

In [37]:
def compute_weights(L, K):
    JI_K = inv(np.ones((K, K)) - np.identity(K))
    W = []
    for i in range(L.shape[1]):
        w_i = np.matmul(JI_K, L[:,i])
        W.append(w_i)
    return np.array(W)

In [38]:
def create_extended_dataset(train_X, train_y, models):
    K = len(models)
    print(K)
    N = train_X.shape[0]
    L = compute_K_model_loss(train_X, train_y, models)
    W = compute_weights(L, K)
    print(W.shape)
    X_ext = []
    y_ext = []
    w_ext = []
    for i in range(K):
        X_ext.append(train_X.copy())
        y_ext.append(i*np.ones(N))
        w_ext.append(W[:, i])
    X_ext = np.concatenate(X_ext, axis=0)
    y_ext = np.concatenate(y_ext, axis=0)
    w_ext = np.concatenate(w_ext, axis=0)
    return X_ext, y_ext, w_ext

## Neural Network oracle

In [39]:
def create_model(D_in, K, H=512):
    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        nn.BatchNorm1d(H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, K))
    return model

In [40]:
def softmax_loss(beta, f_hat, y, w):
    y_hat = np.exp(beta*f_hat)
    den = (np.exp(beta*f_hat)).sum(axis=1)
    y_hat = np.array([y_hat[i]/den[i] for i in range(len(den))])
    loss = w*((y * (1- y_hat)).sum(axis=1))
    return loss.mean()

In [41]:
beta = torch.tensor(1).float()
f_hat = torch.tensor([[1, 2, 4], [1, 2, 3]]).float()
y = torch.tensor([[0, 0, 1], [0, 0, 1]]).float()
w = torch.tensor([1, 1]).float()
#sofmax_loss(beta, f_hat, y, w)

In [42]:
f_hat = beta*f_hat
y_hat = F.softmax(f_hat, dim=1)
loss = (y*(1-y_hat)).sum(dim=1)
(w*loss).mean()

tensor(0.2455)

In [43]:
def bounded_loss(beta, y_hat, y , w):
    y_hat = beta*y_hat
    y_hat = F.softmax(y_hat, dim=1)
    loss = (y*(1-y_hat)).sum(dim=1)
    return (w*loss).mean()

In [44]:
def train_model(model, train_dl, K, learning_rate = 0.01, epochs=100):
    beta = 1
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    KK = epochs//10
    for t in range(epochs):
        total_loss = 0
        total = 0
        for x, y, w in train_dl:
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)
       
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
        if t % KK == 0: print(total_loss/total)

In [45]:
def reasign_points(train_X, model):
    x = torch.tensor(train_X).float()
    y_hat = model(x.cuda())
    _, pred = torch.max(y_hat, 1)
    data = {'index': range(len(train_X)), 'group': pred.cpu().numpy()  }
    return pd.DataFrame(data) 

In [46]:
def relabel_groups(groups):
    old2new = {x:i for i,x in enumerate(groups.group.unique())}
    groups.group = np.array([old2new[x] for x in groups.group.values])
    return groups

In [47]:
from sklearn.metrics import r2_score

In [48]:
def compute_loss(X, y, oracle, models):
    x = torch.tensor(X).float()
    y = torch.tensor(y).float()
    y_hat = oracle(x.cuda())
    _, ass = torch.max(y_hat, 1)
    preds = []
    ys = []
    for i in range(len(models)):
        xx = x[ass==i]
        yy = y[ass==i]
        if len(xx) > 0:
            pred = models[i].predict(xx.cpu().numpy())
            preds.append(pred)
            ys.append(yy.cpu().numpy())
    preds = np.hstack(preds)
    ys = np.hstack(ys)
    r2 = r2_score(ys, preds)
    res = (ys - preds)**2
    return res.mean(), r2

## Loop

In [49]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

In [50]:
class OracleDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.w[idx]

In [55]:
# difficult problems 5, 7, 22
X, y = fetch_data(list_dataset[2], return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [59]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64]
ridge_reg = RidgeCV(alphas=alphas)
ridge_reg.fit(train_X, train_y)
alpha = ridge_reg.alpha_

In [60]:
print(alpha) 
K = 6
groups = random_assignments(train_X, K)

8.0


In [61]:
f = open('out.log', 'w+')
batch_size = 100000
# number of iterations depends on the number of training points
N = train_X.shape[0]
N_iter = int(10000/np.log(N)**2)
print("Number of training points %d, number iterations %d" % (N, N_iter))

best_train_r2 = None
best_K = None
best_test_r2 = None
for i in range(5):
    print("K is ", K)
    models = fit_K_models(train_X, train_y, groups, alpha, K)
    print("models")
    X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models)
    print("extended")
    train_ds = OracleDataset(X_ext, y_ext, w_ext)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    model = create_model(train_X.shape[1], K, H=100).cuda()
    train_model(model, train_dl, K, 0.01, N_iter)
    groups = reasign_points(train_X, model)
    if len(groups.group.unique()) < K:
        K = len(groups.group.unique()) 
        groups = relabel_groups(groups)
    train_loss, train_r2 = compute_loss(train_X, train_y, model, models)
    if best_train_r2 == None:
        best_train_r2 = train_r2
        
    test_loss, test_r2 = compute_loss(test_X, test_y, model, models)
    if train_r2 >= best_train_r2:
        best_train_r2 = train_r2
        best_test_r2 = test_r2
        best_K = K
    print("loss", train_loss, test_loss)
    print("R^2", train_r2, test_r2)
    print("best test_r2", best_test_r2)
    if K == 1:
        print("K", K)
        break

r2_rf, r2_ridge, r2_lasso = other_scores(train_X, test_X, train_y, test_y)
results = "dataset %s K %d ISL_r^2 %.4f RF_r^2 %.4f Ridge_r^2 %.4f Lasso_r^2 %.4f" %(dataset, best_K, best_test_r2, r2_rf, r2_ridge, r2_lasso)
print(results)
f.write(results)
f.write('\n')
f.flush()

Number of training points 750, number iterations 228
K is  6
models
6
(750, 6)
extended
0.4250280559062958
0.4101625680923462
0.40639010071754456
0.40533706545829773
0.4050077497959137
0.4048629403114319
0.4047868549823761
0.40475910902023315
0.404745489358902
0.4047374427318573
0.4047320783138275
loss 2.4282719080885133 2.2115064093700876
R^2 0.39795599979525886 0.38664404278362496
best test_r2 0.38664404278362496
K is  6
models
6
(750, 6)
extended
0.6150661706924438
0.4326910972595215
0.42248833179473877
0.41927003860473633
0.4179413318634033
0.4173830449581146
0.41688966751098633
0.4160519242286682
0.41550707817077637
0.41513335704803467
0.4148763120174408
loss 2.485776852486509 2.328313075371263
R^2 0.3836987386369839 0.3542479963010777
best test_r2 0.38664404278362496
K is  4
models
4
(750, 4)
extended
0.7650496363639832
0.623860776424408
0.6221958994865417
0.6201805472373962
0.6188361644744873
0.6147842407226562
0.612872838973999
0.6120323538780212
0.6114459037780762
0.6113165616

In [None]:
print(dataset, best_K)

In [None]:
other_scores(train_X, test_X, train_y, test_y)

In [204]:
train_X.shape

(750000, 18)

In [205]:
train_X[1,:]

array([-1.26777153, -0.76656866,  0.43695488, -0.31608791,  0.26732423,
       -0.75775435, -1.87686498, -0.1415347 ,  0.00854943,  1.24014095,
        0.09577373,  1.22230522, -0.05583522,  0.69242445, -0.12679788,
        1.92621476,  1.45341647, -1.25307813])