## Selecting Classification Datasets for Super Learner

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sb
from numpy.linalg import inv

In [4]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [5]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [6]:
from pmlb import fetch_data, classification_dataset_names

print(classification_dataset_names)

['GAMETES_Epistasis_2_Way_1000atts_0.4H_EDM_1_EDM_1_1', 'GAMETES_Epistasis_2_Way_20atts_0.1H_EDM_1_1', 'GAMETES_Epistasis_2_Way_20atts_0.4H_EDM_1_1', 'GAMETES_Epistasis_3_Way_20atts_0.2H_EDM_1_1', 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM_2_001', 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM_2_001', 'Hill_Valley_with_noise', 'Hill_Valley_without_noise', 'adult', 'agaricus_lepiota', 'allbp', 'allhyper', 'allhypo', 'allrep', 'analcatdata_aids', 'analcatdata_asbestos', 'analcatdata_authorship', 'analcatdata_bankruptcy', 'analcatdata_boxing1', 'analcatdata_boxing2', 'analcatdata_creditscore', 'analcatdata_cyyoung8092', 'analcatdata_cyyoung9302', 'analcatdata_dmft', 'analcatdata_fraud', 'analcatdata_germangss', 'analcatdata_happiness', 'analcatdata_japansolvent', 'analcatdata_lawsuit', 'ann_thyroid', 'appendicitis', 'australian', 'auto', 'backache', 'balance_scale', 'biomed', 'breast', 'breast_cancer', 'breast_cancer_wisconsin', 'breast_w', 'buggyCrx', 'bupa', 'calendarD

In [7]:
list_dataset = []

for dataset in classification_dataset_names:
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    if X.shape[0] >= 5000:
        list_dataset.append(dataset)

In [8]:
len(list_dataset)

28

In [9]:
def get_class_data(dataset):
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    y_min = np.unique(y).min()
    if y_min == 1:
        y -= 1
    return X, y

In [10]:
i = 2
X, y = get_class_data(list_dataset[i])
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [11]:
train_X.shape

(5400, 21)

In [12]:
N = train_X.shape[1]
max_features = [int(x*N+1) for x in np.linspace(0.01, 2, num = 20)]
np.unique(max_features)

array([ 1,  3,  5,  7, 10, 12, 14, 16, 18, 21, 23, 25, 27, 29, 32, 34, 36,
       38, 40, 43])

In [13]:
N = train_X.shape[1]

max_depth = np.unique([int(x*N + 1) for x in np.linspace(0.01, 2, num = 20)])

grid = {'max_depth': max_depth}
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', n_jobs = 10)


rf_cv = GridSearchCV(estimator = rf, param_grid = grid, cv = 5, verbose=2,
                     n_jobs = 2)
rf_cv.fit(train_X, train_y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  3.7min finished


GridSearchCV(cv=5,
             estimator=RandomForestClassifier(max_features='sqrt',
                                              n_estimators=1000, n_jobs=10),
             n_jobs=2,
             param_grid={'max_depth': array([ 1,  3,  5,  7, 10, 12, 14, 16, 18, 21, 23, 25, 27, 29, 32, 34, 36,
       38, 40, 43])},
             verbose=2)

In [14]:
rf_cv.best_estimator_

RandomForestClassifier(max_depth=23, max_features='sqrt', n_estimators=1000,
                       n_jobs=10)

In [15]:
rf_cv.score(test_X, test_y)

0.9983333333333333

In [16]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
def other_scores(train_X, test_X, train_y, test_y):
    
    N = train_X.shape[1]
    max_features = np.unique([int(x*N + 1) for x in np.linspace(0.01, 0.99, num = 5)])
    grid = {'max_features': max_features}
    rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', n_jobs = 10)
    rf_cv = GridSearchCV(estimator = rf, param_grid = grid, cv = 5, verbose=2,
                         n_jobs = 2)
    
    lasso  = LogisticRegressionCV(cv=5, penalty='l1',solver = 'saga', random_state=0)
    ridge  = LogisticRegressionCV(cv=5, penalty='l2',solver = 'saga', random_state=0)
    dt = DecisionTreeClassifier(max_depth=5)
    
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    
    rf_cv.fit(train_X, train_y)
    lasso.fit(train_X, train_y)
    ridge.fit(train_X, train_y)
    dt.fit(train_X, train_y)
    scores = [x.score(test_X, test_y) for x in [rf_cv, ridge, lasso, dt]]
    return scores

In [17]:
other_scores(train_X, test_X, train_y, test_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:   55.5s finished


[0.9983333333333333,
 0.9488888888888889,
 0.9488888888888889,
 0.9966666666666667]

In [99]:
dt = DecisionTreeRegressor(min_samples_leaf=10)
dt_cv = GridSearchCV(estimator = dt, param_grid = grid, cv = 5, verbose=2,
                    n_jobs = 20)
dt_cv.fit(train_X, train_y)
dt_cv.score(test_X, test_y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  12 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  25 out of  25 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  25 out of  25 | elapsed:    0.5s finished


0.27999061890109034

In [97]:
dt_cv.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=9, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## Conditionally interpretable super learner

In [18]:
np.random.seed(23)

def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

In [19]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
class BaseModel:
    def __init__(self, model_type):
        self.model_type = model_type
        self.model = self.create_model()
        if model_type not in range(1,7):
            print("model_type should be in the interval [1, 6]")

    def create_model(self):
        method_name = 'model_' + str(self.model_type)
        method = getattr(self, method_name, lambda: "nothing")
        return method()

    # L1 penalty
    def model_1(self):
        return LogisticRegressionCV(cv=5, penalty='l1',solver = 'saga', random_state=0)

    # l2 penalty
    def model_2(self):
        return LogisticRegressionCV(cv=5, penalty='l2', solver = 'saga', random_state=0)

    # elastic net
    def model_3(self):
        return LogisticRegressionCV(cv=5, penalty='elasticnet', solver = 'saga', l1_ratios=[.5], random_state=0)

    def model_4(self):
        return DecisionTreeClassifier(max_depth=3)

    def model_5(self):
        return DecisionTreeClassifier(max_depth=4)

    def model_6(self):
        return DecisionTreeClassifier(max_depth=5)

In [20]:
r = BaseModel(6)
type(r.model)

sklearn.tree._classes.DecisionTreeClassifier

In [21]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
def fit_K_models(train_X, train_y, groups, model_types, K=6):
    models = []
    for k in range(K):
        ind = groups[groups["group"] == k].index.values
        X = train_X[ind]
        y = train_y[ind]
        if len(ind) > 10:
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X, y)
            models.append(base_model)
    return models

In [22]:
# This version of fit K models use weights
def fit_K_models_weights(train_X, train_y, groups, ind_base, model_types, K=6):
    models = []
    # this is a hack for now
    for k in range(K):
        ind = groups[groups["group"] == k].index.values
        ind = np.array(ind_base + list(ind))
        X = train_X[ind]
        y = train_y[ind]
        if len(ind) > 10:
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X, y)
            models.append(base_model)
    return models

In [23]:
classes = np.unique(train_y)
hack = {c: (np.where(train_y == c))[0] for c in classes}
ind_base = [hack[c][0] for c in classes]

In [24]:
groups = random_assignments(train_X, K=6)
models = fit_K_models_weights(train_X, train_y, groups, ind_base, [1,2,3,4,5,6], K=6)

In [25]:
# L is an array
def compute_K_model_loss(X, y, models):
    L = []
    for i in range(len(models)):
        y_hat = models[i].model.predict_proba(X)
        print(i, y_hat.shape)
        W = np.eye(y_hat.shape[1])[y] # to avoid the need for num_classes
        loss = (-np.log(y_hat + 1e-8)*W).sum(1)
        L.append(loss)
    L = np.array(L)
    return L

In [26]:
compute_K_model_loss(train_X, train_y, models).shape

0 (5400, 3)
1 (5400, 3)
2 (5400, 3)
3 (5400, 3)
4 (5400, 3)
5 (5400, 3)


(6, 5400)

In [27]:
i = 0
y_hat = models[i].model.predict_proba(train_X)

In [28]:
(np.argmax(y_hat, axis=1) == train_y).sum()/train_y.shape[0]

0.9596296296296296

In [29]:
def compute_weights(L, K):
    JI_K = inv(np.ones((K, K)) - np.identity(K))
    W = []
    for i in range(L.shape[1]):
        w_i = np.matmul(JI_K, L[:,i])
        W.append(w_i)
    return np.array(W)

In [30]:
def create_extended_dataset(train_X, train_y, models):
    K = len(models)
    N = train_X.shape[0]
    L = compute_K_model_loss(train_X, train_y, models)
    W = compute_weights(L, K)
    X_ext = []
    y_ext = []
    w_ext = []
    for i in range(K):
        X_ext.append(train_X.copy())
        y_ext.append(i*np.ones(N))
        w_ext.append(W[:, i])
    X_ext = np.concatenate(X_ext, axis=0)
    y_ext = np.concatenate(y_ext, axis=0)
    w_ext = np.concatenate(w_ext, axis=0)
    return X_ext, y_ext, w_ext

## Neural Network oracle

In [31]:
def create_oracle_model(D_in, K, N):
    """ Returns an oracle model
    
    The size of the hidden layer is a function of the
    amount of training data
    """
    H = int(2*np.log(N)**2)
    model = nn.Sequential(
        nn.Linear(D_in, H),
        nn.BatchNorm1d(H),
        nn.ReLU(),
        torch.nn.Linear(H, K))
    return model
#nn.Dropout(p=0.2),

In [32]:
def softmax_loss(beta, f_hat, y, w):
    y_hat = np.exp(beta*f_hat)
    den = (np.exp(beta*f_hat)).sum(axis=1)
    y_hat = np.array([y_hat[i]/den[i] for i in range(len(den))])
    loss = w*((y * (1- y_hat)).sum(axis=1))
    return loss.mean()

In [33]:
beta = torch.tensor(1).float()
f_hat = torch.tensor([[1, 2, 4], [1, 2, 3]]).float()
y = torch.tensor([[0, 0, 1], [0, 0, 1]]).float()
w = torch.tensor([1, 1]).float()
#sofmax_loss(beta, f_hat, y, w)

In [34]:
f_hat = beta*f_hat
y_hat = F.softmax(f_hat, dim=1)
loss = (y*(1-y_hat)).sum(dim=1)
(w*loss).mean()

tensor(0.2455)

In [35]:
def bounded_loss(beta, y_hat, y , w):
    #y_hat = beta*y_hat
    y_hat = F.softmax(y_hat, dim=1)
    loss = (y*(1-y_hat)).sum(dim=1)
    return (w*loss).mean()

In [36]:
def train_oracle_model(model, train_dl, K, learning_rate = 0.01, epochs=100):
    beta = 1
    wd=0 #0.0001
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=wd)
    KK = epochs//10 + 1
    model.train()
    for t in range(epochs):
        total_loss = 0
        total = 0
        for x, y, w in train_dl:
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)
       
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
        #if t % KK == 0: print(total_loss/total)
        print(total_loss/total)

In [37]:
def reasign_points(train_X, model):
    x = torch.tensor(train_X).float()
    y_hat = model(x.cuda())
    _, pred = torch.max(y_hat, 1)
    data = {'index': range(len(train_X)), 'group': pred.cpu().numpy()  }
    return pd.DataFrame(data) 

In [38]:
def relabel_groups(groups, models):
    unique_models = groups.group.unique()
    old2new = {x:i for i,x in enumerate(unique_models)}
    ratios = []
    model_types = [models[i].model_type for i in unique_models]
    groups.group = np.array([old2new[x] for x in groups.group.values])
    return groups, model_types

In [39]:
def compute_loss(X, y, oracle, models):
    oracle.eval()
    x = torch.tensor(X).float()
    y = torch.tensor(y).float()
    y_hat = oracle(x.cuda())
    _, ass = torch.max(y_hat, 1)
    preds = []
    ys = []
    for i in range(len(models)):
        xx = x[ass==i]
        yy = y[ass==i]
        if len(xx) > 0:
            pred = models[i].model.predict_proba(xx.cpu().numpy())
            preds.append(pred)
            ys.append(yy.cpu().numpy())
            
    preds = np.concatenate(preds)
    ys = np.concatenate(ys)
    logloss = log_loss(ys, preds)
    acc = (np.argmax(preds, axis=1) == ys).sum()/ys.shape[0]
    return logloss, acc

In [40]:
def compute_single_loss(X, y, model):
    pred = model.model.predict_proba(X)
    logloss = log_loss(y, pred)
    acc = model.model.score(X, y)
    return logloss, acc

## Loop

In [41]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

In [42]:
class OracleDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.w[idx]

In [43]:
def get_optimizer(model, lr = 0.01, wd = 0.0001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [44]:
PATH = Path("/data2/yinterian/tmp/")
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [53]:
i = 3
X, y = get_class_data(list_dataset[i])
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
print(train_X.shape)

(3750, 20)


In [54]:
K = 6
groups = random_assignments(train_X, K)
f = open('out2.log', 'w+')
batch_size = 100000
# number of iterations depends on the number of training points
N = train_X.shape[0]
N_iter = int(100/np.log(N))
print("Number of training points %d, number iterations %d" % (N, N_iter))

best_train_acc = 0
best_K = None
best_test_acc = 0
model_types = range(1,7)
learning_rate = 0.01

classes = np.unique(train_y)
print("classes", classes)
hack = {c: (np.where(train_y == c))[0] for c in classes}
ind_base = [hack[c][0] for c in classes]


for i in range(5):
    print("iter", i)
    models = fit_K_models_weights(train_X, train_y, groups, ind_base, model_types, K)
    K = len(models)
    print("K is ", K)
    if K == 1:
        models[0].model.fit(train_X, train_y)
        train_loss, train_acc = compute_single_loss(train_X, train_y, models[0])
        test_loss, test_acc = compute_single_loss(test_X, test_y, models[0])
        if train_acc >= best_train_acc:
            best_train_acc = train_acc
            best_test_acc = test_acc
            best_K = K
        break
    
    X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models)
    train_ds = OracleDataset(X_ext, y_ext, w_ext)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    model = create_oracle_model(train_X.shape[1], K, N).cuda()
    train_oracle_model(model, train_dl, K, learning_rate, N_iter)
    groups = reasign_points(train_X, model)
    if len(groups.group.unique()) < K:
        K = len(groups.group.unique()) 
        groups, model_types = relabel_groups(groups, models)
    train_loss, train_acc = compute_loss(train_X, train_y, model, models)
        
    test_loss, test_acc = compute_loss(test_X, test_y, model, models)
    if train_acc >= best_train_acc:
        best_train_acc = train_acc
        best_test_acc = test_acc
        best_K = K
    print("loss", train_loss, test_loss)
    print("Accuracy", train_acc, test_acc)
    print("best test_acc", best_test_acc)
    
scores = other_scores(train_X, test_X, train_y, test_y)
model_str = ["RF", "Ridge", "Lasso", "Cart"]
score_str = ["%s %.4f" % (s, score) for s,score in zip(model_str, scores)]
score_str = " ".join(score_str)
results = "dataset %s K %d ISL %.4f %s"  %(dataset, best_K, best_test_acc, score_str)
print(results)
f.write(results)
f.write('\n')
f.flush()

Number of training points 3750, number iterations 12
classes [0 1]
iter 0
K is  6
0 (3750, 2)
1 (3750, 2)
2 (3750, 2)
3 (3750, 2)
4 (3750, 2)
5 (3750, 2)
0.0642700120806694
0.05760582163929939
0.05391684174537659
0.051554325968027115
0.049794845283031464
0.04823724180459976
0.046701639890670776
0.045280538499355316
0.04418635740876198
0.04340357705950737
0.042683809995651245
0.04186432436108589
loss 0.24008368639426536 0.3121183038040852
Accuracy 0.9176 0.9024
best test_acc 0.9024
iter 1
K is  4
0 (3750, 2)
1 (3750, 2)
2 (3750, 2)
3 (3750, 2)
0.13198190927505493
0.10527495294809341
0.08745112270116806
0.07545369863510132
0.06731664389371872
0.06212976202368736
0.05885540321469307
0.05653737857937813
0.05465400218963623
0.05300024524331093
0.051525913178920746
0.05023519694805145
loss 0.18766627030746383 0.3773213784591908
Accuracy 0.9370666666666667 0.92
best test_acc 0.92
iter 2
K is  3
0 (3750, 2)
1 (3750, 2)
2 (3750, 2)
0.19952869415283203
0.14286810159683228
0.10610603541135788
0.0

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:   47.8s finished


dataset yeast K 2 ISL 0.9576 RF 0.9544 Ridge 0.8552 Lasso 0.8600 Cart 0.9304


In [99]:
train_X.shape

(5400, 21)

In [100]:
train_y.shape

(5400,)

In [102]:
p=0.7
K = len(models)
N = train_X.shape[0]
n = int(p*N)
idx = np.random.choice(N, n, replace=False)
X = train_X[idx]
Y = train_y[idx]

In [107]:
L = compute_K_model_loss(X, Y, models)

0 (3779, 3)
1 (3779, 2)


IndexError: index 2 is out of bounds for axis 0 with size 2

In [103]:
L = compute_K_model_loss(X, Y, models)
W = compute_weights(L, K)

IndexError: index 2 is out of bounds for axis 0 with size 2