## Selecting Classification Datasets for Super Learner

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sb
from numpy.linalg import inv

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [4]:
from pmlb import fetch_data, classification_dataset_names

In [5]:
list_dataset = []

for dataset in classification_dataset_names:
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    if X.shape[0] >= 5000:
        list_dataset.append(dataset)

In [6]:
len(list_dataset)

28

In [7]:
def get_class_data(dataset):
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    y_min = np.unique(y).min()
    if y_min == 1:
        y -= 1
    return X, y

In [8]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
def other_scores(train_X, test_X, train_y, test_y):
    
    N = train_X.shape[1]
    max_features = np.unique([int(x*N + 1) for x in np.linspace(0.01, 0.99, num = 5)])
    grid = {'max_features': max_features}
    rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', n_jobs = 10)
    rf_cv = GridSearchCV(estimator = rf, param_grid = grid, cv = 5, verbose=2,
                         n_jobs = 2)
    
    lasso  = LogisticRegressionCV(cv=5, penalty='l1',solver = 'saga', random_state=0)
    ridge  = LogisticRegressionCV(cv=5, penalty='l2',solver = 'saga', random_state=0)
    dt = DecisionTreeClassifier(max_depth=5)
    
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    
    rf_cv.fit(train_X, train_y)
    lasso.fit(train_X, train_y)
    ridge.fit(train_X, train_y)
    dt.fit(train_X, train_y)
    scores = [x.score(test_X, test_y) for x in [rf_cv, ridge, lasso, dt]]
    return scores

In [9]:
#other_scores(train_X, test_X, train_y, test_y)

## Conditionally interpretable super learner

In [10]:
np.random.seed(23)

In [11]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
class BaseModel:
    def __init__(self, model_type):
        self.model_type = model_type
        self.model = self.create_model()
        if model_type not in range(1,7):
            print("model_type should be in the interval [1, 6]")

    def create_model(self):
        method_name = 'model_' + str(self.model_type)
        method = getattr(self, method_name, lambda: "nothing")
        return method()

    # L1 penalty
    def model_1(self):
        return LogisticRegressionCV(cv=5, penalty='l1',solver = 'saga', random_state=0)

    # l2 penalty
    def model_2(self):
        return LogisticRegressionCV(cv=5, penalty='l2', solver = 'saga', random_state=0)

    # elastic net
    def model_3(self):
        return LogisticRegressionCV(cv=5, penalty='elasticnet', solver = 'saga', l1_ratios=[.5], random_state=0)

    def model_4(self):
        return DecisionTreeClassifier(max_depth=3)

    def model_5(self):
        return DecisionTreeClassifier(max_depth=4)

    def model_6(self):
        return DecisionTreeClassifier(max_depth=5)

In [12]:
def fit_initial_K_models(train_X, train_y, model_types):
    models = []
    N = train_X.shape[0]
    n = int(3*N/np.log(N))
    for k in range(len(model_types)):
        ind = np.random.choice(N, n, replace=False)
        X = train_X[ind]
        y = train_y[ind]
        if len(ind) > 10:
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X, y)
            models.append(base_model)
    return models

In [13]:
def fit_K_models(train_X, train_y, oracle, models,  idx_base, p=0.8):
    # sample to address overfitting 
    N = train_X.shape[0]
    #n = int(p*N)
    #ind = np.random.choice(N, n, replace=False)
    #X = train_X[ind]
    #y = train_y[ind]
    # assigning points using oracle
    # this will be modified 
    x = torch.tensor(train_X).float()
    y_hat = oracle(x.cuda())
    W = F.softmax(0.5*y_hat, dim=1).cpu().detach().numpy()

    model_types = [m.model_type for m in models]
    models = []
    for k in range(len(model_types)):
        w = W[:,k]
        if w.sum()/N > 0.015:
            idx = w > 0.000001
            idx = np.array(list(idx) + idx_base)
            w = W[idx, k].copy()
            X_k = train_X[idx]
            y_k = train_y[idx]
            base_model = BaseModel(model_types[k])
            print("model_type=", model_types[k], k)
            base_model.model.fit(X_k, y_k, w)
            models.append(base_model)
    return models

In [14]:
# L is an array
def compute_K_model_loss(X, y, models):
    L = []
    for i in range(len(models)):
        y_hat = models[i].model.predict_proba(X)
        W = np.eye(y_hat.shape[1])[y] # to avoid the need for num_classes
        loss = (-np.log(y_hat + 1e-8)*W).sum(1)
        L.append(loss)
    L = np.array(L)
    return L

In [15]:
def compute_weights(L, K):
    JI_K = inv(np.ones((K, K)) - np.identity(K))
    W = []
    for i in range(L.shape[1]):
        w_i = np.matmul(JI_K, L[:,i])
        W.append(w_i)
    return np.array(W)

In [16]:
def create_extended_dataset(train_X, train_y, models, p=0.7):
    # sample to address overfitting
    K = len(models)
    N = train_X.shape[0]
    n = int(p*N)
    idx = np.random.choice(N, n, replace=False)
    X = train_X[idx]
    Y = train_y[idx]
    L = compute_K_model_loss(X, Y, models)
    W = compute_weights(L, K)
    X_ext = []
    y_ext = []
    w_ext = []
    for i in range(K):
        X_ext.append(X.copy())
        y_ext.append(i*np.ones(n))
        w_ext.append(W[:, i])
    X_ext = np.concatenate(X_ext, axis=0)
    y_ext = np.concatenate(y_ext, axis=0)
    w_ext = np.concatenate(w_ext, axis=0)
    return X_ext, y_ext, w_ext

In [17]:
def create_oracle_model(D_in, K, N):
    """ Returns an oracle model
    
    The size of the hidden layer is a function of the
    amount of training data
    """
    H = int(2*np.log(N)**2)
    model = nn.Sequential(
        nn.Linear(D_in, H),
        nn.BatchNorm1d(H),
        nn.ReLU(),
        torch.nn.Linear(H, K))
    return model
#nn.Dropout(p=0.2),

In [18]:
def softmax_loss(beta, f_hat, y, w):
    y_hat = np.exp(beta*f_hat)
    den = (np.exp(beta*f_hat)).sum(axis=1)
    y_hat = np.array([y_hat[i]/den[i] for i in range(len(den))])
    loss = w*((y * (1- y_hat)).sum(axis=1))
    return loss.mean()

In [19]:
def bounded_loss(beta, y_hat, y , w):
    #y_hat = beta*y_hat
    y_hat = F.softmax(y_hat, dim=1)
    loss = (y*(1-y_hat)).sum(dim=1)
    return (w*loss).mean()

In [20]:
def train_model(model, train_dl, K, learning_rate = 0.01, epochs=100):
    beta = 1
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.00001)
    KK = epochs//10 + 1
    model.train()
    for t in range(epochs):
        total_loss = 0
        total = 0
        for x, y, w in train_dl:
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
        if t % KK == 0: print("epoch %d loss %.4f" % (t, total_loss/total))

In [21]:
def reasign_points(train_X, model):
    x = torch.tensor(train_X).float()
    y_hat = model(x.cuda())
    _, pred = torch.max(y_hat, 1)
    data = {'index': range(len(train_X)), 'group': pred.cpu().numpy()  }
    return pd.DataFrame(data) 

In [22]:
def relabel_groups(groups, models):
    unique_models = groups.group.unique()
    old2new = {x:i for i,x in enumerate(unique_models)}
    ratios = []
    model_types = [models[i].model_type for i in unique_models]
    groups.group = np.array([old2new[x] for x in groups.group.values])
    return groups, model_types

In [23]:
def compute_loss(X, y, oracle, models):
    oracle.eval()
    x = torch.tensor(X).float()
    y = torch.tensor(y).float()
    y_hat = oracle(x.cuda())
    _, ass = torch.max(y_hat, 1)
    preds = []
    ys = []
    for i in range(len(models)):
        xx = x[ass==i]
        yy = y[ass==i]
        if len(xx) > 0:
            pred = models[i].model.predict_proba(xx.cpu().numpy())
            preds.append(pred)
            ys.append(yy.cpu().numpy())
            
    preds = np.concatenate(preds)
    ys = np.concatenate(ys)
    logloss = log_loss(ys, preds)
    acc = (np.argmax(preds, axis=1) == ys).sum()/ys.shape[0]
    return logloss, acc

In [24]:
def compute_single_loss(X, y, model):
    pred = model.model.predict_proba(X)
    logloss = log_loss(y, pred)
    acc = model.model.score(X, y)
    return logloss, acc

In [25]:
def baseline_models(train_X, train_y, valid_X, valid_y):
    best_model = None
    best_valid_acc = 0
    best_model_type = 0
    for k in range(1,7):
        base_model = BaseModel(k)
        base_model.model.fit(train_X, train_y)
        valid_acc = base_model.model.score(valid_X, valid_y)
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_model_type = k
            best_model = base_model.model
    return best_valid_acc, best_model, [best_model_type]

## Loop

In [26]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df

In [27]:
class OracleDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.w[idx]

In [28]:
def get_optimizer(model, lr = 0.01, wd = 0.0001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [29]:
PATH = Path("/data2/yinterian/tmp/")
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [30]:
def get_class_data(dataset):
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    y_min = np.unique(y).min()
    if y_min == 1:
        y -= 1
    return X, y

In [31]:
def get_datatest_split(dataset, state):
    X, y = get_class_data(dataset)
    train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=state, test_size = 0.3)
    valid_X, test_X, valid_y, test_y = train_test_split(test_X, test_y, random_state=state, test_size =0.5)
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    valid_X = scaler.transform(valid_X)
    return train_X, valid_X, test_X, train_y, valid_y, test_y

In [32]:
def get_a_sample_from_each_class(train_X, train_y):
    classes = np.unique(train_y)
    idx_by_class = {c: np.where(train_y == c)[0] for c in classes}
    return [idx_by_class[c][0] for c in classes]

In [33]:
list_datasets = [ 'adult', 'agaricus_lepiota', 'churn', 'clean2', 'magic', 'mushroom', 'phoneme', 'ring',
 'twonorm', 'waveform_21', 'waveform_40']

In [34]:
#churn has a problem i=2 'ann_thyroid' has issiues i=7, i=10
i=8 is long

In [35]:
list_dataset

['adult',
 'agaricus_lepiota',
 'ann_thyroid',
 'churn',
 'clean2',
 'coil2000',
 'connect_4',
 'fars',
 'kddcup',
 'krkopt',
 'letter',
 'magic',
 'mnist',
 'mushroom',
 'nursery',
 'optdigits',
 'page_blocks',
 'pendigits',
 'phoneme',
 'poker',
 'ring',
 'satimage',
 'shuttle',
 'sleep',
 'texture',
 'twonorm',
 'waveform_21',
 'waveform_40']

In [79]:
state = 0
dataset = list_dataset[12]
train_X, valid_X, test_X, train_y, valid_y, test_y = get_datatest_split(dataset, state)

In [80]:
train_X.shape

(49000, 784)

In [None]:
best_valid_acc, best_model, best_model_types = baseline_models(train_X, train_y, valid_X, valid_y)
best_test_acc = best_model.score(test_X, test_y)
best_single_model = best_test_acc
print("best valid acc %.3f best model type %d" % (best_valid_acc, best_model_types[0]))
best_oracle = None
best_models = [best_model]

In [None]:
batch_size = 100000
# number of iterations depends on the number of training points
N = train_X.shape[0]
N_iter = int(3000/np.log(N)**2)
print("Number of training points %d, number iterations %d" % (N, N_iter))

In [None]:
model_types = [x for x in range(1,7)]
K = len(model_types)
INIT_FLAG = True
learning_rate = 0.15
best_train_acc = 0

In [None]:
train_X.shape

In [None]:
idx_base = get_a_sample_from_each_class(train_X, train_y)

In [78]:
for i in range(5):
    
    if not INIT_FLAG:
        models = fit_K_models(train_X, train_y, oracle, models, idx_base, p=0.9)
        if len(models) == 1:
            INIT_FLAG = True
    if INIT_FLAG:
        model_types = [1,1,1,4,5,6,6,6,6,6]
        models = fit_initial_K_models(train_X, train_y, model_types)
        INIT_FLAG = False
    
    K = len(models)
    print("Iteration %d K is %d" % (i+1, K))
    if K == 1:
        INIT_FLAG = True
    
    if not INIT_FLAG:
        X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models, p=0.7)
        train_ds = OracleDataset(X_ext, y_ext, w_ext)
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        oracle = create_oracle_model(train_X.shape[1], K, N).cuda()
        train_model(oracle, train_dl, K, learning_rate, N_iter)

    if not INIT_FLAG:
        train_loss, train_acc = compute_loss(train_X, train_y, oracle, models)
        valid_loss, valid_acc = compute_loss(valid_X, valid_y, oracle, models)
        test_loss, test_acc = compute_loss(test_X, test_y, oracle, models)
    
    
    if K == 1:
        models[0].model.fit(train_X, train_y)
        train_loss, train_acc = compute_single_loss(train_X, train_y, models[0])
        test_loss, test_acc = compute_single_loss(test_X, test_y, models[0])
        if train_acc >= best_train_acc:
            best_train_acc = train_acc
            best_test_acc = test_acc
            best_K = K
        break
    
    X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models)
    train_ds = OracleDataset(X_ext, y_ext, w_ext)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    model = create_oracle_model(train_X.shape[1], K, N).cuda()
    train_model(model, train_dl, K, learning_rate, N_iter)
    groups = reasign_points(train_X, model)
    if len(groups.group.unique()) < K:
        K = len(groups.group.unique()) 
        groups, model_types = relabel_groups(groups, models)
    train_loss, train_acc = compute_loss(train_X, train_y, model, models)
        
    test_loss, test_acc = compute_loss(test_X, test_y, model, models)
    if train_acc >= best_train_acc:
        best_train_acc = train_acc
        best_test_acc = test_acc
        best_K = K
    print("loss", train_loss, test_loss)
    print("Accuracy", train_acc, test_acc)
    print("best test_acc", best_test_acc)
    
scores = other_scores(train_X, test_X, train_y, test_y)
model_str = ["RF", "Ridge", "Lasso", "Cart"]
score_str = ["%s %.4f" % (s, score) for s,score in zip(model_str, scores)]
score_str = " ".join(score_str)
results = "dataset %s K %d ISL %.4f %s"  %(dataset, best_K, best_test_acc, score_str)
print(results)

Iteration 1 K is 10
epoch 0 loss 0.0463
epoch 4 loss 0.0416
epoch 8 loss 0.0399
epoch 12 loss 0.0398
epoch 16 loss 0.0392
epoch 20 loss 0.0388
epoch 24 loss 0.0385
epoch 28 loss 0.0383
epoch 32 loss 0.0386
epoch 0 loss 0.0460
epoch 4 loss 0.0417
epoch 8 loss 0.0406
epoch 12 loss 0.0400
epoch 16 loss 0.0399
epoch 20 loss 0.0395
epoch 24 loss 0.0393
epoch 28 loss 0.0391
epoch 32 loss 0.0389
loss 0.39505179190631023 0.42826165029902513
Accuracy 0.8445996695208051 0.8412197686645636
best test_acc 0.8412197686645636
model_type= 1 0




model_type= 1 2




model_type= 6 5
model_type= 6 7
model_type= 6 8
Iteration 2 K is 5
epoch 0 loss 1.2493
epoch 4 loss 0.1971
epoch 8 loss 0.1971
epoch 12 loss 0.1971
epoch 16 loss 0.1971
epoch 20 loss 0.1971
epoch 24 loss 0.1971
epoch 28 loss 0.1971
epoch 32 loss 0.1971
epoch 0 loss 1.3192
epoch 4 loss 0.1946
epoch 8 loss 0.1946
epoch 12 loss 0.1946
epoch 16 loss 0.1946
epoch 20 loss 0.1946
epoch 24 loss 0.1946
epoch 28 loss 0.1946
epoch 32 loss 0.1946
loss 0.9824008775006432 0.9751893331075452
Accuracy 0.6491662911221271 0.6519453207150369
best test_acc 0.8412197686645636
model_type= 1 0




Iteration 3 K is 10
epoch 0 loss 0.0440
epoch 4 loss 0.0400
epoch 8 loss 0.0390
epoch 12 loss 0.0385
epoch 16 loss 0.0384
epoch 20 loss 0.0384
epoch 24 loss 0.0381
epoch 28 loss 0.0379
epoch 32 loss 0.0380
epoch 0 loss 0.0445
epoch 4 loss 0.0393
epoch 8 loss 0.0382
epoch 12 loss 0.0380
epoch 16 loss 0.0375
epoch 20 loss 0.0373
epoch 24 loss 0.0375
epoch 28 loss 0.0373
epoch 32 loss 0.0369
loss 0.3748272541732972 0.38869545835796626
Accuracy 0.8476791347453808 0.8398177357167893
best test_acc 0.8398177357167893
model_type= 5 4
model_type= 6 5
model_type= 6 6
model_type= 6 8
Iteration 4 K is 4
epoch 0 loss 2.3410
epoch 4 loss 1.4851
epoch 8 loss 1.4445
epoch 12 loss 1.4093
epoch 16 loss 1.3895
epoch 20 loss 1.3690
epoch 24 loss 1.3467
epoch 28 loss 1.3325
epoch 32 loss 1.3192
epoch 0 loss 2.3673
epoch 4 loss 1.4951
epoch 8 loss 1.4277
epoch 12 loss 1.4050
epoch 16 loss 1.3826
epoch 20 loss 1.3616
epoch 24 loss 1.3540
epoch 28 loss 1.3295
epoch 32 loss 1.3292
loss 10.067972899853416 9.975

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  25 out of  25 | elapsed:  1.8min finished


dataset magic K 8 ISL 0.8398 RF 0.8829 Ridge 0.7883 Lasso 0.7883 Cart 0.8360


dataset adult K 3 ISL 0.8534 RF 0.8612 Ridge 0.8250 Lasso 0.8252 Cart 0.8525
dataset agaricus_lepiota K 3 ISL 0.9959 RF 1.0000 Ridge 0.9615 Lasso 0.9615 Cart 0.9795
dataset ann_thyroid K 2 ISL 0.9954 RF 0.9972 Ridge 0.9546 Lasso 0.9546 Cart 0.9981
dataset churn K 6 ISL 0.9480 RF 0.9667 Ridge 0.8707 Lasso 0.8707 Cart 0.9373
dataset clean2 K 1 ISL 1.0000 RF 1.0000 Ridge 1.0000 Lasso 1.0000 Cart 1.0000
dataset coil2000 K 3 ISL 0.9362 RF 0.9261 Ridge 0.9383 Lasso 0.9383 Cart 0.9362
dataset connect_4 K 5 ISL 0.7068 RF 0.8170 Ridge 0.6649 Lasso 0.6649 Cart 0.6805
dataset krkopt K 3 ISL 0.3488 RF 0.8491 Ridge 0.2837 Lasso 0.2818 Cart 0.3443
dataset krkopt K 5 ISL 0.3699 RF 0.8475 Ridge 0.2837 Lasso 0.2818 Cart 0.3443
dataset magic K 8 ISL 0.8398 RF 0.8829 Ridge 0.7883 Lasso 0.7883 Cart 0.8360
