# Super Learner

## Preliminary experiments

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sb
from numpy.linalg import inv

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [4]:
from pmlb import fetch_data, regression_dataset_names

## Conditionally interpretable super learner

In [5]:
def random_assignments(train_X, K=6):
    data = {'index': range(len(train_X)), 'group':  np.random.choice(K, len(train_X)) }
    df = pd.DataFrame(data)
    return df


alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]
class BaseModel:
    def __init__(self, model_type):
        self.model_type = model_type
        self.model = self.create_model()
        if model_type not in range(1,7):
            print("model_type should be in the interval [1, 6]")

    def create_model(self):
        method_name = 'model_' + str(self.model_type)
        method = getattr(self, method_name, lambda: "nothing")
        return method()

    def model_1(self):
        return RidgeCV(cv=5, alphas=alphas)

    def model_2(self):
        return ElasticNetCV(cv=5, random_state=0, l1_ratio=0.5)

    def model_3(self):
        return ElasticNetCV(cv=5, random_state=0, l1_ratio=1)
    # max_features=0.9
    def model_4(self):
        return DecisionTreeRegressor(max_depth=4)

    def model_5(self):
        return DecisionTreeRegressor(max_depth=5)

    def model_6(self):
        return DecisionTreeRegressor(max_depth=6)


In [6]:
def create_base_model(train_X, train_y, m_type):
    N = train_X.shape[0]
    n = int(2.5*N/np.log(N))
    ind = np.random.choice(N, n, replace=False)
    X = train_X[ind]
    y = train_y[ind]
    base_model = BaseModel(m_type)
    base_model.model.fit(X, y)
    return base_model

In [43]:
def fit_initial_K_models(train_X, train_y, model_types):
    models = []
    N = train_X.shape[0]
    #n = int(3*N/np.log(N))
    n = int(N/np.log(N))
    for k in range(len(model_types)):
        ind = np.random.choice(N, n, replace=False)
        X = train_X[ind]
        y = train_y[ind]
        if len(ind) > 10:
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X, y)
            models.append(base_model)
    return models

In [8]:
alphas=[1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 8, 16, 32, 64, 132]

def fit_K_models(train_X, train_y, oracle, models, p=0.8):
    # sample to address overfitting 
    N = train_X.shape[0]
    n = int(p*N)
    ind = np.random.choice(N, n, replace=False)
    X = train_X[ind]
    y = train_y[ind]
    # assigning points using oracle
    # this will be modified 
    oracle.eval()
    x = torch.tensor(X).float()
    y_hat = oracle(x.cuda())
    W = F.softmax(0.1*y_hat, dim=1).cpu().detach().numpy()
                
    model_types = [m.model_type for m in models]
    models = []
    for k in range(len(model_types)):
        w = W[:,k]
        if w.sum()/n > 0.015:
            idx = w > 0.000001
            w = W[idx, k].copy() 
            X_k = X[idx]
            y_k = y[idx]
            print("model_type", model_types[k])
            base_model = BaseModel(model_types[k])
            base_model.model.fit(X_k, y_k, w)
            models.append(base_model)
    return models

In [9]:
def compute_K_model_loss(train_X, train_y, models):
    L = []
    for i in range(len(models)):
        loss = (models[i].model.predict(train_X) - train_y)**2
        L.append(loss)
    L = np.array(L)
    return L

In [10]:
def compute_weights(L, K):
    JI_K = inv(np.ones((K, K)) - np.identity(K))
    W = []
    for i in range(L.shape[1]):
        w_i = np.matmul(JI_K, L[:,i])
        W.append(w_i)
    return np.array(W)

In [11]:
def create_extended_dataset(train_X, train_y, models, p=0.7):
    # sample to address overfitting
    K = len(models)
    N = train_X.shape[0]
    n = int(p*N)
    idx = np.random.choice(N, n, replace=False)
    X = train_X[idx]
    Y = train_y[idx]
    L = compute_K_model_loss(X, Y, models)
    W = compute_weights(L, K)
    X_ext = []
    y_ext = []
    w_ext = []
    for i in range(K):
        X_ext.append(X.copy())
        y_ext.append(i*np.ones(n))
        w_ext.append(W[:, i])
    X_ext = np.concatenate(X_ext, axis=0)
    y_ext = np.concatenate(y_ext, axis=0)
    w_ext = np.concatenate(w_ext, axis=0)
    return X_ext, y_ext, w_ext

## Neural Network oracle

In [12]:
def create_oracle_model(D_in, K, N):
    """ Returns an oracle model
    
    The size of the hidden layer is a function of the
    amount of training data
    """
    H = np.minimum(int(2*np.log(N)**2), 150)
    model = nn.Sequential(
        nn.Linear(D_in, H),
        nn.BatchNorm1d(H),
        nn.ReLU(),
        torch.nn.Linear(H, K))
    return model

In [13]:
def softmax_loss(beta, f_hat, y, w):
    y_hat = torch.exp(beta*f_hat)
    den = y_hat.sum(1)
    y_hat = y_hat/den.view(den.size(0),1)
    loss = w.dot((y * (1- y_hat)).sum(1))
    return loss

In [14]:
def bounded_loss(beta, y_hat, y , w):
    #y_hat = beta*y_hat
    y_hat = F.softmax(y_hat, dim=1)
    loss = (y*(1-y_hat)).sum(dim=1)
    return (w*loss).mean()

In [15]:
def train_model(model, train_dl, K, learning_rate = 0.01, epochs=100):
    beta = 1
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    KK = epochs//10 + 1
    model.train()
    for t in range(epochs):
        total_loss = 0
        total = 0
        for x, y, w in train_dl:
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)
            #loss = softmax_loss(beta, y_hat, y_onehot , w)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()*y.size(0)
            total += y.size(0)
        if t % KK == 0: print("epoch %d loss %.4f" % (t, total_loss/total))

In [16]:
from sklearn.metrics import r2_score

In [17]:
def compute_loss(X, y, oracle, models):
    oracle.eval()
    x = torch.tensor(X).float()
    y = torch.tensor(y).float()
    y_hat = oracle(x.cuda())
    _, ass = torch.max(y_hat, 1)
    preds = []
    ys = []
    k = 0
    #print(ass)
    for i in range(len(models)):
        xx = x[ass==i]
        yy = y[ass==i]
        if len(xx) > 0:
            k =+1
            pred = models[i].model.predict(xx.cpu().numpy())
            preds.append(pred)
            ys.append(yy.cpu().numpy())

    if k==1:
        preds, ys = preds[0], ys[0]
    else:
        preds = np.hstack(preds)
        ys = np.hstack(ys)
    r2 = r2_score(ys, preds)
    res = (ys - preds)**2
    return res.mean(), r2

In [18]:
def compute_single_loss(X, y, model):
    pred = model.model.predict(X)
    r2 = r2_score(y, pred)
    res = (y - pred)**2
    return res.mean(), r2

In [19]:
class OracleDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], self.w[idx]

In [20]:
def get_optimizer(model, lr = 0.01, wd = 0.0001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [21]:
def baseline_models(train_X, train_y, valid_X, valid_y):
    best_model = None
    best_valid_r2 = 0
    best_model_type = 0
    for k in range(1,7):
        base_model = BaseModel(k)
        base_model.model.fit(train_X, train_y)
        valid_r2 = base_model.model.score(valid_X, valid_y)
        if valid_r2 > best_valid_r2:
            best_valid_r2 = valid_r2
            best_model_type = k
            best_model = base_model.model
    return best_valid_r2, best_model, [best_model_type]

## Learning rate finder

In [22]:
PATH = Path("/data2/yinterian/tmp/")
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

def LR_range_finder(model, train_dl, K, lr_low=1e-5, lr_high=1, epochs=10):
    losses = []
    p = PATH/"mode_tmp.pth"
    save_model(model, str(p))
    iterations = epochs * len(train_dl)
    delta = (lr_high - lr_low)/iterations
    lrs = [lr_low + i*delta for i in range(iterations)]
    model.train()
    ind = 0
    for i in range(epochs):
        for x, y, w in train_dl:
            optimizer = get_optimizer(model, lr=lrs[ind])
            x = x.cuda().float()
            y = y.cuda().long()
            w = w.cuda().float()
            y_onehot = torch.FloatTensor(y.shape[0], K).cuda()
            y_onehot.zero_()
            y_onehot = y_onehot.scatter_(1, y.unsqueeze(1), 1)
            y_hat = model(x)
            loss = bounded_loss(beta, y_hat, y_onehot , w)
       
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ind +=1
            losses.append(loss.item())
            
    load_model(model, str(p))
    return lrs, losses

In [23]:
def oracle_LR_range_finder(train_X, train_y, K=6):
    groups = random_assignments(train_X, K)
    models = fit_K_models(train_X, train_y, groups, model_types)
    K = len(models)
    print("models")
    X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models)
    train_ds = OracleDataset(X_ext, y_ext, w_ext)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    model = create_oracle_model(train_X.shape[1], K, N).cuda()
    lrs, losses = LR_range_finder(model, train_dl, K, lr_low=1e-5, lr_high=0.5)
    return lrs, losses

In [24]:
dataset = "1028_SWD"
X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y)
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

In [25]:
X.shape

(1000, 10)

## Main 

In [26]:
lr_map = {"1028_SWD": 0.15, "1029_LEV" :0.15, "1030_ERA": 0.15, "1191_BNG_pbc": 0.02,
         "1193_BNG_lowbwt": 0.1, "1196_BNG_pharynx": 0.015, "1199_BNG_echoMonths": 0.3,
         "1203_BNG_pwLinear": 0.05, "1595_poker": 0.01, "1201_BNG_breastTumor": 0.05, "197_cpu_act": 0.2,
         "201_pol": 0.15, "215_2dplanes": 0.1, "218_house_8L": 0.05, "225_puma8NH": 0.15,
         "227_cpu_small":0.15, "294_satellite_image": 0.15, "344_mv": 0.1,
          "4544_GeographicalOriginalofMusic": 0.15, "503_wind": 0.1, "529_pollen": 0.1,
         "537_houses": 0.15, "562_cpu_small": 0.15, "564_fried": 0.1, "573_cpu_act": 0.15,
         "574_house_16H": 0.15, "583_fri_c1_1000_50": 0.15, "586_fri_c3_1000_25": 0.15 }

In [29]:
# difficult problems 5, 7, 22
dataset = "1201_BNG_breastTumor"
dataset = "1199_BNG_echoMonths"

state=2
X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=state, test_size = 0.2)
valid_X, test_X, valid_y, test_y = train_test_split(test_X, test_y, random_state=state,
                                                            test_size =0.5) 
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
valid_X = scaler.transform(valid_X)
test_X = scaler.transform(test_X)
print(dataset, train_X.shape)

1199_BNG_echoMonths (13996, 9)


In [30]:
best_valid_r2, best_model, best_model_types = baseline_models(train_X, train_y,
                                                              valid_X, valid_y)
best_test_r2 = best_model.score(test_X, test_y)

In [31]:
best_valid_r2, best_model, best_model_types, best_test_r2

(0.458727912003307,
 DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
 [6],
 0.4463650585806125)

In [27]:
def get_datatest_split(dataset, state, X=None, y=None):
    #if X is None:
    #    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='/data2/yinterian/pmlb/')
    train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=state, test_size = 0.3)
    valid_X, test_X, valid_y, test_y = train_test_split(test_X, test_y, random_state=state, test_size =0.5)
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)
    valid_X = scaler.transform(valid_X)
    return train_X, valid_X, test_X, train_y, valid_y, test_y

## Main loop

In [39]:
def main_loop(state, dataset, X, Y):
    
    learning_rate = lr_map.get(dataset, 0.15)
    train_X, valid_X, test_X, train_y, valid_y, test_y = get_datatest_split(
        dataset, state, X, Y)

    best_valid_r2, best_model, best_model_types = baseline_models(
        train_X, train_y, valid_X, valid_y)
    best_test_r2 = best_model.score(test_X, test_y)
    print("best valid R^2 %.3f best model type %d" % (best_valid_r2, best_model_types[0]))
    best_oracle = None
    best_models = [best_model] 


    batch_size = 100000
    # number of iterations depends on the number of training points
    N = train_X.shape[0]
    N_iter = int(3000/np.log(N)**2)
    print("Number of training points %d, number iterations %d" % (N, N_iter))

    INIT_FLAG = True
    oracle = None
    for i in range(16):
        if i == 7: INIT_FLAG = True
            
        if not INIT_FLAG:
            models = fit_K_models(train_X, train_y, oracle, models, p=0.9)
            if len(models) == 1:
                INIT_FLAG = True  
            
        if INIT_FLAG:
            #model_types = [1,4,5,6] + [1,1,1,1,1,1,1]
            model_types = [1,1,1,1,1,1,1,1,1]
            models = fit_initial_K_models(train_X, train_y, model_types)
            INIT_FLAG = False
            
        K = len(models)
        print("Iteration %d K is %d" % (i+1, K))
        if K == 1:
            INIT_FLAG = True

        if not INIT_FLAG:
            X_ext, y_ext, w_ext = create_extended_dataset(train_X, train_y, models, p=0.9)
            train_ds = OracleDataset(X_ext, y_ext, w_ext)
            train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            oracle = create_oracle_model(train_X.shape[1], K, N).cuda()
            train_model(oracle, train_dl, K, learning_rate, N_iter)
            
            
        if not INIT_FLAG:
            train_loss, train_r2 = compute_loss(train_X, train_y, oracle, models)
            valid_loss, valid_r2 = compute_loss(valid_X, valid_y, oracle, models)
            test_loss, test_r2 = compute_loss(test_X, test_y, oracle, models)


        print("train loss %.3f valid loss %.3f", train_loss, valid_loss)
        print("train R^2 %.3f valid R^2 %.3f", train_r2, valid_r2)
        if valid_r2 >= best_valid_r2:
            best_train_r2 = train_r2
            best_valid_r2 = valid_r2
            best_K = K
            best_models = models
            best_model_types = [m.model_type for m in models]
            best_test_r2 = test_r2 
        
    results = "dataset %s state %d K %d test ISL %.3f valid ISL %.3f model_types %s" % (
        dataset, state, len(best_models), best_test_r2, best_valid_r2,
        str(best_model_types))
    print(results)
    #f.write(results)
    #f.write('\n')
    #f.flush()

In [37]:
#for dataset in selected_datasets:
#    main_loop(1, dataset)

In [40]:
#main_loop(1, "1199_BNG_echoMonths", X=None, Y=None)

## Experiment with synthetic data

In [40]:
PATH = Path("data-synt/")
basefilename = "synthetic-564_fried-K-3"
def get_synt_XY(seed):
    dataset = basefilename + "-" + str(seed)
    filename = dataset + ".csv"
    print(filename)
    data = pd.read_csv(PATH/filename)
    X = data.iloc[:,:-1].values
    Y = data.iloc[:,-1].values
    return X, Y

In [31]:
main_loop(state, "564_fried-11", X, Y)

best valid R^2 0.492 best model type 6
Number of training points 28537, number iterations 28
Iteration 1 K is 11
epoch 0 loss 159.6470
epoch 3 loss 128.6147
epoch 6 loss 121.8812
epoch 9 loss 120.0795
epoch 12 loss 119.0193
epoch 15 loss 118.1722
epoch 18 loss 116.9677
epoch 21 loss 116.8229
epoch 24 loss 116.5414
epoch 27 loss 115.3359
train loss %.3f valid loss %.3f 986.016060869837 996.6615807076531
train R^2 %.3f valid R^2 %.3f 0.6795619125090879 0.6782215783195711
model_type 1
model_type 5
model_type 6
model_type 1
model_type 1
model_type 1
Iteration 2 K is 6
epoch 0 loss 440.6181
epoch 3 loss 168.8590
epoch 6 loss 158.8939
epoch 9 loss 155.2604
epoch 12 loss 152.4157
epoch 15 loss 150.8514
epoch 18 loss 151.0476
epoch 21 loss 150.9909
epoch 24 loss 150.2794
epoch 27 loss 148.3971
train loss %.3f valid loss %.3f 1142.696038060095 1081.6959526890205
train R^2 %.3f valid R^2 %.3f 0.2312647861531567 0.26484317190867857
model_type 1
model_type 5
model_type 6
model_type 1
model_type 1


In [33]:
X, Y = get_synt_XY(2)
main_loop(state, "564_fried-11", X, Y)

best valid R^2 0.672 best model type 3
Number of training points 28537, number iterations 28
Iteration 1 K is 11
epoch 0 loss 90.6913
epoch 3 loss 78.7550
epoch 6 loss 77.7886
epoch 9 loss 77.8669
epoch 12 loss 77.4712
epoch 15 loss 77.4955
epoch 18 loss 77.0178
epoch 21 loss 76.7184
epoch 24 loss 77.0541
epoch 27 loss 76.7007
train loss %.3f valid loss %.3f 901.4564391605348 883.1897343379358
train R^2 %.3f valid R^2 %.3f 0.5769682553072351 0.5754005577693924
model_type 1
model_type 6
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 2 K is 6
epoch 0 loss 269.7120
epoch 3 loss 159.7840
epoch 6 loss 148.0936
epoch 9 loss 143.1050
epoch 12 loss 139.9870
epoch 15 loss 137.9201
epoch 18 loss 136.9812
epoch 21 loss 136.3374
epoch 24 loss 135.9352
epoch 27 loss 135.8736
train loss %.3f valid loss %.3f 445.9260786728368 437.7405676791932
train R^2 %.3f valid R^2 %.3f 0.7514242704424305 0.7649719942629499
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 3 K is 4
epoch

In [36]:
N = X.shape[0]
int(N/np.log(N))

3840

In [42]:
X, Y = get_synt_XY(4)
main_loop(state, "564_fried-11", X, Y)

synthetic-564_fried-K-3-4.csv
best valid R^2 0.511 best model type 6
Number of training points 28537, number iterations 28
Iteration 1 K is 9
epoch 0 loss 202.4867
epoch 3 loss 196.6693
epoch 6 loss 195.6244
epoch 9 loss 195.1122
epoch 12 loss 194.7483
epoch 15 loss 195.0858
epoch 18 loss 194.7389
epoch 21 loss 194.8023
epoch 24 loss 194.9170
epoch 27 loss 194.7451
train loss %.3f valid loss %.3f 1808.8721652508602 1702.8029311198065
train R^2 %.3f valid R^2 %.3f 0.11568930390905852 0.09099309246418008
model_type 1
model_type 1
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 2 K is 6
epoch 0 loss 527.0320
epoch 3 loss 243.8994
epoch 6 loss 228.9233
epoch 9 loss 217.8047
epoch 12 loss 213.7023
epoch 15 loss 210.1472
epoch 18 loss 207.3158
epoch 21 loss 206.3058
epoch 24 loss 203.8040
epoch 27 loss 203.0819
train loss %.3f valid loss %.3f 4094.2148405134867 3906.44207830996
train R^2 %.3f valid R^2 %.3f -0.6086724650502302 -0.5926680899431609
model_type 1
model_type 1
model

In [44]:
X, Y = get_synt_XY(4)
main_loop(state, "564_fried-11", X, Y)

synthetic-564_fried-K-3-4.csv
best valid R^2 0.511 best model type 6
Number of training points 28537, number iterations 28
Iteration 1 K is 9
epoch 0 loss 200.5816
epoch 3 loss 190.2011
epoch 6 loss 189.3114
epoch 9 loss 189.0906
epoch 12 loss 188.4948
epoch 15 loss 188.1979
epoch 18 loss 188.0839
epoch 21 loss 188.3853
epoch 24 loss 187.7090
epoch 27 loss 187.7711
train loss %.3f valid loss %.3f 741.4408192469216 829.8067008046531
train R^2 %.3f valid R^2 %.3f -0.541165823941602 -0.6879594622426848
model_type 1
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 2 K is 5
epoch 0 loss 752.0919
epoch 3 loss 333.7803
epoch 6 loss 159.2022
epoch 9 loss 118.6298
epoch 12 loss 88.7374
epoch 15 loss 85.8998
epoch 18 loss 77.5962
epoch 21 loss 72.0734
epoch 24 loss 72.9031
epoch 27 loss 71.2982
train loss %.3f valid loss %.3f 0.0744652255402327 110.63733906255983
train R^2 %.3f valid R^2 %.3f 0.0 0.8702686443958518
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 3 K is

In [45]:
X, Y = get_synt_XY(3)
main_loop(state, "564_fried-11", X, Y)

synthetic-564_fried-K-3-3.csv
best valid R^2 0.400 best model type 6
Number of training points 28537, number iterations 28
Iteration 1 K is 9
epoch 0 loss 312.0765
epoch 3 loss 295.1127
epoch 6 loss 294.9496
epoch 9 loss 294.8005
epoch 12 loss 294.8803
epoch 15 loss 294.6771
epoch 18 loss 294.3763
epoch 21 loss 293.9834
epoch 24 loss 293.8408
epoch 27 loss 294.2507
train loss %.3f valid loss %.3f 3613.3218250798895 3693.812912765388
train R^2 %.3f valid R^2 %.3f 0.3777444299339089 0.3596971435769899
model_type 1
model_type 1
model_type 1
model_type 1
Iteration 2 K is 4
epoch 0 loss 1746.4474
epoch 3 loss 928.5338
epoch 6 loss 926.8498
epoch 9 loss 929.6868
epoch 12 loss 927.5105
epoch 15 loss 927.5547
epoch 18 loss 927.2548
epoch 21 loss 926.1891
epoch 24 loss 926.3034
epoch 27 loss 927.1061
train loss %.3f valid loss %.3f 4465.728334087338 4410.02454140509
train R^2 %.3f valid R^2 %.3f -0.24170080596489307 -0.2912697980781622
model_type 1
model_type 1
Iteration 3 K is 2
epoch 0 loss 1

In [59]:
import xgboost as xgb
import datetime as dt

In [68]:
xgb_pars = {'min_child_weight': 50, 'eta': 0.01, 'colsample_bytree': 0.3, 'max_depth': 10,
            'subsample': 0.8, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}

In [69]:
train_X, valid_X, test_X, train_y, valid_y, test_y = get_datatest_split(
        dataset, state, X, Y)

In [70]:
dtrain = xgb.DMatrix(train_X, label=train_y)
dvalid = xgb.DMatrix(valid_X, label=valid_y)
dtest = xgb.DMatrix(test_X, label=test_y)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [71]:
t0 = dt.datetime.now()
gbm = xgb.train(xgb_pars, dtrain, 1000, watchlist, early_stopping_rounds=50,
                maximize=False, verbose_eval=10)
t1 = dt.datetime.now()
print('Time fitting xgb: %i seconds' % (t1 - t0).seconds)

[0]	train-rmse:3406.83	valid-rmse:3404.82
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[10]	train-rmse:3081.2	valid-rmse:3079.19
[20]	train-rmse:2786.71	valid-rmse:2784.7
[30]	train-rmse:2520.38	valid-rmse:2518.39
[40]	train-rmse:2279.52	valid-rmse:2277.54
[50]	train-rmse:2061.71	valid-rmse:2059.74
[60]	train-rmse:1864.74	valid-rmse:1862.77
[70]	train-rmse:1686.61	valid-rmse:1684.65
[80]	train-rmse:1525.53	valid-rmse:1523.58
[90]	train-rmse:1379.87	valid-rmse:1377.93
[100]	train-rmse:1248.15	valid-rmse:1246.21
[110]	train-rmse:1129.03	valid-rmse:1127.1
[120]	train-rmse:1021.34	valid-rmse:1019.41
[130]	train-rmse:923.96	valid-rmse:922.041
[140]	train-rmse:835.91	valid-rmse:833.994
[150]	train-rmse:756.306	valid-rmse:754.4
[160]	train-rmse:684.339	valid-rmse:682.442
[170]	train-rmse:619.278	valid-rmse:617.391
[180]	train-rmse:560.474	valid-rmse:558.603
[190]	train-rmse:507.316	valid-rmse:5

In [72]:
ypred = gbm.predict(dtest)

In [73]:
ypred.shape

(6116,)

In [74]:
r2 = r2_score(test_y, ypred)
r2

0.7495334369131217