In [1]:
import sys
sys.path.append('/cluster/sj1')

In [2]:
%load_ext autoreload
%autoreload 2

import torch
import pyro
import numpy as np
from scipy.stats import kendalltau
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from bb_opt.src.bayesian_opt import (
    optimize,
    get_model_bnn,
    train_model_bnn,
    partial_train_model_bnn,
    train,
    bnn_predict
)
from bb_opt.src.utils import get_path, save_pyro_model, load_pyro_model
from bb_opt.src import hsic as hsic
from bb_opt.src import knn_mi as knn_mi
from gpu_utils.utils import gpu_init
from tqdm import tnrange

gpu_id = gpu_init()
print(f"Running on GPU {gpu_id}")
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

%matplotlib inline

  from numpy.core.umath_tests import inner1d


Running on GPU 0


In [3]:
n_train = 1000

project = "dna_binding"
dataset = "crx_ref_r1"
batch_size = 1
retrain_every = 1000000
partial_steps = 20

root = "/cluster/sj1/bb_opt/"
data_dir = get_path(root, "data", project, dataset)
inputs = np.load(get_path(data_dir, "inputs.npy"))
labels = np.load(get_path(data_dir, "labels.npy"))
train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, labels, train_size=n_train, random_state=521)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, train_size=0.9, random_state=521)

top_k_percent = 1



In [4]:
train_label_mean = train_labels.mean()
train_label_std = train_labels.std()

train_labels = (train_labels - train_label_mean) / train_label_std
val_labels = (val_labels - train_label_mean) / train_label_std
test_labels = (test_labels - train_label_mean) / train_label_std

In [6]:
import forestci as fci

In [7]:
n_trees = 2000
rf = RandomForestRegressor(n_estimators=n_trees, random_state=42)
rf.fit(train_inputs, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [8]:
y_hat = rf.predict(test_inputs)

In [9]:
def jointplot(predicted, true, title: str=""):
    if isinstance(predicted, torch.Tensor):
        predicted = predicted.detach().cpu().numpy()

    if isinstance(true, torch.Tensor):
        true = true.detach().cpu().numpy()
        
    ax = sns.jointplot(predicted, true, s=3, alpha=0.5)
    #ax = sns.regplot(predicted, true)
    ax.set_axis_labels('Predicted', 'True')
    ax.ax_marg_x.set_title(title)
    return ax

In [21]:
preds = []
e_idx = np.arange(len(rf.estimators_))
np.random.shuffle(e_idx)
n_samples = 100
for i in range(n_samples):
    print(i)
    rfe = rf.estimators_[e_idx[i]]
    preds += [rf.predict(inputs)]
preds = np.array(preds)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [22]:
preds.shape

(100, 32896)

In [24]:
max_pred_idx = preds.argmax(axis=1)

In [25]:
max_pred_idx

array([13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242, 13242,
       13242])

In [30]:
sort_idx = labels.argsort()

In [26]:
labels[13242]

258252.8

In [31]:
np.where(sort_idx == 13242)

(array([32856]),)

In [32]:
labels.shape

(32896,)

In [34]:
labels[26348]

100379.08

In [37]:
labels[sort_idx][-100:-80]

array([199452.83, 200311.19, 200572.42, 201128.14, 201244.64, 201828.4 ,
       202365.8 , 202739.2 , 203029.5 , 203229.56, 203378.75, 204484.38,
       204876.72, 206875.22, 207043.4 , 207944.84, 208545.97, 208556.84,
       209676.31, 209717.58], dtype=float32)