In [7]:
import os
import time

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
from evaluation import train_and_test_other_models

In [8]:
# Location of the loan statistics CSV handed to the project's loader.
filename = "dataset/LoanStats3a.csv"

# load_dataset returns a pair; only `features` is used by the cells visible
# in this notebook section.
features, data = load_dataset(filename)

# Sanity check: report the dimensions of the loaded feature table.
print("Data shape: {}".format(features.shape))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [72]:
# Two-stage split: first set the test partition aside (test_size=0.3), then
# split what is left into training and validation parts (test_size=0.2).
# Both splits use random_state=0.
X_trainval, X_test, y_trainval, y_test = get_train_test_split(
    features, test_size=0.3, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=0
)

# Temporarily use subset of data to debug faster
# TODO: Remove
DEBUG_TRAIN_ROWS = 10000
DEBUG_TEST_ROWS = 500
X_train, y_train = X_train[:DEBUG_TRAIN_ROWS], y_train[:DEBUG_TRAIN_ROWS]
# Validation set is currently left at full size; uncomment to shrink it too.
# X_val, y_val     = X_val[:500,:], y_val[:500]
X_test, y_test = X_test[:DEBUG_TEST_ROWS], y_test[:DEBUG_TEST_ROWS]

# Normalize: the scaler is fitted on the training features only and then
# applied unchanged to the validation and test features.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_val_scaled = X_scaler.transform(X_val)
X_test_scaled = X_scaler.transform(X_test)


# Report the final shape of every partition.
print("X_train: {}, y_train: {}".format(X_train.shape, y_train.shape))
print("X_val: {}, y_val: {}".format(X_val.shape, y_val.shape))
print("X_test: {}, y_test: {}".format(X_test.shape, y_test.shape))

X_train: (10000, 38), y_train: (10000,)
X_val: (5955, 38), y_val: (5955,)
X_test: (500, 38), y_test: (500,)


In [106]:
from active_learning import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Normalize
y_scaler = MinMaxScaler()
y_scaler.fit(y_train.reshape(-1,1))
y_train_scaled = y_scaler.transform(y_train.reshape(-1,1))

kernel = GPy.kern.RBF(input_dim=X_train.shape[1], variance=1., lengthscale=1.)
import time
np.random.seed(int(time.time()))

# normal GP
# gp_model = GPy.models.SparseGPRegression(X_train_scaled, y_train_scaled, kernel)
# gp_model.optimize()

# maximum entropy sampling
# X_sampled, y_sampled = maximum_entropy_sampling(X_train_scaled, y_train_scaled, 100)
# gp_model = GPy.models.GPRegression(X_sampled, y_sampled, kernel)
# gp_model.optimize()

# DARE sampling
# X_sampled, y_sampled = DARE_sampling(X_train_scaled, y_train_scaled, 100, X_scaler) 
# gp_DARE = GPy.models.GPRegression(X_sampled, y_sampled, kernel)
# gp_DARE.optimize()

# sum of mean squared error sampling
# X_sampled, y_sampled = MSE_sampling(X_train_scaled, y_train_scaled, 100)
# gp_model = GPy.models.GPRegression(X_sampled, y_sampled, kernel)
# gp_model.optimize()

# random sampling
# import time
# np.random.seed(int(time.time()))
X_sampled, y_sampled = random_sampling(X_train_scaled, y_train_scaled, 100)
gp_model = GPy.models.GPRegression(X_sampled, y_sampled, kernel)
gp_model.optimize()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<paramz.optimization.optimization.opt_lbfgsb at 0x7f26d4e2d9b0>

In [107]:
from simulation import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

SEED            = int(time.time())
THRESHOLD       = 1.1
NUM_PERIODS     = 20
NUM_MONTHS      = 60
FUND_GIVEN      = 1e6
LOANS_PER_MONTH = 100
CONF_QUANTILE   = (40,100)


perf_gp = simulate_N_time_periods(gp_model, X_val_scaled, y_val, X_scaler, y_scaler, 
                                  threshold=THRESHOLD, num_periods=NUM_PERIODS, fund_given=FUND_GIVEN, 
                                  num_months=NUM_MONTHS,incoming_loans_per_time_period=LOANS_PER_MONTH,
                                  conf_quantile=CONF_QUANTILE, optimize_for="TODO", 
                                  version="loan_amount_and_variance", model_type="gp", seed=SEED)

print("Mean Total Profits:")
print(np.mean(np.sum(perf_gp[:,:,0], axis=1)))

# Saving performances to pickle file and Loading performances from it
filename = os.path.join("results", "perf_gp_MSE")
meta_info = ["GP_MSE"]
performances = [perf_gp]
store_performance_results(performances, meta_info, filename)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Simulating period 0...
Simulating period 10...
Mean Total Profits:
361271.152059
