In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
# from optimization import gp_optimize_threshold
# from evaluation import train_and_test_other_models

In [3]:
# Location of the raw loan-statistics CSV, relative to the notebook's working directory.
filename = "dataset/LoanStats3a.csv"

# load_dataset is a project helper that returns two objects; of the two, only
# `features` is referenced by the cells visible in this notebook.
features, data = load_dataset(filename)
print("Data shape: %s" % (features.shape,))

Preprocessing...


Feature Engineering...


Data shape: (42535, 40)


In [13]:
"""
EXPERIMENT 1: GP VS BAYES OPT IN LATER STAGES
-------------------------------------------------------
Set up:
1. Get 100-500 data rows to use as the training set.
2. Create a GP model and train it with training set.
3. Let K = 100-200.
Experiment:
1. Let GP_model do prediction and self-updating using loan_amount_and_variance version for K more steps. Then pure prediction.
2. Let GP_model do prediction and self-updating using Bayesian optimization for K more steps. Then pure prediction.
3. [OPTIONAL] Let SVM do prediction after training with training set to compare.
Results:
1. Compare the 3 profits gained at the end of all periods.
"""

# Hold out a small training set of 200 rows; every remaining row goes to the
# test split, which is what the simulation cells below consume.
X_train, X_test, y_train, y_test = get_train_test_split(
    features,
    train_size=200,
    random_state=0,
)

# A validation split (X_val / y_val carved out of X_train with test_size=0.2)
# and a debugging subset of each split used to be created here; both are
# currently disabled.

# Normalize: the scaler is fitted on the training rows only and then applied to
# both splits, so no statistics from the test rows enter the fit.
X_scaler = MinMaxScaler().fit(X_train)
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (200, 38), y_train: (200,)
X_test: (42335, 38), y_test: (42335,)


In [14]:
# Gaussian Process model shared by both Experiment 1 simulations
import GPy

# Normalize the targets with a scaler fitted on the training targets only.
# The scaler expects a 2-D column, hence the reshape.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# RBF kernel spanning every input feature, starting from unit variance and
# unit lengthscale; optimize() then tunes the hyperparameters.
kernel = GPy.kern.RBF(input_dim=X_train.shape[1], lengthscale=1.0, variance=1.0)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel=kernel)
# Kept as the last expression of the cell so its return value is displayed.
gp_model.optimize()

<paramz.optimization.optimization.opt_lbfgsb at 0x234699baa20>

In [16]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

SEED            = 1
THRESHOLD       = 1.1
NUM_PERIODS     = 20
NUM_MONTHS      = 60
FUND_GIVEN      = 1e6
LOANS_PER_MONTH = 100
CONF_QUANTILE   = (40,100)
NUM_UPDATE_ROWS = 200
from simulation import *

perf_gp = simulate_N_time_periods(
    gp_model.copy(),
    X_test, y_test,
    X_scaler, y_scaler,
    threshold=THRESHOLD,
    num_periods=NUM_PERIODS,
    fund_given=FUND_GIVEN,
    num_months=NUM_MONTHS,
    incoming_loans_per_time_period=LOANS_PER_MONTH,
    conf_quantile=CONF_QUANTILE,
    optimize_for="TODO", 
    version="self_updating_gp",
    gp_update_steps=NUM_UPDATE_ROWS,
    model_type="gp",
    seed=SEED
)
print("Profits for self-updating GP:")
print(np.mean(np.sum(perf_gp[:,:,0], axis=1)))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Simulating period 0...


Simulating period 1...


Simulating period 2...


Simulating period 3...


Simulating period 4...


Simulating period 5...


Simulating period 6...


Simulating period 7...


Simulating period 8...


Simulating period 9...


Simulating period 10...


Simulating period 11...


Simulating period 12...


Simulating period 13...


Simulating period 14...


Simulating period 15...


Simulating period 16...


Simulating period 17...


Simulating period 18...


Simulating period 19...


Profits for self-updating GP:
631576.545695


In [17]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

SEED            = 1
THRESHOLD       = 1.1
NUM_PERIODS     = 20
NUM_MONTHS      = 60
FUND_GIVEN      = 1e6
LOANS_PER_MONTH = 100
CONF_QUANTILE   = (40,100)
NUM_UPDATE_ROWS = 200

from simulation import *

from scipy.stats import norm

kappa = norm.ppf(0.7)
perf_bayes_opt = simulate_N_time_periods(
    gp_model.copy(),
    X_test, y_test,
    X_scaler, y_scaler,
    threshold=THRESHOLD,
    num_periods=NUM_PERIODS,
    fund_given=FUND_GIVEN, 
    num_months=NUM_MONTHS,
    incoming_loans_per_time_period=LOANS_PER_MONTH,
    conf_quantile=CONF_QUANTILE,
    optimize_for="TODO",
    version="bayesian_optimization",
    kappa=kappa,
    bay_opt_steps=200,
    model_type="gp", seed=SEED
)
print("Profits for Bayesian Optimization:")
print(np.mean(np.sum(perf_bayes_opt[:,:,0], axis=1)))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Simulating period 0...
60


59


58


57


56
55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 1...
60


59
58


57
56


55


54


53


52


51


50
49


48


47
46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 2...
60


59


58


57
56


55


54
53


52


51


50


49
48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 3...
60


59


58
57


56


55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 4...
60


59


58


57


56


55
54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 5...
60


59


58


57


56


55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 6...
60


59


58


57


56
55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


Simulating period 7...
60


59


58
57


56


55


54
53


52


51
50


49
48


47


In [None]:
def mean_total_profit(perf):
    """Return the mean total of slice 0 of a 3-D performance array.

    Slice 0 of the last axis is summed over axis 1 and the resulting totals
    are averaged over axis 0.

    NOTE(review): presumably the axes are (period, month, metric) with metric 0
    being profit -- confirm against simulate_N_time_periods.

    Parameters
    ----------
    perf : numpy.ndarray
        3-D array as returned by the simulation cells.

    Returns
    -------
    numpy.floating
        Mean over axis 0 of the axis-1 sums of ``perf[:, :, 0]``.
    """
    return np.mean(np.sum(perf[:, :, 0], axis=1))


print("Mean Total Profits:")
# `perf_others` is not assigned by any live cell in this notebook (it only
# appears in commented-out code), so referencing it directly raised NameError.
# Each result is looked up by name and reported as unavailable when missing.
for _name in ("perf_gp", "perf_others", "perf_bayes_opt"):
    _perf = globals().get(_name)
    if _perf is None:
        print("%s: not available (the cell producing it has not been run)" % _name)
    else:
        print("%s: %s" % (_name, mean_total_profit(_perf)))

In [None]:
"""
EXPERIMENT 2: GP VS BAYES OPT IN EARLY STAGES
-------------------------------------------------------
Set up:
1. Let K = 100-500.
2. Create two GP models: gp_model_normal and gp_model_bayes_opt.
3. [OPTIONAL] Create an SVM model.
Experiment:
1. Train gp_model_normal using K randomly chosen data rows. Then let it do pure prediction.
2. Train gp_model_bayes_opt iteratively for K steps. At each step, add the candidate row with the largest acquisition-function value to its training set. Then pure prediction.
3. [OPTIONAL] Train svm_model with the same set in step 1 and let it predict to compare.
4. [OPTIONAL] Train svm_model with the same set in step 2 and let it predict to compare.
Results:
1. Compare the 3 profits gained at the end of all periods.
"""

from sklearn.model_selection import train_test_split
from scipy.stats import norm

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

SEED            = 1
THRESHOLD       = 1.1
NUM_PERIODS     = 20
NUM_MONTHS      = 60
FUND_GIVEN      = 1e6
LOANS_PER_MONTH = 100
CONF_QUANTILE   = (40,100)
NUM_TRAIN_ROWS = 100
KAPPA = norm.ppf(0.7)

In [None]:
# Data initialization
# NOTE(review): assumes `features` is a 2-D NumPy array whose last column is the
# target and whose last two columns are excluded from the inputs -- confirm
# against data.load_dataset / get_train_test_split.
X = features[:, :-2]
y = features[:, -1]

# Random training rows for the normal GP; everything else is its test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=NUM_TRAIN_ROWS, random_state=0)

# Bayesian-optimisation GP starts from a single seed row (train_size=1 as an
# int means one sample); the remaining rows form its candidate pool.
# Seeded with SEED (previously random_state=None) so a Restart & Run All
# reproduces the same starting point.
X_train_bo, X_test_bo, y_train_bo, y_test_bo = train_test_split(X, y, train_size=1, random_state=SEED)

# Normalize X. The scaler is fitted on the normal GP's training rows and then
# reused for every other feature matrix so all models share one feature scale.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)
X_train_bo = X_scaler.transform(X_train_bo)
X_test_bo = X_scaler.transform(X_test_bo)
X = X_scaler.transform(X)

# Normalize y (the scaler needs a 2-D column, hence the reshapes).
y_scaler = MinMaxScaler()
y_scaler.fit(y_train.reshape(-1, 1))
y_train_scaled = y_scaler.transform(y_train.reshape(-1, 1))
# Fix: scale the BO *training* targets. This previously scaled y_test_bo, which
# gave a target array whose row count did not match X_train_bo.
y_train_bo_scaled = y_scaler.transform(y_train_bo.reshape(-1, 1))
# Fix: transform() was called without an argument, which raises TypeError.
y = y_scaler.transform(y.reshape(-1, 1))

# Each feature matrix must have exactly one target row per sample.
assert X_train.shape[0] == y_train_scaled.shape[0]
assert X_train_bo.shape[0] == y_train_bo_scaled.shape[0]
assert X.shape[0] == y.shape[0]

print("X_train: %s, y_train: %s" % (str(X_train.shape), str(y_train.shape)))
print("X_test: %s, y_test: %s" % (str(X_test.shape), str(y_test.shape)))
print("X_train_bo: {}, y_train_bo: {}".format(X_train_bo.shape, y_train_bo.shape))
print("X_test_bo: {}, y_test_bo: {}".format(X_test_bo.shape, y_test_bo.shape))
print("y_train_scaled: {}".format(y_train_scaled.shape))
print("y_train_bo_scaled: {}".format(y_train_bo_scaled.shape))
print("X: {}, y: {}".format(X.shape, y.shape))

In [None]:
# Baseline GP for Experiment 2: fitted once on the randomly chosen training rows
import GPy

# RBF kernel over all input features, starting from unit variance and unit
# lengthscale; optimize() then tunes the hyperparameters.
kernel = GPy.kern.RBF(input_dim=X_train.shape[1], lengthscale=1.0, variance=1.0)
gp_model_normal = GPy.models.GPRegression(X_train, y_train_scaled, kernel=kernel)
# Kept as the last expression of the cell so its return value is displayed.
gp_model_normal.optimize()

In [None]:
# Bayesian Optimization GP model init
#
# Starts from the seed row(s) in X_train_bo / y_train_bo and, for NUM_TRAIN_ROWS
# steps, moves the candidate with the highest acquisition value from the pool
# (X_test_bo / y_test_bo) into the training set, refitting the GP each time.
#
# NOTE: this cell grows X_train_bo / y_train_bo and shrinks X_test_bo /
# y_test_bo in place; re-run the data-initialisation cell first to restart the
# acquisition from scratch.
import GPy
import numpy as np

# Scaled 2-D targets matching the current X_train_bo row for row. Recomputed
# here from the raw y_train_bo so the shape always agrees with X_train_bo.
y_train_bo_scaled = y_scaler.transform(np.asarray(y_train_bo).reshape(-1, 1))

kernel = GPy.kern.RBF(input_dim=X_train_bo.shape[1], variance=1., lengthscale=1.)
gp_model_bay_opt = GPy.models.GPRegression(X_train_bo, y_train_bo_scaled, kernel)
gp_model_bay_opt.optimize()

for _ in range(NUM_TRAIN_ROWS):
    # Predictive mean and variance for every remaining candidate row.
    mean, var = gp_model_bay_opt.predict(X_test_bo)
    # Acquisition: predicted mean plus KAPPA standard deviations.
    acquisition = mean + np.sqrt(var) * KAPPA
    # NOTE(review): assumes predict() returns one row per candidate with a
    # single column, so the flat argmax is the row index -- confirm.
    next_sample_ind = int(np.argmax(acquisition))

    # Move the selected row from the candidate pool into the training set.
    next_sample_X = X_test_bo[next_sample_ind].reshape(1, -1)
    X_train_bo = np.concatenate((X_train_bo, next_sample_X), axis=0)
    # Fix: y_train_bo is 1-D, so the observed target is appended as a scalar.
    # Concatenating a (1, 1) array onto it raised ValueError.
    y_train_bo = np.append(y_train_bo, y_test_bo[next_sample_ind])
    X_test_bo = np.delete(X_test_bo, next_sample_ind, axis=0)
    y_test_bo = np.delete(y_test_bo, next_sample_ind, axis=0)

    # Fix: refit on *scaled* 2-D targets, consistent with how the model was
    # initialised (the raw, unscaled y_train_bo was passed before).
    y_train_bo_scaled = y_scaler.transform(y_train_bo.reshape(-1, 1))
    gp_model_bay_opt.set_XY(X=X_train_bo, Y=y_train_bo_scaled)
    gp_model_bay_opt.optimize()

In [None]:
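# Properly init test dataset
# TODO: build one held-out test set shared by both models. The simulations below
# evaluate on X_test / y_test, which comes from a different split than the
# Bayesian-optimisation pool (X_test_bo), so it can contain rows that
# gp_model_bay_opt was trained on.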

In [None]:
# Experiment 2, arm 1: evaluate the GP trained on randomly chosen rows.
# Explicit import instead of `from simulation import *`, so it is clear where
# the name comes from and nothing else in the notebook namespace is shadowed.
from simulation import simulate_N_time_periods

perf_gp = simulate_N_time_periods(
    # Pass a copy, as the Experiment 1 cells do, so the fitted model is left
    # untouched. NOTE(review): whether this version mutates the model is not
    # visible here -- confirm against simulation.py.
    gp_model_normal.copy(),
    X_test, y_test,
    X_scaler, y_scaler,
    threshold=THRESHOLD,
    num_periods=NUM_PERIODS,
    fund_given=FUND_GIVEN,
    num_months=NUM_MONTHS,
    incoming_loans_per_time_period=LOANS_PER_MONTH,
    conf_quantile=CONF_QUANTILE,
    optimize_for="TODO",  # placeholder value kept as-is; TODO confirm how simulation.py uses it
    version="loan_amount_and_variance",
    model_type="gp",
    seed=SEED
)

# Label fixed: this arm runs gp_model_normal, not the self-updating GP of
# Experiment 1 (the old label was copied from that cell).
# Sum slice 0 of the last axis over axis 1, then average over axis 0.
print("Profits for normal GP:")
print(np.mean(np.sum(perf_gp[:, :, 0], axis=1)))

In [None]:
# Experiment 2, arm 2: evaluate the GP whose training rows were picked by the
# Bayesian-optimisation acquisition loop.
# Explicit import instead of `from simulation import *`, so it is clear where
# the name comes from and nothing else in the notebook namespace is shadowed.
from simulation import simulate_N_time_periods

perf_bayes_opt = simulate_N_time_periods(
    # Pass a copy, as the Experiment 1 cells do, so the fitted model is left
    # untouched. NOTE(review): whether this version mutates the model is not
    # visible here -- confirm against simulation.py.
    gp_model_bay_opt.copy(),
    X_test, y_test,
    X_scaler, y_scaler,
    threshold=THRESHOLD,
    num_periods=NUM_PERIODS,
    fund_given=FUND_GIVEN,
    num_months=NUM_MONTHS,
    incoming_loans_per_time_period=LOANS_PER_MONTH,
    conf_quantile=CONF_QUANTILE,
    optimize_for="TODO",  # placeholder value kept as-is; TODO confirm how simulation.py uses it
    version="loan_amount_and_variance",
    model_type="gp", seed=SEED
)

# Sum slice 0 of the last axis over axis 1, then average over axis 0.
print("Profits for Bayesian Optimization:")
print(np.mean(np.sum(perf_bayes_opt[:, :, 0], axis=1)))

In [None]:
# from visualisation import plot_portfolio_performance, plot_portfolio_performance_comparisons
# plot_portfolio_performance_comparisons([perf_gp, perf_others, perf_others], legend_names=["GP", "Others", "Others"])
# plot_portfolio_performance(perf_gp)
# plot_portfolio_performance(perf_others)
# # Optimize threshold for profits / profit_percentage
# threshold = gp_optimize_threshold(gp_model, X_val, y_val, X_scaler, y_scaler, optimize_for="profit_percentage")
# print(threshold)
# train_and_test_other_models(X_train, y_train, X_test, y_test, X_scaler)