In [10]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
from optimization import gp_optimize_threshold
from evaluation import train_and_test_other_models

In [11]:
# Load the loan statistics CSV through the project loader. It returns two
# objects, bound here as `features` and `data`; only `features` is used by
# the cells shown in this notebook.
filename = "dataset/LoanStats3a.csv"
features, data = load_dataset(filename)

# %s stringifies the shape tuple, so wrap it in a 1-tuple for the % operator.
print("Data shape: %s" % (features.shape,))

Preprocessing...


Feature Engineering...


Data shape: (42535, 40)


In [12]:
# First split with test_size=0.3 via the project helper, then split the
# training part again with test_size=0.2 to obtain a validation set.
# Both calls use random_state=0.
X_train, X_test, y_train, y_test = get_train_test_split(
    features, test_size=0.3, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Temporarily use subset of data to debug faster
# TODO: Remove
X_train = X_train[:1000, :]
y_train = y_train[:1000]
X_val = X_val[:500, :]
y_val = y_val[:500]
X_test = X_test[:500, :]
y_test = y_test[:500]

# Normalize: the scaler is fitted on the training features only and the same
# fitted scaler is then applied to the train, validation and test features.
X_scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = map(X_scaler.transform, (X_train, X_val, X_test))

# Report the resulting shapes of every partition.
print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_val: %s, y_val: %s" % (X_val.shape, y_val.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [13]:
# Gaussian Process
import GPy

# Normalize the training targets. They are reshaped to a single column
# before being handed to the scaler, and the scaled column is what the GP
# is trained on.
y_train_col = y_train.reshape(-1, 1)
y_scaler = MinMaxScaler().fit(y_train_col)
y_train_scaled = y_scaler.transform(y_train_col)

# Initialize GP Model: RBF kernel over all feature columns, starting from
# unit variance and unit lengthscale.
kernel = GPy.kern.RBF(
    input_dim=X_train.shape[1],
    variance=1.0,
    lengthscale=1.0,
)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel=kernel)

# Left as the last expression, so the notebook displays the returned
# optimizer object.
gp_model.optimize()


<paramz.optimization.optimization.opt_lbfgsb at 0x17d0f5f6278>

In [17]:
from simulation import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

SEED            = 1
THRESHOLD       = 1.1
NUM_PERIODS     = 20
NUM_MONTHS      = 60
FUND_GIVEN      = 1e6
LOANS_PER_MONTH = 100
CONF_QUANTILE   = (40,100)

print("Mean Total Profits:")
perf_gp = 0
perf_others = 0
perf_bayes_opt = 0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Mean Total Profits:


In [18]:

from scipy.stats import norm
from sklearn.svm import SVR

# Keyword arguments that are identical across the three simulation runs.
shared_sim_kwargs = dict(
    threshold=THRESHOLD,
    num_periods=NUM_PERIODS,
    fund_given=FUND_GIVEN,
    num_months=NUM_MONTHS,
    incoming_loans_per_time_period=LOANS_PER_MONTH,
    optimize_for="TODO",
    seed=SEED,
)

# Run 1: the fitted GP model with the "loan_amount_and_variance" version.
perf_gp = simulate_N_time_periods(
    gp_model, X_val, y_val, X_scaler, y_scaler,
    conf_quantile=CONF_QUANTILE,
    version="loan_amount_and_variance",
    model_type="gp",
    **shared_sim_kwargs
)

# Run 2: an RBF support-vector regressor as the non-GP baseline.
# NOTE(review): the SVR is fitted on the raw y_train whereas the GP was fitted
# on y_train_scaled, yet both runs receive y_scaler - confirm that
# simulate_N_time_periods accounts for this through model_type.
svm_rbf = SVR(kernel="rbf", C=1e4).fit(X_train, y_train)
perf_others = simulate_N_time_periods(
    svm_rbf, X_val, y_val, X_scaler, y_scaler,
    conf_quantile=None,
    version="loan_amount",
    model_type="others",
    **shared_sim_kwargs
)

# Run 3: the same GP model with the "bayesian_optimization" version.
# kappa is the standard-normal quantile at probability 0.7.
kappa = norm.ppf(0.7)
perf_bayes_opt = simulate_N_time_periods(
    gp_model, X_val, y_val, X_scaler, y_scaler,
    conf_quantile=CONF_QUANTILE,
    version="bayesian_optimization",
    kappa=kappa,
    bay_opt_steps=200,
    model_type="gp",
    **shared_sim_kwargs
)

# For each result: take slice [:, :, 0], sum along axis 1, then average.
# Presumably index 0 of the last axis holds the profit - verify against
# simulate_N_time_periods.
print("Mean Total Profits:")
for perf in (perf_gp, perf_others, perf_bayes_opt):
    print(np.mean(np.sum(perf[:, :, 0], axis=1)))

1200
Simulating period 0...




1200


1200


1200


1200


1200


1200


1200


1200


1200


1200
Simulating period 10...


1200


1200


1200


1200


1200


1200


1200


1200


1200


1200
Simulating period 0...


1200


1200


1200


1200


1200


1200


1200


1200


1200


1200
Simulating period 10...


1200


1200


1200


1200


1200


1200


1200


1200


1200


200
Simulating period 0...
60








59


58


57


56


55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


140
60


59


58


57


56


55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


80
60


59


58


57


56


55


54


53


52


51


50


49


48


47


46


45


44


43


42


41


40


39


38


37


36


35


34


33


32


31


30


29


28


27


26


25


24


23


22


21


20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


20
20


19


18


17


16


15


14


13


12


11


10


9


8


7


6


5


4


3


2


1


-40


-40


-40


-40


-40


-40


-40
Simulating period 10...


-40


-40


-40


-40


-40


-40


-40


-40


-40


Mean Total Profits:
467498.933538
335791.504236
442997.259658


In [None]:
# Disabled follow-up steps: portfolio plots, threshold optimisation and the
# comparison against other models. Every line below is commented out, so this
# cell currently does nothing when executed.
# NOTE(review): the comparison call lists perf_others twice with two "Others"
# legend entries - possibly perf_bayes_opt was intended as the third series;
# confirm before re-enabling.
# from visualisation import plot_portfolio_performance, plot_portfolio_performance_comparisons
# plot_portfolio_performance_comparisons([perf_gp, perf_others, perf_others], legend_names=["GP", "Others", "Others"])
# plot_portfolio_performance(perf_gp)
# plot_portfolio_performance(perf_others)
# # Optimize threshold for profits / profit_percentage
# threshold = gp_optimize_threshold(gp_model, X_val, y_val, X_scaler, y_scaler, optimize_for="profit_percentage")
# print(threshold)
# train_and_test_other_models(X_train, y_train, X_test, y_test, X_scaler)