# Regression on biodiversity index

We are going to test different models on our dataset, trying to get better results using a grid search, and then test the models on a dataset from a different region.

For each model we're going to use a RandomizedSearchCV to narrow the parameter search space and then a GridSearchCV to find the best parameters.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import sys
import torch
from itertools import combinations 

from scipy.stats import uniform, randint
from sklearn.decomposition import PCA

from sklearn import preprocessing

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

import xgboost as xgb

# Make the notebook's working directory importable so that modules living
# next to the notebook can be imported.
module_this = os.path.abspath(os.getcwd())
modules = [module_this]

for module in modules:
    # Skip directories that are already on the import path.
    if module in sys.path:
        continue
    sys.path.append(module)

import utils as ut

## Load Datasets

In [2]:
folder = "../Dataset"
regression_label = 'habitat_richness'
test_size = 0.2
#swi_labels = ['SWI1km-SWI-002', 'SWI1km-SWI-100', 'SWI1km-SWI-040', 'SWI1km-SWI-005', 
#              'SWI1km-SWI-010', 'SWI1km-SWI-060', 'SWI1km-SWI-015', 'SWI1km-SWI-020']

# One entry per CSV file: the dataframe, the full X / y arrays and a fixed
# train/test split of them.
datas = []

# One tag per outlier-handling technique. The tag order fixes the layout of
# `datas` (all files of the first technique, then all files of the second,
# ...), which the cells below rely on when they index `datas` by position.
handler_tags = ['out_closest_point_mean', 'out_knn', 'out_mean', 'out_remove']

# glob returns files in file-system order; sort once so that the regions
# appear in the same (alphabetical) order inside every technique group.
csv_files = sorted(glob.glob(folder + "/*.csv"))
paths = [f for tag in handler_tags for f in csv_files if tag in f]

for path in paths:
    df = pd.read_csv(path, index_col=['longitude', 'latitude'])

    # Count the rows (not the columns) holding at least one missing value,
    # so that the number matches what the message reports.
    null_rows = df.isna().any(axis=1).sum()
    if null_rows:
        print(path, '\t has ', null_rows, ' rows with null values')

    y = df[regression_label].values
    X = df.drop(columns=[regression_label]).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)

    # Dataset name = file name without directory and extension; independent
    # of the length of `folder`.
    name = os.path.splitext(os.path.basename(path))[0]
    datas.append({'name': name, 'dataframe': df, 'y': y, 'X': X,
                  'X_train': X_train, 'X_test': X_test,
                  'y_train': y_train, 'y_test': y_test})

In [3]:
# Show every loaded dataset's name next to the shape of its dataframe.
for entry in datas:
    frame_shape = entry['dataframe'].shape
    print(entry['name'], "\t--\t", frame_shape)

bulgaria_out_closest_point_mean_handle_custom_set 	--	 (14458, 47)
finland_out_closest_point_mean_handle_custom_set 	--	 (17714, 47)
france_out_closest_point_mean_handle_custom_set 	--	 (1882, 47)
italy_out_closest_point_mean_handle_custom_set 	--	 (17387, 47)
bulgaria_out_knn_handle_custom_set 	--	 (14458, 47)
finland_out_knn_handle_custom_set 	--	 (17714, 47)
france_out_knn_handle_custom_set 	--	 (1882, 47)
italy_out_knn_handle_custom_set 	--	 (17387, 47)
bulgaria_out_mean_handle_custom_set 	--	 (14458, 47)
finland_out_mean_handle_custom_set 	--	 (17714, 47)
france_out_mean_handle_custom_set 	--	 (1882, 47)
italy_out_mean_handle_custom_set 	--	 (17387, 47)
bulgaria_out_remove_handle_set_null 	--	 (6842, 47)
finland_out_remove_handle_set_null 	--	 (17654, 47)
france_out_remove_handle_set_null 	--	 (1138, 47)
italy_out_remove_handle_set_null 	--	 (8273, 47)


In [4]:
def _pool_regions(regional_datas):
    """Stack the regional datasets of one outlier-handling technique.

    Args:
        regional_datas: sequence of dataset dicts holding 'X' and 'y' arrays.

    Returns:
        Tuple ``(X, y, region_id)``: the row-wise stacked features, the
        concatenated targets, and a 1-D integer array giving, for every row,
        the position of its source dataset inside ``regional_datas``.
    """
    X_all = np.vstack([d['X'] for d in regional_datas])
    y_all = np.concatenate([d['y'] for d in regional_datas])
    # np.repeat builds the labels directly; wrapping per-region lists of
    # different lengths in np.array creates a ragged array, which recent
    # NumPy versions refuse to build.
    rows_per_region = [d['X'].shape[0] for d in regional_datas]
    region_id = np.repeat(np.arange(len(regional_datas)), rows_per_region)
    return X_all, y_all, region_id


# `datas` holds four consecutive regional datasets per technique, in the order
# closest point, knn, mean, remove.
X_closest_point, y_closest_point, region_id_closest_point = _pool_regions(datas[0:4])
X_knn, y_knn, region_id_knn = _pool_regions(datas[4:8])
X_mean, y_mean, region_id_mean = _pool_regions(datas[8:12])
X_remove, y_remove, region_id_remove = _pool_regions(datas[12:16])


Xs = [X_closest_point, X_knn, X_mean, X_remove]
ys = [y_closest_point, y_knn, y_mean, y_remove]
region_ids = [region_id_closest_point, region_id_knn, region_id_mean, region_id_remove]

# Shared cross-validation splitter and preprocessing objects used by the cells below.
cv = KFold(n_splits=4, shuffle=True, random_state = 42)
scaler = preprocessing.MinMaxScaler()
pca = PCA()

## Searching the best outliers handler

We train a random forest regressor over datasets created with different outlier handling techniques to see which one performs better

### Regional dataset

In [10]:
# Cross-validate a scaled random forest on every regional dataset and keep
# the per-fold scores on the dataset entry under 'val_score'.
for data in datas:
    rfr = RandomForestRegressor(random_state=42, max_depth=100)
    model = make_pipeline(scaler, rfr)
    data['val_score'] = cross_val_score(model, data['X'], data['y'], cv=cv, n_jobs=-1, verbose=0)
    val_score = data['val_score']
    mean_txt = "{:.3f}".format(val_score.mean())
    std_txt = "{:.3f}".format(val_score.std())
    print(data['name'], "\t validation score: \t", mean_txt, " +/- ", std_txt)
    print('--------------------------------------------')

bulgaria_out_closest_point_mean_handle_custom_set 	 validation score: 	 0.925  +/-  0.005
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set 	 validation score: 	 0.717  +/-  0.011
--------------------------------------------
france_out_closest_point_mean_handle_custom_set 	 validation score: 	 0.742  +/-  0.013
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set 	 validation score: 	 0.835  +/-  0.003
--------------------------------------------
bulgaria_out_knn_handle_custom_set 	 validation score: 	 0.923  +/-  0.005
--------------------------------------------
finland_out_knn_handle_custom_set 	 validation score: 	 0.717  +/-  0.011
--------------------------------------------
france_out_knn_handle_custom_set 	 validation score: 	 0.735  +/-  0.018
--------------------------------------------
italy_out_knn_handle_custom_set 	 validation score: 	 0.827  +/-  0.004
-------------------------------------

### Whole dataset

In [6]:
# Fresh feature scaler and 4-fold shuffled splitter (fixed seed) for the
# whole-dataset experiments.
scaler = preprocessing.MinMaxScaler()
cv = KFold(n_splits=4, random_state=42, shuffle=True)

In [8]:
# Cross-validate one scaled random-forest pipeline on each pooled dataset
# (one pooled dataset per outlier-handling technique).
rfr = RandomForestRegressor(random_state=42, max_depth=100)
model = make_pipeline(scaler, rfr)
for X, y, region_id in zip(Xs, ys, region_ids):
    # region_id is unpacked but not used by this evaluation.
    val_score = cross_val_score(model, X, y, cv=cv, n_jobs=-1, verbose=0)
    mean_txt = "{:.3f}".format(val_score.mean())
    std_txt = "{:.3f}".format(val_score.std())
    print("Validation score: \t", mean_txt, " +/- ", std_txt)
    print('--------------------------------------------')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  3.3min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.4min finished


All data validation score: 	 0.961  +/-  0.002
	 test score: 		 0.962
--------------------------------------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  3.5min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.5min finished


All data validation score: 	 0.960  +/-  0.002
	 test score: 		 0.960
--------------------------------------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  4.0min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  4.0min finished


All data validation score: 	 0.961  +/-  0.001
	 test score: 		 0.961
--------------------------------------------


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:  2.6min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.6min finished


All data validation score: 	 0.954  +/-  0.001
	 test score: 		 0.959
--------------------------------------------


As we can see, the only outlier handler that causes a meaningful score reduction is the removal of rows with outliers.
The other techniques have similar scores; we choose to continue with *mean of closest point* since it performed slightly better
than the other two.

In [5]:
# Keep only the regional datasets built with the closest-point handler and
# pool them into a single "whole dataset" entry.
datas_closest_point = datas[0:4]
X, y = X_closest_point, y_closest_point

# Pool the per-region train/test splits in region order.
X_train = np.vstack([d['X_train'] for d in datas_closest_point])
y_train = np.concatenate([d['y_train'] for d in datas_closest_point])
X_test = np.vstack([d['X_test'] for d in datas_closest_point])
y_test = np.concatenate([d['y_test'] for d in datas_closest_point])

data_all = dict(
    name='Whole dataset',
    X=X,
    y=y,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)


## Grid search

In [6]:
def grid_search_train_test(data, grid_search):
    """Fit a search object on a dataset's training split and report its scores.

    Prints the dataset name, the best cross-validation score and parameters
    found by ``grid_search``, and the score of its best estimator on the
    dataset's test split.

    Args:
        data: mapping with the keys 'name', 'X_train', 'y_train', 'X_test'
            and 'y_test'.
        grid_search: object exposing ``fit`` and, once fitted,
            ``best_score_``, ``best_params_`` and ``best_estimator_``.

    Returns:
        The best estimator found by the fitted search.
    """
    print(data['name'])
    grid_search.fit(data['X_train'], data['y_train'])

    cv_best = "{:.3f}".format(grid_search.best_score_)
    print('\tcross validation best score: \t', cv_best)
    print('\tbest params: ', grid_search.best_params_)

    winner = grid_search.best_estimator_
    held_out = winner.score(data['X_test'], data['y_test'])
    print("\tbest model test score: \t\t", "{:.3f}".format(held_out))
    print('--------------------------------------------')
    return winner

def model_train_test(data, model, cv):
    """Cross-validate a model on a training split, then fit and test it.

    Prints the dataset name, the per-fold scores, their mean together with
    twice their standard deviation, and the score on the test split after
    fitting ``model`` on the whole training split.

    Args:
        data: mapping with the keys 'name', 'X_train', 'y_train', 'X_test'
            and 'y_test'.
        model: estimator passed to ``cross_val_score`` and then fitted in
            place.
        cv: cross-validation strategy forwarded to ``cross_val_score``.

    Returns:
        The same ``model`` object, fitted on the training split.
    """
    print(data['name'])
    X_fit, y_fit = data['X_train'], data['y_train']

    val_score = cross_val_score(model, X_fit, y_fit, cv=cv, n_jobs=-1, verbose=0)
    print('all scores:', val_score)
    mean_txt = "{:.3f}".format(val_score.mean())
    spread_txt = "{:.3f}".format(val_score.std()*2)  # spread shown is two standard deviations
    print("\t validation score: \t", mean_txt, " +/- ", spread_txt)

    model.fit(X_fit, y_fit)
    held_out = model.score(data['X_test'], data['y_test'])
    print("\tbest model test score: \t\t", "{:.3f}".format(held_out))
    print('--------------------------------------------')
    return model

def apply_trained_model(datas, model):
    """Print the score of an already fitted model on each dataset's test split.

    Args:
        datas: iterable of mappings with the keys 'name', 'X_test' and
            'y_test'.
        model: fitted object exposing ``score(X, y)``.
    """
    for entry in datas:
        score_txt = "{:.3f}".format(model.score(entry['X_test'], entry['y_test']))
        print(entry['name'], "\tbest all data model score: \t\t", score_txt)
        print('--------------------------------------------')

### Decision Tree

In [74]:
# Candidate values for the decision-tree grid search.
# NOTE(review): recent scikit-learn releases renamed the 'mse' criterion and
# dropped the 'auto' max_features option -- confirm against the installed version.
max_features = ['auto', 'sqrt']
criterion = ['mse', 'friedman_mse']
max_depth = [4, 8, 16, 32, None]

# Keys carry the 'dtr__' prefix of the pipeline step they configure.
grid_params = dict(
    dtr__max_features=max_features,
    dtr__criterion=criterion,
    dtr__max_depth=max_depth,
)
grid_params

{'dtr__max_features': ['auto', 'sqrt'],
 'dtr__criterion': ['mse', 'friedman_mse'],
 'dtr__max_depth': [4, 8, 16, 32, None]}

In [75]:
print(" ------- Regional datas ------- \n")
# Scaled decision tree; the 'dtr' step name matches the grid_params prefix.
dtr = DecisionTreeRegressor()
model = Pipeline(steps=[('scaler', scaler), ('dtr', dtr)])

grid_search = GridSearchCV(model, grid_params, cv=cv, n_jobs=-1, verbose=0)

# The same search object is re-fitted on every regional dataset.
for data in datas_closest_point:
    grid_search_train_test(data=data, grid_search=grid_search)

 ------- Regional datas ------- 

france_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.569
	best params:  {'dtr__criterion': 'mse', 'dtr__max_depth': 8, 'dtr__max_features': 'auto'}
	best model test score: 		 0.585
--------------------------------------------


In [76]:
# Tune on the pooled dataset, then score that single estimator on each
# region's test split.
print("\n ------- All datas ------- \n")
best_all_estimator = grid_search_train_test(data=data_all, grid_search=grid_search)

print(" ------- Best all estimator on regional data ------- \n")
apply_trained_model(datas=datas_closest_point, model=best_all_estimator)


 ------- All datas ------- 

Whole dataset
	cross validation best score: 	 0.927
	best params:  {'dtr__criterion': 'friedman_mse', 'dtr__max_depth': 16, 'dtr__max_features': 'auto'}
	best model test score: 		 0.934
--------------------------------------------
 ------- Best all estimator on regional data ------- 

france_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.600
--------------------------------------------


### Random Forest Regressor

In [234]:
# Candidate values for the random-forest grid search.
# NOTE(review): recent scikit-learn releases dropped the 'auto' max_features
# option -- confirm against the installed version.
n_estimators = [50, 100, 200]
max_features = ['auto', 'sqrt']
max_depth = [20, 50, None]
min_samples_split = [2, 5]
min_samples_leaf = [1, 4]
bootstrap = [False, True]

# Only the first three dimensions are searched; the last three candidate
# lists are defined but currently left out of the grid.
grid_params = dict(
    rfr__n_estimators=n_estimators,
    rfr__max_features=max_features,
    rfr__max_depth=max_depth,
    # rfr__min_samples_split=min_samples_split,
    # rfr__min_samples_leaf=min_samples_leaf,
    # rfr__bootstrap=bootstrap,
)

In [235]:
print(" ------- Regional datas ------- \n")
# Scaled random forest; the 'rfr' step name matches the grid_params prefix.
rfr = RandomForestRegressor()
model = Pipeline(steps=[('scaler', scaler), ('rfr', rfr)])

grid_search = GridSearchCV(model, grid_params, cv=cv, n_jobs=-1, verbose=0)

# The same search object is re-fitted on every regional dataset.
for data in datas_closest_point:
    grid_search_train_test(data=data, grid_search=grid_search)

 ------- Regional datas ------- 

france_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.734
	best params:  {'rfr__max_depth': 20, 'rfr__max_features': 'sqrt', 'rfr__n_estimators': 200}
	best model test score: 		 0.771
--------------------------------------------


In [None]:
# Tune on the pooled dataset, then score that single estimator on each
# region's test split.
print("\n ------- All datas ------- \n")
best_all_estimator = grid_search_train_test(data=data_all, grid_search=grid_search)

print(" ------- Best all estimator on regional data ------- \n")
apply_trained_model(datas=datas_closest_point, model=best_all_estimator)

### Linear regressor

In [8]:
print(" ------- Regional datas ------- \n")
# Plain linear regression on scaled features; there is nothing to tune, so
# it is cross-validated directly instead of going through a grid search.
lr = LinearRegression()
model = Pipeline(steps=[('scaler', scaler), ('lr', lr)])

# The same pipeline object is re-fitted on every regional dataset.
for data in datas_closest_point:
    model_train_test(data=data, model=model, cv=cv)

 ------- Regional datas ------- 

bulgaria_out_closest_point_mean_handle_custom_set
all scores: [0.6842919  0.653286   0.63819187 0.6732945 ]
	 validation score: 	 0.662  +/-  0.036
	best model test score: 		 0.689
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set
all scores: [0.14672251 0.15110454 0.13723825 0.1275094 ]
	 validation score: 	 0.141  +/-  0.018
	best model test score: 		 0.138
--------------------------------------------
france_out_closest_point_mean_handle_custom_set
all scores: [-24.86678018   0.5304591    0.47770628   0.56657461]
	 validation score: 	 -5.823  +/-  21.990
	best model test score: 		 0.601
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set
all scores: [0.51655417 0.53508659 0.51703452 0.53220658]
	 validation score: 	 0.525  +/-  0.017
	best model test score: 		 0.530
--------------------------------------------


In [10]:
# Fit on the pooled dataset, then score that single model on each region's
# test split.
print("\n ------- All datas ------- \n")
best_all_estimator = model_train_test(data=data_all, model=model, cv=cv)

print(" ------- Best all estimator on regional data ------- \n")
apply_trained_model(datas=datas_closest_point, model=best_all_estimator)


 ------- All datas ------- 

Whole dataset
all scores: [0.76476679 0.77697724 0.77171997 0.77770604]
	 validation score: 	 0.773  +/-  0.010
	best model test score: 		 0.770
--------------------------------------------
 ------- Best all estimator on regional data ------- 

bulgaria_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.463
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set 	best all data model score: 		 -0.168
--------------------------------------------
france_out_closest_point_mean_handle_custom_set 	best all data model score: 		 -0.894
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.134
--------------------------------------------


### SVM

In [7]:
# Candidate values for the support-vector-regression grid search.
C = [0.1, 1, 10]
gamma = ['auto', 0.1, 0.01]
kernel = ['linear', 'rbf']

# Keys carry the 'svr__' prefix of the pipeline step they configure.
grid_params = dict(
    svr__C=C,
    svr__gamma=gamma,
    svr__kernel=kernel,
)
grid_params

{'svr__C': [0.1, 1, 10],
 'svr__gamma': ['auto', 0.1, 0.01],
 'svr__kernel': ['linear', 'rbf']}

In [8]:
print(" ------- Regional datas ------- \n")
# Scaled support vector regressor; the 'svr' step name matches the
# grid_params prefix.
svr = SVR()
model = Pipeline(steps=[('scaler', scaler), ('svr', svr)])

grid_search = GridSearchCV(model, grid_params, cv=cv, n_jobs=-1, verbose=0)

# The same search object is re-fitted on every regional dataset.
for data in datas_closest_point:
    grid_search_train_test(data=data, grid_search=grid_search)

 ------- Regional datas ------- 

bulgaria_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.818
	best params:  {'svr__C': 10, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
	best model test score: 		 0.832
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.112
	best params:  {'svr__C': 10, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
	best model test score: 		 0.136
--------------------------------------------
france_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.281
	best params:  {'svr__C': 1, 'svr__gamma': 'auto', 'svr__kernel': 'linear'}
	best model test score: 		 0.421
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.528
	best params:  {'svr__C': 10, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
	best model test score: 		 0.536
--------------------------------------------


In [9]:
# Tune on the pooled dataset, then score that single estimator on each
# region's test split.
print("\n ------- All datas ------- \n")
best_all_estimator = grid_search_train_test(data=data_all, grid_search=grid_search)

print(" ------- Best all estimator on regional data ------- \n")
apply_trained_model(datas=datas_closest_point, model=best_all_estimator)


 ------- All datas ------- 

Whole dataset
	cross validation best score: 	 0.876
	best params:  {'svr__C': 10, 'svr__gamma': 0.1, 'svr__kernel': 'rbf'}
	best model test score: 		 0.883
--------------------------------------------
 ------- Best all estimator on regional data ------- 

bulgaria_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.772
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.116
--------------------------------------------
france_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.191
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.472
--------------------------------------------


### Neural Network

In [10]:
# Candidate values for the multi-layer-perceptron grid search.
max_iter = [500, 1000]
alpha = [0.001, 0.0001, 0.00001]
solver = ['lbfgs']
hidden_layer_sizes = [(100, 50), (46, 23), (46, 100, 46), (46, 100, 46, 20)]

# Keys carry the 'nn__' prefix of the pipeline step they configure.
grid_params = dict(
    nn__solver=solver,
    nn__max_iter=max_iter,
    nn__alpha=alpha,
    nn__hidden_layer_sizes=hidden_layer_sizes,
)
grid_params

{'nn__solver': ['lbfgs'],
 'nn__max_iter': [500, 1000],
 'nn__alpha': [0.001, 0.0001, 1e-05],
 'nn__hidden_layer_sizes': [(100, 50),
  (46, 23),
  (46, 100, 46),
  (46, 100, 46, 20)]}

In [11]:
print(" ------- Regional datas ------- \n")
# Scaled multi-layer perceptron; the 'nn' step name matches the grid_params
# prefix.
nn = MLPRegressor()
model = Pipeline(steps=[('scaler', scaler), ('nn', nn)])

grid_search = GridSearchCV(model, grid_params, cv=cv, n_jobs=-1, verbose=0)

# The same search object is re-fitted on every regional dataset.
for data in datas_closest_point:
    grid_search_train_test(data=data, grid_search=grid_search)

 ------- Regional datas ------- 

bulgaria_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.885
	best params:  {'nn__alpha': 1e-05, 'nn__hidden_layer_sizes': (46, 100, 46, 20), 'nn__max_iter': 1000, 'nn__solver': 'lbfgs'}
	best model test score: 		 0.894
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.294
	best params:  {'nn__alpha': 1e-05, 'nn__hidden_layer_sizes': (100, 50), 'nn__max_iter': 500, 'nn__solver': 'lbfgs'}
	best model test score: 		 0.184
--------------------------------------------
france_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.611
	best params:  {'nn__alpha': 0.0001, 'nn__hidden_layer_sizes': (100, 50), 'nn__max_iter': 500, 'nn__solver': 'lbfgs'}
	best model test score: 		 0.674
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set
	cross validation best score: 	 0.643
	best params:  {'nn__alph

In [12]:
# Tune on the pooled dataset, then score that single estimator on each
# region's test split.
print("\n ------- All datas ------- \n")
best_all_estimator = grid_search_train_test(data=data_all, grid_search=grid_search)

print(" ------- Best all estimator on regional data ------- \n")
apply_trained_model(datas=datas_closest_point, model=best_all_estimator)


 ------- All datas ------- 

Whole dataset
	cross validation best score: 	 0.907
	best params:  {'nn__alpha': 1e-05, 'nn__hidden_layer_sizes': (46, 100, 46), 'nn__max_iter': 1000, 'nn__solver': 'lbfgs'}
	best model test score: 		 0.909
--------------------------------------------
 ------- Best all estimator on regional data ------- 

bulgaria_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.843
--------------------------------------------
finland_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.167
--------------------------------------------
france_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.295
--------------------------------------------
italy_out_closest_point_mean_handle_custom_set 	best all data model score: 		 0.577
--------------------------------------------


### Leave one region out

In [114]:
# Leave-one-region-out evaluation: train a scaled random forest on all regions
# but one and score it on the region that was left out.
#
# The regions come from `datas_closest_point` (the outlier handler selected
# above) rather than from positional indices into the raw `datas` list, so a
# dataset built with another handler can never slip into the evaluation.
# NOTE: this cell rebinds the notebook-level X_train / y_train / X_test /
# y_test names on every iteration.
reg_idx = list(range(len(datas_closest_point)))

for c in combinations(reg_idx, len(reg_idx) - 1):
    # The single region missing from the training combination is held out.
    test_idx = next(i for i in reg_idx if i not in c)

    X_train = np.vstack([datas_closest_point[i]['X'] for i in c])
    y_train = np.hstack([datas_closest_point[i]['y'] for i in c])

    X_test = datas_closest_point[test_idx]['X']
    y_test = datas_closest_point[test_idx]['y']

    rfr = RandomForestRegressor(random_state = 42)
    model = make_pipeline(scaler, rfr)
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print("Prediction of", datas_closest_point[test_idx]['name'], "score: ", "{:.3f}".format(test_score))
    print('------------------------------')
    
    

Prediction of finland_out_remove_handle_set_null score:  -26.166
------------------------------
Prediction of italy_out_closest_point_mean_handle_custom_set score:  -5.524
------------------------------
Prediction of france_out_closest_point_mean_handle_custom_set score:  -0.908
------------------------------
Prediction of bulgaria_out_closest_point_mean_handle_custom_set score:  -2.060
------------------------------
