# Using the QLattice with small data sets

In many cases, a researcher has collected a fairly small dataset of a few hundred individuals. In this notebook we apply a systematic approach to test the performance of the QLattice against the usual go-to technologies for fitting and ML.

We compare to other **interpretable** models:
- Linear models (both with and without LASSO)
- Decision Trees

And to other **ensemble models** which are more black-box:
- Random Forest
- Gradient Boosting

In [32]:
import pmlb
import pandas as pd
import feyn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import multiprocessing

In [2]:
import numpy as np

In [3]:
from scipy.stats import gaussian_kde

In [4]:
print(len(pmlb.classification_dataset_names)+len(pmlb.regression_dataset_names))

284


In [5]:
len(pmlb.regression_dataset_names)

122

In [None]:
#for name in pmlb.regression_dataset_names:
#    df = pmlb.fetch_data(name, local_cache_dir="/tmp/pmlb_data")
#    print(f"('{name}', {len(df)}, {len(df.columns)}),")   

In [6]:
datasets = pd.DataFrame([
('1027_ESL', 488, 5),
('1028_SWD', 1000, 11),
('1029_LEV', 1000, 5),
('1030_ERA', 1000, 5),
('1089_USCrime', 47, 14),
('1096_FacultySalaries', 50, 5),
('1191_BNG_pbc', 1000000, 19),
('1193_BNG_lowbwt', 31104, 10),
('1196_BNG_pharynx', 1000000, 11),
('1199_BNG_echoMonths', 17496, 10),
('1201_BNG_breastTumor', 116640, 10),
('1203_BNG_pwLinear', 177147, 11),
('1595_poker', 1025010, 11),
('192_vineyard', 52, 3),
('195_auto_price', 159, 16),
('197_cpu_act', 8192, 22),
('201_pol', 15000, 49),
('207_autoPrice', 159, 16),
('210_cloud', 108, 6),
('215_2dplanes', 40768, 11),
('218_house_8L', 22784, 9),
('225_puma8NH', 8192, 9),
('227_cpu_small', 8192, 13),
('228_elusage', 55, 3),
('229_pwLinear', 200, 11),
('230_machine_cpu', 209, 7),
('294_satellite_image', 6435, 37),
('344_mv', 40768, 11),
('4544_GeographicalOriginalofMusic', 1059, 118),
('485_analcatdata_vehicle', 48, 5),
('503_wind', 6574, 15),
('505_tecator', 240, 125),
('519_vinnie', 380, 3),
('522_pm10', 500, 8),
('523_analcatdata_neavote', 100, 3),
('527_analcatdata_election2000', 67, 15),
('529_pollen', 3848, 5),
('537_houses', 20640, 9),
('542_pollution', 60, 16),
('547_no2', 500, 8),
('556_analcatdata_apnea2', 475, 4),
('557_analcatdata_apnea1', 475, 4),
('560_bodyfat', 252, 15),
('561_cpu', 209, 8),
('562_cpu_small', 8192, 13),
('564_fried', 40768, 11),
('573_cpu_act', 8192, 22),
('574_house_16H', 22784, 17),
('579_fri_c0_250_5', 250, 6),
('581_fri_c3_500_25', 500, 26),
('582_fri_c1_500_25', 500, 26),
('583_fri_c1_1000_50', 1000, 51),
('584_fri_c4_500_25', 500, 26),
('586_fri_c3_1000_25', 1000, 26),
('588_fri_c4_1000_100', 1000, 101),
('589_fri_c2_1000_25', 1000, 26),
('590_fri_c0_1000_50', 1000, 51),
('591_fri_c1_100_10', 100, 11),
('592_fri_c4_1000_25', 1000, 26),
('593_fri_c1_1000_10', 1000, 11),
('594_fri_c2_100_5', 100, 6),
('595_fri_c0_1000_10', 1000, 11),
('596_fri_c2_250_5', 250, 6),
('597_fri_c2_500_5', 500, 6),
('598_fri_c0_1000_25', 1000, 26),
('599_fri_c2_1000_5', 1000, 6),
('601_fri_c1_250_5', 250, 6),
('602_fri_c3_250_10', 250, 11),
('603_fri_c0_250_50', 250, 51),
('604_fri_c4_500_10', 500, 11),
('605_fri_c2_250_25', 250, 26),
('606_fri_c2_1000_10', 1000, 11),
('607_fri_c4_1000_50', 1000, 51),
('608_fri_c3_1000_10', 1000, 11),
('609_fri_c0_1000_5', 1000, 6),
('611_fri_c3_100_5', 100, 6),
('612_fri_c1_1000_5', 1000, 6),
('613_fri_c3_250_5', 250, 6),
('615_fri_c4_250_10', 250, 11),
('616_fri_c4_500_50', 500, 51),
('617_fri_c3_500_5', 500, 6),
('618_fri_c3_1000_50', 1000, 51),
('620_fri_c1_1000_25', 1000, 26),
('621_fri_c0_100_10', 100, 11),
('622_fri_c2_1000_50', 1000, 51),
('623_fri_c4_1000_10', 1000, 11),
('624_fri_c0_100_5', 100, 6),
('626_fri_c2_500_50', 500, 51),
('627_fri_c2_500_10', 500, 11),
('628_fri_c3_1000_5', 1000, 6),
('631_fri_c1_500_5', 500, 6),
('633_fri_c0_500_25', 500, 26),
('634_fri_c2_100_10', 100, 11),
('635_fri_c0_250_10', 250, 11),
('637_fri_c1_500_50', 500, 51),
('641_fri_c1_500_10', 500, 11),
('643_fri_c2_500_25', 500, 26),
('644_fri_c4_250_25', 250, 26),
('645_fri_c3_500_50', 500, 51),
('646_fri_c3_500_10', 500, 11),
('647_fri_c1_250_10', 250, 11),
('648_fri_c1_250_50', 250, 51),
('649_fri_c0_500_5', 500, 6),
('650_fri_c0_500_50', 500, 51),
('651_fri_c0_100_25', 100, 26),
('653_fri_c0_250_25', 250, 26),
('654_fri_c0_500_10', 500, 11),
('656_fri_c1_100_5', 100, 6),
('657_fri_c2_250_10', 250, 11),
('658_fri_c3_250_25', 250, 26),
('659_sleuth_ex1714', 47, 8),
('663_rabe_266', 120, 3),
('665_sleuth_case2002', 147, 7),
('666_rmftsa_ladata', 508, 11),
('678_visualizing_environmental', 111, 4),
('687_sleuth_ex1605', 62, 6),
('690_visualizing_galaxy', 323, 5),
('695_chatfield_4', 235, 13),
('706_sleuth_case1202', 93, 7),
('712_chscase_geyser1', 222, 3),
('banana', 5300, 3),
('titanic', 2201, 4),
],
columns=["name","n","fcount"])

In [7]:
# Keep only datasets with at least 1000 rows; get_pmlb_data trains on a fixed-size
# subset (250 rows by default) and the remaining rows form the test set.
has_enough_rows = datasets["n"] >= 1000
chosen_datasets = datasets[has_enough_rows]
len(chosen_datasets)

48

In [8]:
datasets

Unnamed: 0,name,n,fcount
0,1027_ESL,488,5
1,1028_SWD,1000,11
2,1029_LEV,1000,5
3,1030_ERA,1000,5
4,1089_USCrime,47,14
...,...,...,...
117,695_chatfield_4,235,13
118,706_sleuth_case1202,93,7
119,712_chscase_geyser1,222,3
120,banana,5300,3


In [9]:
datasets.n.mean()

30172.43442622951

In [10]:
datasets.n.median()

500.0

# Utility functions

In [11]:
def get_pmlb_data(name, randomseed, trainsize=250):
    """Fetch a PMLB dataset and split it into a training part and a test part.

    Parameters
    ----------
    name : PMLB dataset name, passed to ``pmlb.fetch_data``.
    randomseed : forwarded as ``random_state`` so the split is reproducible.
    trainsize : forwarded as ``train_size`` of ``train_test_split`` (default 250).

    Returns
    -------
    The result of ``train_test_split`` on the fetched frame, i.e. (train, test).
    """
    # Downloads are cached in the relative directory "pmlb_data".
    full_frame = pmlb.fetch_data(name, local_cache_dir="pmlb_data")
    split = train_test_split(
        full_frame,
        train_size=trainsize,
        random_state=randomseed,
    )
    return split

from sklearn import svm, tree, linear_model, ensemble

def X(df):
    """Return every column of `df` except the last one (the feature columns)."""
    feature_count = df.shape[1] - 1
    return df.iloc[:, :feature_count] if feature_count > 0 else df.iloc[:, :0]

def y(df):
    """Return the last column of `df` (the target column) as a Series."""
    last_position = -1
    return df.iloc[:, last_position]

def fit_and_r2_score(model, train, test):
    """Fit `model` on `train` and score it on both splits.

    The last column of each frame is the target, the others are features
    (see `X` and `y`). Returns ``(train_score, test_score)`` as reported by
    ``model.score``; for scikit-learn regressors that is the R^2 coefficient.
    """
    train_features, train_target = X(train), y(train)
    model.fit(train_features, train_target)

    train_score = model.score(train_features, train_target)
    test_score = model.score(X(test), y(test))
    return train_score, test_score

# Compare to the usual suspects

In [23]:
# Accumulator for every (dataset, model, randomseed) run: one row per fit with
# the train and test scores. Starts empty; swap in the commented line below to
# resume from the CSV written by the `results.to_csv(...)` cell instead.
results = pd.DataFrame(columns=["dataset", "model", "randomseed", "train_r2", "test_r2"])
#results = pd.read_csv("results-cache-wip.csv")

In [24]:
results

Unnamed: 0,dataset,model,randomseed,train_r2,test_r2


In [25]:
results.dataset.value_counts()

Series([], Name: dataset, dtype: int64)

In [26]:
results.randomseed.value_counts()

Series([], Name: randomseed, dtype: int64)

In [27]:
results.model.unique()

array([], dtype=object)

In [28]:
#results = results[~(results["model"].str.startswith("GradientBoostingRegressor(n_estimators=300)"))].reindex()

In [29]:
# Baseline estimators fitted by fit_comparison. Results are keyed by str(model),
# so the constructor arguments here determine the "model" strings in `results`.
# NOTE(review): later cells look up keys such as
# "GradientBoostingRegressor(n_estimators=400)", "RandomForestRegressor(n_estimators=400)",
# "Lasso(alpha=0.01, max_iter=100000)" and "DecisionTreeRegressor(max_depth=2)",
# which correspond to entries that are commented out below. With the list as it
# stands those keys only exist if they were loaded from a results cache — confirm.
models = [
    linear_model.LinearRegression(),
#    linear_model.Lasso(alpha=0.01, max_iter=100000),
    linear_model.Lasso(alpha=0.05, max_iter=100000),
#    linear_model.Lasso(alpha=0.10, max_iter=100000),

#    tree.DecisionTreeRegressor(max_depth=1),
#    tree.DecisionTreeRegressor(max_depth=2),
#    tree.DecisionTreeRegressor(max_depth=4),
#    tree.DecisionTreeRegressor(max_depth=6),
    
#    ensemble.RandomForestRegressor(n_estimators=400),
    ensemble.RandomForestRegressor(n_estimators=200),
#    ensemble.RandomForestRegressor(n_estimators=100),
#    ensemble.RandomForestRegressor(n_estimators=50),

#    ensemble.GradientBoostingRegressor(n_estimators=400),
    ensemble.GradientBoostingRegressor(n_estimators=200),
#    ensemble.GradientBoostingRegressor(n_estimators=100),
#    ensemble.GradientBoostingRegressor(n_estimators=50),
]

In [30]:
def fit_comparison(name, randomseed):
    """Fit every estimator in `models` on one dataset/seed and record the scores.

    Combinations of (dataset, str(model), randomseed) already present in the
    global `results` frame are skipped, so the function can be re-run to resume.

    Parameters
    ----------
    name : PMLB dataset name.
    randomseed : seed used for the train/test split.

    Returns
    -------
    list of dict
        The rows added by this call (empty if everything was already run).
        Returning them matters when the function runs in a worker process:
        the assignment to the global `results` below only changes that
        process's copy, so the caller has to collect the returned rows.
    """
    global results

    print("Seed %i, Dataset: %s"%(randomseed,name), end="")
    train, test = get_pmlb_data(name, randomseed)
    print(" ... fetched", end="")

    # Model keys already recorded for this dataset and seed.
    same_run = (results["dataset"] == name) & (results["randomseed"] == randomseed)
    already_run = set(results.loc[same_run, "model"])

    new_rows = []
    for m in models:
        if str(m) in already_run:
            # Skip if already run
            continue
        r2_train, r2_test = fit_and_r2_score(m, train, test)
        new_rows.append({"dataset": name, "model": str(m), "randomseed": randomseed, "train_r2": r2_train, "test_r2": r2_test})

    if new_rows:
        # Concatenate once per call instead of once per model, and avoid
        # concatenating with an empty frame.
        new = pd.DataFrame(new_rows, columns=results.columns)
        results = pd.concat([results, new], ignore_index=True) if len(results) else new

    print(" ... and fitted")
    return new_rows


In [39]:
# One job per (dataset, seed) combination.
jobargs = [(name, seed) for name in chosen_datasets["name"] for seed in range(0,5) ]

# Create a pool of workers to run in parallel
with multiprocessing.Pool(processes=6) as pool:
    # Map the function to the inputs, distributing the work across multiple processes
    r = pool.starmap(fit_comparison, jobargs, chunksize=1)

# Worker processes do not share memory with this kernel: anything a worker
# assigns to its global `results` is lost when the worker exits. Only the
# values returned through `starmap` come back, so merge those here. Jobs that
# return None (or an empty list) contribute nothing.
worker_rows = [row for job_rows in r if job_rows for row in job_rows]
if worker_rows:
    gathered = pd.DataFrame(worker_rows, columns=results.columns)
    results = pd.concat([results, gathered], ignore_index=True) if len(results) else gathered



#for name in chosen_datasets["name"]:
#    for randomseed in range(0,5):
#        fit_comparison(name, randomseed)

Seed 3, Dataset: 1028_SWDSeed 2, Dataset: 1028_SWDSeed 0, Dataset: 1028_SWDSeed 1, Dataset: 1028_SWDSeed 4, Dataset: 1028_SWDSeed 0, Dataset: 1029_LEV ... fetched ... fetched ... fetched ... fetched ... fetched ... fetched ... and fitted ... and fitted

 ... and fittedSeed 1, Dataset: 1029_LEVSeed 2, Dataset: 1029_LEV ... and fitted

Seed 3, Dataset: 1029_LEVSeed 4, Dataset: 1029_LEV ... fetched ... fetched ... fetched ... fetched ... and fitted
 ... and fittedSeed 0, Dataset: 1030_ERA
 ... and fittedSeed 1, Dataset: 1030_ERA

 ... and fittedSeed 2, Dataset: 1030_ERASeed 3, Dataset: 1030_ERA ... and fitted
 ... fetchedSeed 4, Dataset: 1030_ERA ... fetched ... fetched ... fetched ... and fitted ... fetched
 ... and fitted ... and fitted ... and fittedSeed 0, Dataset: 1191_BNG_pbc ... and fitted

Seed 1, Dataset: 1191_BNG_pbcSeed 2, Dataset: 1191_BNG_pbc
 ... and fittedSeed 3, Dataset: 1191_BNG_pbc

Seed 4, Dataset: 1191_BNG_pbcSeed 0, Dataset: 1193_BNG_lowbwt ... fetched ... and fitted


Seed 2, Dataset: 588_fri_c4_1000_100 ... fetched ... and fitted
Seed 3, Dataset: 588_fri_c4_1000_100 ... and fitted
Seed 4, Dataset: 588_fri_c4_1000_100 ... fetched ... fetched ... and fitted
Seed 0, Dataset: 589_fri_c2_1000_25 ... fetched ... and fitted
Seed 1, Dataset: 589_fri_c2_1000_25 ... fetched ... and fitted
Seed 2, Dataset: 589_fri_c2_1000_25 ... fetched ... and fitted
 ... and fittedSeed 3, Dataset: 589_fri_c2_1000_25
 ... fetchedSeed 4, Dataset: 589_fri_c2_1000_25 ... fetched ... and fitted
Seed 0, Dataset: 590_fri_c0_1000_50 ... fetched ... and fitted
Seed 1, Dataset: 590_fri_c0_1000_50 ... fetched ... and fitted
Seed 2, Dataset: 590_fri_c0_1000_50 ... fetched ... and fitted
Seed 3, Dataset: 590_fri_c0_1000_50 ... fetched ... and fitted
Seed 4, Dataset: 590_fri_c0_1000_50 ... fetched ... and fitted
Seed 0, Dataset: 592_fri_c4_1000_25 ... fetched ... and fitted
Seed 1, Dataset: 592_fri_c4_1000_25 ... fetched ... and fitted
Seed 2, Dataset: 592_fri_c4_1000_25 ... fetched ... 

# Fit a qgraph for each data set

In [None]:
def fit_qmodel(edges, criterion, randomseed):
    """Fit a QLattice regressor on every chosen dataset and record its scores.

    Results are stored in the global `results` frame under the model key
    ``"QG-{edges}-{criterion}"``; dataset/seed combinations that already have a
    row for that key are skipped.

    Parameters
    ----------
    edges : maximum number of edges, passed to ``feyn.filters.MaxEdges``.
    criterion : selection criterion forwarded to ``qg.fit`` (e.g. "bic", "aic").
    randomseed : seed for the train/test split and for ``ql.reset``.
    """
    global results

    key = f"QG-{edges}-{criterion}"

    for name in chosen_datasets["name"]:
        if ((results["dataset"]==name) & (results["model"]==key) & (results["randomseed"]==randomseed)).any():
            # Skip if already run
            continue

        train, test = get_pmlb_data(name, randomseed)
        # NOTE(review): `ql` is not created in any cell visible in this notebook;
        # it must already exist in the kernel (presumably a feyn QLattice) — confirm.
        ql.reset(randomseed)

        qg = ql.get_regressor(train.columns, train.columns[-1]).filter(feyn.filters.MaxEdges(edges))

        # Search phase: fit the graph collection, report the current best, feed it back.
        for _ in range(100):
            qg.fit(train, threads=10, criterion=criterion)
            print(key, randomseed, name)
            print("Train:\t", qg[0].r2_score(train), "\nTest:\t", qg[0].r2_score(test))

            ql.update(qg.best())

        # Refinement phase: keep fitting only the top-ranked graph.
        for _ in range(1000):
            qg[0].fit(train)

        # DataFrame.append was removed in pandas 2.0; use pd.concat like fit_comparison.
        new = pd.DataFrame([{"dataset": name, "model": key, "randomseed": randomseed, "train_r2": qg[0].r2_score(train), "test_r2": qg[0].r2_score(test)}])
        results = pd.concat([results, new], ignore_index=True)

        time.sleep(6) # Protect my poor cpu

# Fit all QGraphs

In [None]:
# Fit the QLattice with both selection criteria for every seed.
# The fitting function defined in this notebook is `fit_qmodel`; the previous
# name `fit_qgraph` is not defined anywhere and raised a NameError.
for randomseed in range(0,5):
    fit_qmodel(11, "bic", randomseed=randomseed)
    fit_qmodel(11, "aic", randomseed=randomseed)

In [None]:
def hist_among(models=None, datasets=None):
    """Draw one horizontal bar chart of train/test R^2 per (dataset, seed).

    Parameters
    ----------
    models : iterable of model keys to include; defaults to every model in
        the global `results` frame.
    datasets : iterable of dataset names to plot; defaults to every dataset in
        `results`.

    Rows with ``test_r2 <= -1`` are left out so that a few very bad fits do not
    dominate the axis. Bars are sorted by test R^2.
    """
    if models is None:
        models = results["model"].unique()
    if datasets is None:
        datasets = results["dataset"].unique()

    # Previously `models` was resolved but never applied; filter on it here.
    selected = results[results["model"].isin(models) & (results["test_r2"]>-1)]

    for dataset in datasets:
        subset = selected[selected["dataset"] == dataset]
        for seed in subset["randomseed"].unique():
            per_seed = subset[subset["randomseed"]==seed].sort_values(by="test_r2")
            per_seed.plot.barh(x="model", y=["test_r2","train_r2"], title=dataset+" "+str(seed),figsize=(8,6))
                

In [None]:
hist_among(None,["598_fri_c0_1000_25"])

In [None]:
#for name in chosen_datasets["name"]:
#    for seed in range(0,5):
#        subres = results[(results["dataset"] == name) & (results["randomseed"]==seed) & (results["test_r2"]>-1)].sort_values(by="test_r2")
#        if len(subres):
#            subres.plot.barh(x="model", y=["test_r2","train_r2"], title=name,figsize=(8,6))

In [40]:
# Persist the accumulated results so a later session can resume from them.
results.to_csv("results-cache-wip.csv", index=False)

# Compare all models

In [None]:
def rank_among(models=None, rankpositions = None):
    """Award ranking points to models based on test R^2 per dataset and seed.

    For every (dataset, randomseed) pair in the global `results` frame the
    selected models are ordered by descending ``test_r2``. The model at rank
    ``k`` (0-based) receives ``rankpositions - k`` points, for the first
    ``rankpositions`` ranks. With ``rankpositions=1`` this counts first places.

    Parameters
    ----------
    models : iterable of model keys to compare; defaults to all models in `results`.
    rankpositions : number of ranks that earn points; defaults to ``len(models) - 1``.

    Returns
    -------
    pandas.DataFrame with columns "model" and "points", sorted by ascending points.
    """
    if models is None:
        models = results["model"].unique()

    if rankpositions is None:
        rankpositions = len(models)-1

    # A non-positive value awards no points (and must not turn into a negative slice).
    scored_ranks = max(rankpositions, 0)

    # Only consider results for the chosen models
    res = results[results["model"].isin(models)].sort_values(by="test_r2", ascending=False)

    points = {m: 0 for m in models}

    # groupby keeps the descending test_r2 order inside each group.
    for _, subset in res.groupby(["dataset", "randomseed"], sort=False):
        # Slicing (rather than indexing each rank) tolerates dataset/seed pairs
        # that have fewer rows than `rankpositions`, e.g. when a model is missing.
        for rank, m in enumerate(subset["model"].iloc[:scored_ranks]):
            points[m] += rankpositions - rank

    return pd.DataFrame(list(points.items()), columns=["model", "points"]).sort_values(by="points", ascending=True)

In [None]:
rank_among(rankpositions=1).plot.barh(x="model", y="points", label="First places", figsize=(8,6), xlabel="")

In [None]:
rank_among(rankpositions=1)

In [None]:
rank_among(rankpositions=5).plot.barh(x="model", y="points", label="Points", figsize=(8,6), xlabel="")

In [None]:
rank_among(rankpositions=5)

In [None]:
# Weighted ranking (five scored positions) among one model per family.
# The bars show points, not first places, so the legend is labelled "Points"
# like the other rankpositions=5 plot.
rank_among(
    models=["QG-11-bic", 
            "GradientBoostingRegressor(n_estimators=400)", 
            "RandomForestRegressor(n_estimators=400)",
            "Lasso(alpha=0.01, max_iter=100000)",
            "DecisionTreeRegressor(max_depth=1)"
           ],
    rankpositions=5
).plot.barh(x="model", y="points", label="Points", figsize=(8,1.6), xlabel="")

### Rankings to LaTeX table

In [None]:
# One representative model per family, used for the two "best" rankings below.
_latex_best_models = [
    "QG-11-bic",
    "Lasso(alpha=0.1, max_iter=100000)",
    "GradientBoostingRegressor(n_estimators=400)",
    "RandomForestRegressor(n_estimators=400)",
    "DecisionTreeRegressor(max_depth=2)",
]

# Rankings over every model in `results`.
first_scoring = rank_among(rankpositions=1)
second_scoring = rank_among(rankpositions=5)

# The same two rankings restricted to the representative models.
best_scoring = rank_among(models=list(_latex_best_models), rankpositions=1)
best_weighted = rank_among(models=list(_latex_best_models), rankpositions=5)

In [None]:
def _as_named_column(scoring, column_name):
    """Index a rank_among() table by model and rename "points" to `column_name`."""
    return scoring.set_index("model").rename(columns={"points": column_name})

# Each table is indexed by its OWN model column. The previous version indexed
# `second_scoring` with `first_scoring.model`; both frames are sorted by their
# own points, so that attached weighted scores to the wrong model names.
first_scoring = _as_named_column(first_scoring, "First places")
second_scoring = _as_named_column(second_scoring, "Weighted scoring")
best_scoring = _as_named_column(best_scoring, "First places for best")
best_weighted = _as_named_column(best_weighted, "Weighted scoring for best")

In [None]:
# Combine the four rankings into one table, best first-place count on top.
latex_results = (
    first_scoring
    .join(second_scoring)
    .join(best_scoring)
    .join(best_weighted)
    .sort_values(by="First places", ascending=False)
)

# Models outside the "best" subset have no value in these columns: show them as
# blank cells, and show the rest as integers.
for sparse_column in ("First places for best", "Weighted scoring for best"):
    as_int = latex_results[sparse_column].replace(np.nan, 0).astype(int)
    latex_results[sparse_column] = as_int
    latex_results[sparse_column] = latex_results[sparse_column].replace(0, "")

print(latex_results.to_latex())

### LaTeX formatting for the figures that we include in the article

In [None]:
from matplotlib.colors import ListedColormap
# Use Abzu colors

abzu_rgba = {
    'Dark Jungle Green': (0.11764705882352941, 0.11764705882352941, 0.11764705882352941, 1.0),
    'Golden Yellow': (1.0, 1.0, 0.0392156862745098, 1.0),
    'Guppie Green': (0.0, 0.9411764705882353, 0.5098039215686274, 1.0),
    'Hot Magenta': (1.0, 0.11764705882352941, 0.7843137254901961, 1.0),
    'Majorelle Blue': (0.27450980392156865, 0.27450980392156865, 0.9019607843137255, 1.0),
    'Robin Egg Blue': (0.0, 0.7843137254901961, 0.7843137254901961, 1.0),
    'Safety Orange': (1.0, 0.39215686274509803, 0.0392156862745098, 1.0),
    'Spiro Disco Ball': (0.0392156862745098, 0.7058823529411765, 0.9803921568627451, 1.0)
}

def get_abzu_cmap(cname):
    """Build a single-hue colormap that only varies in opacity.

    `cname` is a key of `abzu_rgba`. The returned ListedColormap has 256
    entries sharing that colour's RGB values, with alpha rising linearly from
    0.03 to 0.9; the alpha stored in `abzu_rgba` is ignored.
    """
    red, green, blue, _ = abzu_rgba[cname]
    alphas = np.linspace(0.03, 0.9, 256)

    rgba = np.empty((alphas.size, 4))
    rgba[:, 0] = red
    rgba[:, 1] = green
    rgba[:, 2] = blue
    rgba[:, 3] = alphas
    return ListedColormap(rgba)

In [None]:
import matplotlib as mpl

# Global matplotlib styling for the article figures; affects every plot drawn
# after this cell.
mpl.rc('xtick',labelsize=10)
mpl.rc('ytick',labelsize=10)

mpl.rc('axes',labelsize=15)
mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
# usetex=True hands all text rendering to LaTeX, so a working LaTeX installation
# is required from here on.
mpl.rc('text', usetex=True)

In [None]:
# Overview of the chosen datasets: observations vs. feature count, log-log axes.
fig, ax = plt.subplots(figsize=(4,4))

summary_style = dict(
    loglog=True,
    xlabel="Number of observations",
    ylabel="Number of features",
    color=abzu_rgba['Majorelle Blue'],
    s=40,
)
chosen_datasets.plot.scatter(x="n", y="fcount", ax=ax, **summary_style)
plt.tight_layout()
#plt.savefig("/home/jaan/devel/regressionArticle/jmlr/manuscript/figures/dataset_summary.pdf")

## Search for a results figure

In [None]:
# One representative model key per family for the results figure. The keys must
# match the strings stored in results["model"] exactly.
# NOTE(review): this list uses Lasso alpha=0.01, while the LaTeX ranking cell
# uses "Lasso(alpha=0.1, max_iter=100000)" and the figure title below says
# alpha=0.1; the `models` cell currently has only alpha=0.05 enabled. Confirm
# which value is intended.
best_models = [
    "QG-11-bic", 
    "GradientBoostingRegressor(n_estimators=400)", 
    "RandomForestRegressor(n_estimators=400)",
    "Lasso(alpha=0.01, max_iter=100000)",
    "DecisionTreeRegressor(max_depth=2)"
    ]

In [None]:
best_results = results[results.model.isin(best_models)]

In [None]:
best_results

In [None]:
agg_dict = {'train_r2': 'mean', 'test_r2': 'mean'}

def _mean_r2_by_dataset(model_tag):
    """Mean train/test R^2 per dataset over the rows of `best_results` whose
    model key contains `model_tag` (the mean runs over seeds)."""
    is_tagged = best_results.model.apply(lambda model_key: model_tag in model_key)
    return best_results[is_tagged].groupby('dataset').agg(agg_dict)

sr_results = _mean_r2_by_dataset('QG')
dt_results = _mean_r2_by_dataset('DecisionTree')
gb_results = _mean_r2_by_dataset('GradientBoosting')
rf_results = _mean_r2_by_dataset('RandomForest')
lr_results = _mean_r2_by_dataset('Lasso')

In [None]:
f'{sr_results=}'.split('=')[0]

In [None]:
(sr_results.test_r2 < 0).sum()

In [None]:
# Number of negative test R^2 results
for name, method_df in {"sr": sr_results, "dt": dt_results, "gb": gb_results, "rf": rf_results, "lr": lr_results}.items():
    print(name, (method_df.test_r2 < 0).sum())
    
del name
del method_df

In [None]:
def df_to_cloud(df):
    """Return the (train_r2, test_r2) points of `df` as a 2 x N array.

    Row 0 holds the training scores and row 1 the test scores, which is the
    (dimensions, points) layout `gaussian_kde` takes as its dataset.
    """
    # Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
    # 1-D array without the warning.
    return np.vstack([df.train_r2.to_numpy(), df.test_r2.to_numpy()])

In [None]:
Xr, Yr = np.mgrid[0:1:0.01, 0:1:0.01]
all_positions = np.vstack([Xr.ravel(), Yr.ravel()])

In [None]:
list(abzu_rgba)

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
for sax in ax.flatten():
    sax.set_ylim(0,1)
    sax.set_xlim(0,1)

# Shared axis labels for the whole grid instead of one label per panel.
fig.text(0.5, 0.04, r'Training $R^2$', ha='center', size=20)
fig.text(0.04, 0.5, r'Validation $R^2$', va='center', rotation='vertical', size=20)


def _draw_r2_panel(panel, method_results, color_name, title):
    """Draw one train-vs-test R^2 panel: KDE background, diagonal and scatter."""
    # Kernel density estimate of the points, evaluated on the unit-square grid.
    scatter_density = np.reshape(gaussian_kde(df_to_cloud(method_results))(all_positions).T, Xr.shape)
    panel.imshow(np.rot90(scatter_density), extent=[0,1,0,1], alpha=.5, cmap=get_abzu_cmap(color_name))
    # Dashed diagonal: points below it score worse on test than on train.
    panel.plot(np.linspace(0,1,100), np.linspace(0,1,100), ls="--", color="black", alpha=0.4)
    panel.scatter(method_results.train_r2, method_results.test_r2, color=abzu_rgba[color_name])
    panel.set_title(title)


# (axes, per-dataset mean results, Abzu colour name, panel title)
# NOTE(review): the titles hard-code hyperparameters (Lasso alpha=0.1, 400
# estimators, depth 2); check that they match the keys in `best_models`.
panel_specs = [
    (ax[0,0], sr_results, 'Spiro Disco Ball', r"\textbf{(A)} QLattice with BIC"),
    (ax[0,1], lr_results, 'Guppie Green', r"\textbf{(B)} Lasso Regression ($\alpha=0.1$)"),
    (ax[0,2], dt_results, 'Safety Orange', r"\textbf{(C)} Decision Tree (Max depth 2)"),
    (ax[1,0], gb_results, 'Hot Magenta', r"\textbf{(D)} Gradient Boosting (400)"),
    (ax[1,1], rf_results, 'Majorelle Blue', r"\textbf{(E)} Random Forest (400)"),
]
for panel, method_results, color_name, title in panel_specs:
    _draw_r2_panel(panel, method_results, color_name, title)

# Five methods on a 2x3 grid: hide the unused sixth panel.
ax[1,2].set_axis_off()

#plt.savefig("/home/jaan/devel/regressionArticle/jmlr/manuscript/figures/scatters.pdf")