In [None]:
import sys

# sys.path.clear()  # Clear all paths
# sys.path.remove('./path/to/remove') # Remove a selected path

# Basepath
basepath = "../"  # Project directory (relative to the notebook location)

# Active Learning path
AL_PATH = basepath + "04_Active_Learning/"

# Data
DATA_PATH = basepath + "data/"

# Path to conda environment
# NOTE: machine-specific absolute path; adjust when running on another host.
ENV_PATH = "/home/fhwn.ac.at/202375/.conda/envs/thesis/lib"

# Resultspath
RESULTS_PATH = AL_PATH + "results/"

# Figure
FIGURE_PATH = RESULTS_PATH + "figures/"

# AL Scripts
AL_SCRIPTS_PATH = basepath + "al_scripts"

# Logging
LOG_DIR = AL_PATH + "logs/"

# Add the paths.
# A list (instead of a set) keeps the module search order identical between
# kernel restarts, and the membership test prevents duplicate entries when
# this cell is executed more than once.
for _path in [
    basepath,
    DATA_PATH,
    FIGURE_PATH,
    ENV_PATH,
    RESULTS_PATH,
    AL_SCRIPTS_PATH,
]:
    if _path not in sys.path:
        sys.path.append(_path)

# remove a selected path
# sys.path.remove('')

sys.path  # Check if the path is correct

### Logging

In [None]:
LOG_DIR

In [None]:
# import the logging specifications from file 'logging_config.py'
from al_lib.logging_config import create_logger
import datetime

# Date stamp (YYYY-MM-DD) that is appended to all generated file names
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")

# Notebook name (is also used when saving the notebook) and the derived
# name of the exported HTML file
notebook_name = "04_active_learning_pls.ipynb"
_notebook_stem = notebook_name.split(".")[0]
output_name = _notebook_stem + "_" + date + ".html"

# Location of the log file: <LOG_DIR>/<notebook stem>_<date>.log
log_file_name = _notebook_stem + "_" + date + ".log"
log_file_dir = str(LOG_DIR)
log_file_path = log_file_dir + "/" + log_file_name
# print(f"Log file path: {log_file_path}")

# Create the project logger. Note that the name `logging` refers to this
# logger object from here on, not to the standard-library module.
logging = create_logger(__name__, log_file_path=log_file_path)

# First entry, marks the start of the run
logging.info("Logging started")

## Import Packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge as KRR
import pandas as pd

In [None]:
## Turn off sklearn warnings
from warnings import simplefilter
import warnings

from sklearn.exceptions import ConvergenceWarning

simplefilter("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", message=".*sklearn/cross_decomposition/_pls.py:336: UserWarning: y residual is constant*.", category=UserWarning, append = False)

### Import Data

#### Import PS20191107_2deriv_gegl.csv

In [None]:
# Load the second-derivative data set.
# The file is semicolon-separated with a decimal comma; rows that cannot be
# parsed are skipped instead of aborting the import.
path_2nd_deriv = DATA_PATH + "/PS20191107_2deriv_gegl.csv"
data_2nd_deriv_raw = pd.read_csv(
    path_2nd_deriv, sep=";", decimal=",", encoding="utf-8", on_bad_lines="skip"
)

# Histogram of the 'year' column (50 bins) as a first look at the age distribution
year_raw = data_2nd_deriv_raw["year"]
plt.hist(year_raw, bins=50)


In [None]:
# Reduce the dataset: exclude samples with year < -6000.

AGE_CUTOFF_YEAR = -6000

# A single mask drives both the logged count and the removal, so the two can
# no longer disagree. (Before, rows with year == -6000 were removed by the
# `>` filter without being included in the `<` count.)
# Rows with a missing year compare False and are therefore removed as well.
keep_mask = data_2nd_deriv_raw["year"] >= AGE_CUTOFF_YEAR
n_removed = int((~keep_mask).sum())
logging.info(f"Amount of samples with age older than -6000: {n_removed}")

# remove these samples
data_2nd_deriv_raw = data_2nd_deriv_raw[keep_mask]

# plot distribution of age in histogram
plt.hist(data_2nd_deriv_raw["year"], bins=50)
plt.xlabel("year")
plt.ylabel("count")
plt.title("Distribution of age after removal of samples with year < -6000")
plt.show()


In [None]:
data_2nd_deriv = data_2nd_deriv_raw.rename(columns={"Unnamed: 0": "Name"})

# Give every column a numeric dtype where possible:
#   * 'year' is cast to int,
#   * every other column is tried as float; columns that cannot be cast are
#     reported and left unchanged.
for col in data_2nd_deriv.columns:
    if col != "year":
        try:
            data_2nd_deriv[col] = data_2nd_deriv[col].astype("float")
        except (ValueError, TypeError):
            print(f"'{col}' could not be converted. Continue with other column(s).")
    else:
        data_2nd_deriv[col] = data_2nd_deriv[col].astype("int")
        print(f"'{col}' has been converted to 'int'.")

### Import dpsDeriv1200.csv

In [None]:
def _strip_x(label):
    """Return the column label with every 'X' character removed."""
    return label.replace("X", "")


# Load the comma-separated data set and remove the 'X' characters from the
# column labels.
data_dps_deriv_1200 = pd.read_csv(
    DATA_PATH + "/dpsDeriv1200.csv", encoding="utf-8", decimal=".", sep=","
).rename(columns=_strip_x)
data_dps_deriv_1200

In [None]:
# Define general settings

# Switch for the dataset
# Select from (data_1200, data_full) or other if implemented
data = data_dps_deriv_1200

# Switch for testing mode (use only 10% of the data, among others)
testing = True

# Seed for all randomized processes
# random_state = np.random.RandomState(202375)
random_state = 202375

######################################################
# Number of parallel jobs and the mode banner depend on the testing switch
n_jobs = 20 if testing else 40
mode_messages = (
    [
        "Testing mode for Cross Validation",
        "consider Splitting the data for faster modelling",
    ]
    if testing
    else ["Extensive mode for Cross Validation"]
)
for mode_message in mode_messages:
    print(mode_message)
######################################################

## Split into feature and target variables

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column of float dtype in the selected dataset
# NOTE(review): this assumes 'year' is not stored as float; otherwise the
# target would also be part of X - confirm with data.dtypes.
X = data.select_dtypes("float")
# Target: the 'year' column
y = data["year"]
# Show the dimensions of features and target
X.shape, y.shape

## Validation

Since not every regression method is able to estimate its own prediction accuracy, a part of the data is held out as a validation set.

In [None]:
# Columns without any variation (std = 0.0) are dropped; the standard
# deviation is computed once and reused for counting and filtering.
feature_std = X.std()
is_constant = feature_std == 0.0
logging.info(f"Number of columns dropped, where std = 0.0 in X: {is_constant.sum()}")

# keep only the columns that are not flagged as constant
X = X.loc[:, ~is_constant]
logging.info(f"Dimensions of X after dropping columns with std = 0.0: {X.shape}")
logging.info(f"Dimensions of Y: {y.shape}")

In [None]:
# Step 1: hold out 10% of all samples as validation set
X_remainder, X_val, y_remainder, y_val = train_test_split(
    X, y, random_state=random_state, test_size=0.1
)

# Step 2: divide the remaining samples into training and test (30%) set
(X_train, X_test, y_train, y_test) = train_test_split(
    X_remainder, y_remainder, random_state=random_state, test_size=0.3
)
logging.info("Split of the dataset into Training, Test and Validation set")

In [None]:
# Sanity check: training, test and validation parts must add up to the full
# data set, for the features as well as for the target.
for full_set, parts in (
    (X, (X_train, X_test, X_val)),
    (y, (y_train, y_test, y_val)),
):
    assert sum(part.shape[0] for part in parts) == full_set.shape[0]

In [None]:
# calculate percentage for each set

# share of the full data set, rounded to two decimal places
calc = lambda x: round(x / len(X), 2)

# log the information
# The share is rendered with the `%` format specifier; multiplying the
# rounded float by 100 produced artefacts such as 7.000000000000001%.
logging.info(f"Training set: {len(X_train)} ({calc(len(X_train)):.0%})")
logging.info(f"Test set: {len(X_test)} ({calc(len(X_test)):.0%})")
logging.info(f"Validation set:{len(X_val)} ({calc(len(X_val)):.0%})")

## Define Score metrics

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import root_mean_squared_error

# Create a scorer based on the Root Mean Squared Error (RMSE).
# NOTE(review): according to the scikit-learn documentation,
# greater_is_better=False makes the scorer return the negated metric, so
# reported scores are expected to be <= 0 - confirm before comparing values.

scoring = make_scorer(root_mean_squared_error, greater_is_better=False)
# scoring = make_scorer(mean_squared_error, greater_is_better=False, squared=False)
# Write the scorer representation to the log
logging.info(f"Scorer: {scoring}")

## Implementation of AL with IDW (Bemporad, 2023).

This implementation follows the IDW approach of Bemporad (2023), starting from its step 3.1 (Initialization).

In [None]:
# select N samples for initial modelling via kmeans clustering
# the distances to the cluster centers can be used to select the most
# informative samples for the initial training set

# 1% of the training set; `//` is floor division (rounds down). At least one
# cluster is required, otherwise KMeans cannot be fitted on small data sets.
n_samples = max(1, len(X_train) // 100)

from sklearn.cluster import KMeans

# alternatively the number of initial clusters can be defined manually
kmeans_clusters = n_samples

# define the kmeans model
kmeans = KMeans(n_clusters=kmeans_clusters, random_state=random_state)
# run the kmeans model on the training set
kmeans.fit(X_train)
# get the cluster centers
cluster_centers = kmeans.cluster_centers_

# calculate the distance of each sample to every cluster center as
# squared euclidean distance -> array of shape (n_train_samples, n_clusters)
from sklearn.metrics.pairwise import euclidean_distances

distances = euclidean_distances(X_train, cluster_centers, squared=True)

stepsize = 1

# for every sample: the index of the cluster center that is closest to it
dist_sq_eu_argmin = np.argmin(distances, axis=1)

# plot how many samples are assigned to each cluster
plt.hist(dist_sq_eu_argmin, bins=kmeans_clusters)
plt.title("Distribution of samples")
plt.xlabel("Cluster")
plt.ylabel("Number of samples")
plt.savefig(FIGURE_PATH + "/kmeans_cluster_distr.png")
plt.show()

# plot the (squared) distance of every sample to its closest cluster center
plt.hist(np.min(distances, axis=1), bins=100)
plt.title("Distribution of distances")
plt.xlabel("Distance")
plt.ylabel("Number of samples")
plt.savefig(FIGURE_PATH + "/kmeans_cluster_dist.png")
plt.show()

In [None]:
# create a pca plot of the samples
from sklearn.decomposition import PCA

# store the explained variance ratio of every fitted model (2 ... 13 components)
explained_variance_ratio = []
pca_2 = None

for i in range(2, 14):
    pca = PCA(n_components=i, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train)
    explained_variance_ratio.append(pca.explained_variance_ratio_)
    # retain the information if i == 2 for plotting 2 Dimensional PCA
    if i == 2:
        X_train_pca_2 = X_train_pca
        pca_2 = pca

# The following cells project the cluster centers with `pca` and plot them on
# top of `X_train_pca_2`. Bind `pca` to the 2-component model that produced
# `X_train_pca_2`, so samples and centers are projected by the same fit
# (otherwise `pca` would be the last model of the loop, with 13 components).
pca = pca_2

# one row per fitted model, one column per component
explained_variance_ratio = pd.DataFrame(explained_variance_ratio)

# total explained variance of each model = sum over its components
explained_variance_ratio["explained_variance"] = explained_variance_ratio.sum(axis=1)

# show the table of explained variance ratio
explained_variance_ratio.head(n=14)

In [None]:
# create the elbow plot

# plot the sum of explained variance ratio against number of components
plt.plot(range(2, 14), explained_variance_ratio["explained_variance"])
plt.title("Explained variance ratio by number of components")
plt.xlabel("Number of components")
plt.ylabel("Explained variance ratio")

# `plt.show` without parentheses only references the function and never
# renders the figure explicitly; call it.
plt.show()

In [None]:
# Scatter plot of the training samples on the first two principal components,
# coloured by the index of their closest cluster center.
# be aware that the cluster centers were calculated in the original feature space
pc1 = X_train_pca_2[:, 0]
pc2 = X_train_pca_2[:, 1]
plt.scatter(pc1, pc2, c=dist_sq_eu_argmin, alpha=0.5)  # alpha: marker transparency

# project the cluster centers into the pca space (once) and highlight them
centers_pca = pca.transform(cluster_centers)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c="red", marker="x")

plt.title("PCA plot samples and cluster centers")
plt.xlabel("Principial Component 1")
plt.ylabel("Principial Component 2")
# smaller tick labels on the x-axis
plt.xticks(fontsize=8)

# legend entries in plotting order: samples first, then the centers
plt.legend(["Samples", "Cluster Centers"])
# save the plot
plt.savefig(FIGURE_PATH + "/pca_plot_generic.png")
plt.show()

In [None]:
# dist_sq_eu_argmin is the index/name of the cluster center that is closest to each sample
unique, counts = np.unique(dist_sq_eu_argmin, return_counts=True)

# bar chart of the number of samples assigned to each cluster
plt.bar(unique, counts)
plt.title("Number of Samples in each cluster")
plt.xlabel("Cluster")
plt.ylabel("Number of samples")
plt.savefig(FIGURE_PATH + "/cluster_distribution.png")
# `plt.show` without parentheses never calls the function; call it.
plt.show()


In [None]:
# retrieve the sample with the smallest distance to any cluster center
# (the corresponding sample should be selected as the first sample) and the
# sample with the largest distance to any cluster center.
# `closest_id` is used instead of `id` to avoid shadowing the builtin.
closest_id = np.argmin(np.min(distances, axis=1))
id_max = np.argmax(np.max(distances, axis=1))

# printed: sample position, index of its closest cluster, squared distance
# (previously the last value was the cluster index again, not a distance)
print("Closest Sample: ", closest_id, dist_sq_eu_argmin[closest_id], np.min(distances[closest_id]))
print("Farthes Sample: ", id_max, dist_sq_eu_argmin[id_max], np.max(distances[id_max]))

# pcaplot with the highlighted sample
plt.scatter(X_train_pca_2[:, 0], X_train_pca_2[:, 1], c=dist_sq_eu_argmin, alpha=0.5)

# highlight the cluster centers (projected once into the pca space)
centers_pca = pca.transform(cluster_centers)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c="red", marker="x")

# highlight the selected samples
plt.scatter(X_train_pca_2[closest_id, 0], X_train_pca_2[closest_id, 1], c="pink", marker="o")
plt.scatter(X_train_pca_2[id_max, 0], X_train_pca_2[id_max, 1], c="black", marker="o")

# Optional plotting possibilities
# annotate the closest sample
# plt.annotate(f"Sample {closest_id}", (X_train_pca_2[closest_id, 0], X_train_pca_2[closest_id, 1]))
# plt.annotate(f"Sample {id_max}", (X_train_pca_2[id_max, 0], X_train_pca_2[id_max, 1]))
# center on the selected sample
# plt.xlim(X_train_pca_2[closest_id, 0] - 0.5, X_train_pca_2[closest_id, 0] + 0.5)
# plt.ylim(X_train_pca_2[closest_id, 1] - 0.5, X_train_pca_2[closest_id, 1] + 0.5)

plt.title("PCA closest sample to cluster center")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(["Samples", "Cluster Center", f"Closest Sample (ID:{closest_id})", f"Farthest Sample (ID:{id_max})"])

# save the plot
plt.savefig(FIGURE_PATH + "/pca_plot_highligh_MinMaxDist.png")
plt.show()

In [None]:
# retrieve, for every cluster, the position of the sample that is closest to
# the cluster center

# The number of clusters is taken from the fitted centers, so this cell does
# not depend on `n_samples`, which is reassigned further down the notebook.
kmeans_clusters = cluster_centers.shape[0]

initial_samples_sample_ids = []
for cluster in range(kmeans_clusters):
    # positions (row numbers in X_train) of all samples assigned to this cluster
    member_ids = np.flatnonzero(dist_sq_eu_argmin == cluster)
    # among the members, pick the one with the smallest distance to the center
    # (previously the first member was taken, regardless of its distance)
    nearest = member_ids[np.argmin(distances[member_ids, cluster])]
    initial_samples_sample_ids.append(int(nearest))
initial_samples_sample_ids

## Active learning step (first step)

##  Introduction

For each active learning run we start with an initial set of samples that allows us to fit a first model. Depending on the chosen strategy, additional samples are then added and the model is updated.

**Initial Models**  
[X] random samples  
[X] k-means cluster centers  

**Sampling strategies**  
[X] random sampling  
[ ] euclidean distance  

## Setup

The active learning process in my workflow receives the hyperparameters for the model implementation from the previous steps (RSCV and GSCV).

### Import optimal model parameters from csv

In [None]:
# from 03_1_rscv/rscv_results import pls_rscv_results.csv

# Load the results from the RandomizedSearchCV
# NOTE(review): absolute, machine-specific path - the cell only runs on the
# host where this file exists; consider deriving it from `basepath`.
pls_rscv_results = pd.read_csv("/home/fhwn.ac.at/202375/Zamberger_thesis_AL2024/03_Modelling/03_1_rscv/rscv_results/pls_rscv_results.csv",
    sep=",",
    decimal=".",
    encoding="utf-8",
)
# identify the best parameters via the rmse: take the row with the smallest
# value in the "RMSE" column and evaluate its "params" entry
# NOTE(review): `eval` executes whatever text is stored in the CSV. If the
# stored value is a plain dict literal, `ast.literal_eval` would be the safer
# choice - verify the stored format before switching.
params_pls = eval(pls_rscv_results.loc[pls_rscv_results["RMSE"].idxmin()].params)
params_pls

In [None]:
# retrive the best parameters as a dictionary

# params_pls = {'n_components': 45, 'max_iter':321}

In [None]:
# Initial PLS model, fitted on a handful of randomly chosen training samples

from sklearn.cross_decomposition import PLSRegression
from al_lib.helper_functions import rmse_func

pls = PLSRegression(**params_pls)

# number of samples in the initial learned set
n_samples = 8

# One joint split keeps features and target aligned: `n_samples` rows go into
# the learned set, the rest stays in the pool. (Same seed and same number of
# rows give the same partition as splitting X and y in two separate calls.)
X_Pool, X_Learned, y_Pool, y_Learned = train_test_split(
    X_train, y_train, test_size=n_samples, random_state=random_state
)

# fit on the learned set and score on the test set
pls.fit(X_Learned, y_Learned)
y_pred = pls.predict(X_test)
rmse = rmse_func(y_test, y_pred)
print(f"Initial RMSE with random sampling: {rmse}, with {len(y_Learned)} samples")

## Active Learning with random sampling

To the initial model, additional samples will be added in a randomized way. This will be used later as baseline to compare strategies.

In [None]:
# Active Learning (PLS) with random sampling

pls = PLSRegression(**params_pls)

# size of the random initial learned set
n_samples = 8

# split the training data into the pool and the initial learned set
X_Pool, X_Learned, y_Pool, y_Learned = train_test_split(
    X_train, y_train, test_size=n_samples, random_state=random_state
)

pls.fit(X_Learned, y_Learned)
# calc initial rmse
y_pred = pls.predict(X_test)
rmse = rmse_func(y_test, y_pred)
print(f"Initial RMSE with random sampling: {rmse}, with {len(y_Learned)} samples")

# define the number of iterations
n_iterations = 500

# define the number of samples to be selected in each iteration
n_samples_per_iteration = 1

# snapshots of the index labels after the initial split
X_Learned_index = X_Learned.index
# these should be the same as the y_Learned index
y_Learned_index = y_Learned.index
X_Pool_index = X_Pool.index
y_Pool_index = y_Pool.index

rmse_sampling = np.zeros(n_iterations)

# Seeded generator: the drawn samples (and therefore the RMSE curve) are the
# same on every run, in line with the seeded splits above.
rng = np.random.default_rng(random_state)

# Active Learning Loop
for it in range(n_iterations):
    # select a random sample from the pool by selecting a random index from X_Pool
    random_sample_index = rng.choice(X_Pool.index.to_numpy())

    x_new = X_Pool.loc[[random_sample_index]]
    y_new = y_Pool.loc[[random_sample_index]]
    # add the sample to the learned set
    X_Learned = pd.concat([X_Learned, x_new])
    y_Learned = pd.concat([y_Learned, y_new])
    # remove the sample from the pool
    X_Pool = X_Pool.drop(index=random_sample_index)
    y_Pool = y_Pool.drop(index=random_sample_index)
    # retrain model on the new full data set and predict a new fit
    pls.fit(X_Learned, y_Learned)
    y_pred = pls.predict(X_test)
    rmse_sampling[it] = rmse_func(y_test, y_pred)
    # print(f"RMSE after iteration {it+1}: {rmse_sampling[it]}, with {len(y_Learned)} samples")

print(
    f"Final RMSE with random sampling: {rmse_sampling[-1]}, with {len(y_Learned)} samples"
)
# keep the curve for the comparison plots further down
rmse_sampling_random_start = rmse_sampling

# plot the rmse over the iterations
plt.plot(range(n_iterations), rmse_sampling)
plt.title(
    "Random Sampling with Random Samples as starting points \n RMSE over iterations"
)
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.show()

In [None]:
# Active Learning (PLS) with Cluster Centers as starting points

pls = PLSRegression(**params_pls)

# initial learned set: the samples selected per cluster
# (`initial_samples_sample_ids` holds row positions in X_train)
X_Learned = X_train.iloc[initial_samples_sample_ids]
y_Learned = y_train.iloc[initial_samples_sample_ids]

# pool: all remaining training samples
X_Pool = X_train.drop(X_Learned.index)
y_Pool = y_train.drop(y_Learned.index)

pls.fit(X_Learned, y_Learned)
# calc initial rmse
y_pred = pls.predict(X_test)
rmse_init = rmse_func(y_test, y_pred)
# the message names the start strategy used in this cell (it said "random
# sampling" before, which describes the previous cell)
print(f"Initial RMSE with cluster-center start: {rmse_init}, with {len(y_Learned)} samples")

# define the number of iterations
n_iterations = 500

# define the number of samples to be selected in each iteration
n_samples_per_iteration = 1

# snapshots of the index labels after the initial selection
X_Learned_index = X_Learned.index
y_Learned_index = y_Learned.index
X_Pool_index = X_Pool.index
y_Pool_index = y_Pool.index

rmse_sampling = np.zeros(n_iterations)

# Seeded generator: the drawn samples (and therefore the RMSE curve) are the
# same on every run.
rng = np.random.default_rng(random_state)

# Active Learning Loop
for it in range(n_iterations):
    # select a random sample from the pool by selecting a random index from X_Pool
    random_sample_index = rng.choice(X_Pool.index.to_numpy())
    x_new = X_Pool.loc[[random_sample_index]]
    y_new = y_Pool.loc[[random_sample_index]]
    # add the sample to the learned set
    X_Learned = pd.concat([X_Learned, x_new])
    y_Learned = pd.concat([y_Learned, y_new])
    # remove the sample from the pool
    X_Pool = X_Pool.drop(index=random_sample_index)
    y_Pool = y_Pool.drop(index=random_sample_index)
    # retrain model on the new full data set and predict a new fit
    pls.fit(X_Learned, y_Learned)
    y_pred = pls.predict(X_test)
    rmse_sampling[it] = rmse_func(y_test, y_pred)
    # print(f"RMSE after iteration {it+1}: {rmse_sampling[it]}, with {len(y_Learned)} samples")

print(
    f"Final RMSE with random sampling: {rmse_sampling[-1]}, with {len(y_Learned)} samples"
)

# keep the curve for the comparison plots further down
rmse_sampling_cluster_start = rmse_sampling

# plot the rmse over the iterations
plt.plot(range(n_iterations), rmse_sampling)
plt.title(
    "Random Sampling with ClusterCenters as starting points \n RMSE over iterations"
)
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.show()

In [None]:
# Both RMSE curves in one figure: cluster start vs. random start
for curve, curve_label in (
    (rmse_sampling_cluster_start, "Cluster Start"),
    (rmse_sampling_random_start, "Random Start"),
):
    plt.plot(range(n_iterations), curve, label=curve_label)

plt.title(
    "Comparison Random vs Cluster start (with random sampling) \n RMSE over iterations"
)
plt.xlabel("Iteration")
plt.ylabel("RMSE")
# legend entries come from the `label` of each curve
plt.legend()
plt.show()

# Active Learning with Random sampling - Defining a function

In [None]:
# PLS regression with random sampling as selection criterion

def pls_random(X_train, y_train, n_iterations, params_pls, n_samples_per_it = None, random_state = None):
    """
    Function to perform PLS regression with random sampling as selection criterion

    In every iteration one sample is drawn at random (without replacement)
    from the training data and added to the learned set. The model is
    refitted and scored at every iteration ``it`` that is a non-zero multiple
    of ``n_samples_per_it``; iteration 0 (a model built from a single sample)
    is never scored. Two figures are shown: the RMSE curve, and the
    difference to the RMSE of a model fitted on all training data.

    The function relies on names defined at notebook level:
    ``PLSRegression``, ``rmse_func``, ``plt``, ``np`` and the test data
    ``X_test`` / ``y_test``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Features of the samples that may be selected.
    y_train : pandas.Series
        Targets, aligned row by row with ``X_train``.
    n_iterations : int or None
        Number of samples to add; ``None`` means 50.
    params_pls : dict
        Keyword arguments passed to ``PLSRegression``.
    n_samples_per_it : int, optional
        Refit and score only every ``n_samples_per_it``-th iteration
        (default 1).
    random_state : int or None, optional
        Seed for the sample selection; ``None`` (default) draws a different
        sequence on every call.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``n_samples_per_it`` or ``n_iterations`` is smaller than 1, or if
        more iterations are requested than samples are available.
    """
    if n_samples_per_it is None:
        n_samples_per_it = 1
    if n_iterations is None:
        n_iterations = 50
    if n_samples_per_it < 1:
        raise ValueError(f"n_samples_per_it must be >= 1, got {n_samples_per_it}")
    if n_iterations < 1:
        raise ValueError(f"n_iterations must be >= 1, got {n_iterations}")
    if n_iterations > len(X_train):
        raise ValueError(
            f"n_iterations ({n_iterations}) exceeds the number of available samples ({len(X_train)})"
        )

    pls = PLSRegression(**params_pls)
    rng = np.random.default_rng(random_state)

    # Row positions in the order in which they are added to the learned set.
    # Drawing the whole order at once without replacement is equivalent to
    # removing one random sample from the pool per iteration, and avoids
    # growing/shrinking DataFrames inside the loop.
    selection_order = rng.choice(len(X_train), size=n_iterations, replace=False)

    # NaN marks iterations without an evaluation, so they can neither be
    # plotted nor reported as an RMSE of 0.
    rmse_sampling = np.full(n_iterations, np.nan)
    # iterations at which the model is refitted and scored
    evaluated = np.arange(n_samples_per_it, n_iterations, n_samples_per_it)

    # Active Learning Loop
    for it in evaluated:
        learned = selection_order[: it + 1]
        pls.fit(X_train.iloc[learned], y_train.iloc[learned])
        y_pred = pls.predict(X_test)
        rmse_sampling[it] = rmse_func(y_test, y_pred)

    # report the last evaluated model
    last_it = int(evaluated[-1]) if evaluated.size else n_iterations - 1
    print(
        f"Final RMSE with random sampling: {round(rmse_sampling[last_it], 3)}, with {last_it + 1} samples"
    )

    # calc the rmse for the model including all training data
    pls.fit(X_train, y_train)
    y_pred = pls.predict(X_test)
    rmse_full = rmse_func(y_test, y_pred)
    print(f"RMSE with all training samples: {round(rmse_full, 3)} (with training on {len(y_train)} samples)")

    # plot the rmse over the (evaluated) iterations
    plt.plot(evaluated, rmse_sampling[evaluated])
    # add a line for the model with all training samples
    plt.axhline(y=rmse_full, color="r", linestyle="--")
    plt.title(
        "Random Sampling with PLS Model\n Random Samples as starting points \n RMSE over iterations"
    )
    plt.xlabel("Iteration")
    plt.ylabel("RMSE")
    plt.show()

    # second figure: RMSE of the full model minus RMSE of the current model,
    # i.e. 0 once the sampled model is as good as the full model
    rmse_acc = rmse_full - rmse_sampling
    plt.plot(evaluated, rmse_acc[evaluated])
    plt.axhline(y=0, color="r", linestyle="--") # level of the full model
    plt.xlim(1, n_iterations)
    if evaluated.size:
        plt.ylim(np.min(rmse_acc[evaluated]), 0 + 10)
    plt.title(
        "Random Sampling with PLS Model\n Random Samples as starting points \n Accuracy (RMSE full model - RMSE) over iterations"
    )
    plt.xlabel("Iteration")
    plt.ylabel("Accuracy")
    plt.show()
    # return rmse_sampling, selection_order


In [None]:
# Run on the complete training set. `X_Pool` / `y_Pool` only hold what the
# previous active-learning cell left over, so the reported "RMSE with all
# training samples" would not refer to the training set.
pls_random(X_train, y_train, 500, params_pls = params_pls)

## GSx

The greedy sampling technique can be described as maximum-distance sampling: in each iteration, the pool sample that is most distant from the samples already selected (the learned set) is added. The suffix specifies the space in which the distances are measured: x refers to the input variables (features) and y to the targets.

In [None]:
# PLS regression with GSx as selection criterion

# Number of iterations of active learning
n_iterations = 500

def pls_gsx(X_train, y_train, params_pls, n_iterations = None, n_samples_per_it = None, init_sample_size = None, random_state = None, criterion = "sum"):
    """
    Function to perform PLS regression with GSx as selection criterion

    Starting from ``init_sample_size`` randomly chosen samples, every
    iteration adds the pool sample that is farthest (euclidean distance in
    feature space) from the learned set. The model is refitted and scored at
    every iteration ``it`` that is a non-zero multiple of
    ``n_samples_per_it``; iteration 0 is never scored. The RMSE curve is
    plotted together with the RMSE of a model fitted on all training data.

    The function relies on names defined at notebook level:
    ``PLSRegression``, ``euclidean_distances``, ``rmse_func``, ``plt``,
    ``np`` and the test data ``X_test`` / ``y_test``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Features of the samples that may be selected.
    y_train : pandas.Series
        Targets, aligned row by row with ``X_train``.
    params_pls : dict
        Keyword arguments passed to ``PLSRegression``.
    n_iterations : int, optional
        Number of samples to add (default 50).
    n_samples_per_it : int, optional
        Refit and score only every ``n_samples_per_it``-th iteration
        (default 1).
    init_sample_size : int, optional
        Number of random samples in the initial learned set (default 10).
    random_state : int or None, optional
        Seed for the initial random selection; ``None`` (default) draws a
        different initial set on every call.
    criterion : {"sum", "min"}, optional
        How the distances of a pool sample to all learned samples are
        aggregated before the maximum is taken. ``"sum"`` (default) keeps the
        previous behaviour; ``"min"`` selects by the distance to the nearest
        learned sample, which is the GSx rule as usually defined.

    Returns
    -------
    numpy.ndarray
        RMSE per iteration, length ``n_iterations``. Iterations without an
        evaluation hold NaN (they held 0 before).

    Raises
    ------
    ValueError
        For an unknown ``criterion``, sizes smaller than 1, or if more
        samples are requested than are available.
    """
    if n_samples_per_it is None:
        n_samples_per_it = 1
    if init_sample_size is None:
        init_sample_size = 10
    if n_iterations is None:
        n_iterations = 50
    if criterion not in ("sum", "min"):
        raise ValueError(f"criterion must be 'sum' or 'min', got {criterion!r}")
    if n_samples_per_it < 1 or init_sample_size < 1 or n_iterations < 1:
        raise ValueError("n_samples_per_it, init_sample_size and n_iterations must all be >= 1")
    n_total = len(X_train)
    if init_sample_size + n_iterations > n_total:
        raise ValueError(
            f"init_sample_size + n_iterations ({init_sample_size + n_iterations}) "
            f"exceeds the number of available samples ({n_total})"
        )

    pls = PLSRegression(**params_pls)
    rng = np.random.default_rng(random_state)

    # Samples are tracked by row position: `learned_pos` lists the learned
    # rows in selection order, `in_pool` flags the rows still available.
    learned_pos = rng.choice(n_total, size=init_sample_size, replace=False).tolist()
    in_pool = np.ones(n_total, dtype=bool)
    in_pool[learned_pos] = False

    # NaN marks iterations without an evaluation
    rmse_sampling = np.full(n_iterations, np.nan)
    samples_selected = []
    n_learned_at_last_eval = init_sample_size + n_iterations

    # Active Learning Loop
    for it in range(n_iterations):
        # Greedy Sampling by Euclidean Distance:
        # distances between every pool sample (rows) and every learned sample (columns)
        pool_pos = np.flatnonzero(in_pool)
        distances = euclidean_distances(X_train.iloc[pool_pos], X_train.iloc[learned_pos])
        if criterion == "sum":
            score = distances.sum(axis=1)
        else:
            score = distances.min(axis=1)
        # select the pool sample with the largest score (first one on ties)
        chosen = int(pool_pos[np.argmax(score)])
        samples_selected.append(chosen)

        # move the sample from the pool to the learned set
        learned_pos.append(chosen)
        in_pool[chosen] = False

        # retrain model on the learned set and predict a new fit, if the n_samples_per_it is reached
        if it % n_samples_per_it == 0 and it != 0:
            pls.fit(X_train.iloc[learned_pos], y_train.iloc[learned_pos])
            y_pred = pls.predict(X_test)
            rmse_sampling[it] = rmse_func(y_test, y_pred)
            n_learned_at_last_eval = len(learned_pos)

    # report the last evaluated model
    evaluated = np.arange(n_samples_per_it, n_iterations, n_samples_per_it)
    last_it = int(evaluated[-1]) if evaluated.size else n_iterations - 1
    print(
        f"Final RMSE with GSx sampling: {round(rmse_sampling[last_it], 3)}, with {n_learned_at_last_eval} samples"
    )

    # calc the rmse for the model including all training data
    pls.fit(X_train, y_train)
    y_pred = pls.predict(X_test)
    rmse_full = rmse_func(y_test, y_pred)
    print(f"RMSE with all training samples: {round(rmse_full, 3)} (with training on {len(y_train)} samples)")

    # plot the rmse over the (evaluated) iterations
    plt.plot(evaluated, rmse_sampling[evaluated])
    # add a line for the model with all training samples
    plt.axhline(y=rmse_full, color="r", linestyle="--")
    plt.title(
        "GSx Sampling with PLS Model\n Random Samples as starting points \n RMSE over iterations"
    )
    plt.xlabel("Iteration")
    plt.ylabel("RMSE")
    plt.show()
    return rmse_sampling
        
# Run GSx on the complete training set (`X_Pool` / `y_Pool` only hold what an
# earlier active-learning cell left over) and with the `n_iterations` defined
# at the top of this cell, so the returned curve has the length expected by
# the comparison plot.
gsx_rmse_sampling = pls_gsx(X_train, y_train, params_pls = params_pls, n_iterations = n_iterations)

In [None]:
# Compare random sampling with GSx sampling

# plot the rmse over the iterations; each curve carries its own label so the
# legend cannot get out of sync with the plotting order
plt.plot(range(n_iterations), rmse_sampling_cluster_start, label="Random(Cluster Start)")
plt.plot(range(n_iterations), rmse_sampling_random_start, label="Random(Random Start)")
plt.plot(range(n_iterations), gsx_rmse_sampling, label="GSx")
# the title names all three curves (it was copied from the two-curve plot)
plt.title(
    "Comparison Random sampling (Random / Cluster start) vs GSx \n RMSE over iterations"
)
plt.legend()
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.show()