# Introduction

This tutorial walks through *Active Learning for Regression Using Greedy Sampling* by Dongrui Wu, Chin-Teng Lin, and Jian Huang.


# Set Up

## Import Packages

In [None]:
### Import Packages ###

import os
import math as math
import pandas as pd
import random as random

### Append Path ###
import sys
sys.path.append('..')

### Local Packages ###
from utils.Prediction import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Main import *

## Load Data

In [None]:
### Data ###
from sklearn.datasets import fetch_openml

# Fetch the "boston" dataset (version 1) from OpenML, asking for pandas output.
boston = fetch_openml(name="boston", version=1, as_frame=True)

# Build the feature frame from the thirteen named columns, then discard CHAS and RAD.
df = pd.DataFrame(
    data=boston.data,
    columns=['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'],
).drop(columns=["CHAS", "RAD"])

# Store the target as the response column 'Y'.
df['Y'] = boston.target

## Input Parameters

In [None]:
### Get Directory ###
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, ".."))

### Input ###
SimulationConfigInput = {
    "Seed": 1,                                       # Seed used for both random and numpy below
    "TestProportion": 0.2,                           # Test proportion
    "CandidateProportion": 0.8,                      # Candidate proportion
    "SelectorType": "GSxFunction",                   # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning] (all regression)
    "ModelType": "RandomForestRegressorFunction",    # Options: [LinearRegressionFunction, RandomForestRegressorFunction] (all regression)
    "UniqueErrorsInput": 0,                          # Ignore this for now (used for Rashomon)
    "n_estimators": 100,                             # Number of trees when a random forest model is used
    "regularization": 0.01,                          # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",                # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                       # Ignore this for now (used for Rashomon)
    "Type": "Regression",                            # Options: [Classification, Regression]
}

### Seed ###
StartTime = time.time()
random.seed(SimulationConfigInput["Seed"])
np.random.seed(SimulationConfigInput["Seed"])

### Store Results ###
ErrorVec, SelectedObservationHistory = [], []
TreeCount = {Key: [] for Key in ("AllTreeCount", "UniqueTreeCount")}

### Train Test Candidate Split ###
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(
    df,
    SimulationConfigInput["TestProportion"],
    SimulationConfigInput["CandidateProportion"],
)

### Update SimulationConfig Arguments ###
# The three splits travel inside the config so the model and selector functions can read them.
SimulationConfigInput.update({
    "df_Train": df_Train,
    "df_Test": df_Test,
    "df_Candidate": df_Candidate,
})


# For Loop

The following loop spells out each stage of the active learning process. Read through each chunk and each section, and then look at the corresponding function in the `utils` directory.

In [None]:
### Learning Procedure ###
# One pass per row of the initial candidate set: range(len(df_Candidate)) is evaluated once,
# while the rows themselves are moved via SimulationConfigInput["df_Candidate"] below.
for i in range(len(df_Candidate)):

    ### Prediction Model ###
    print("Iteration: " + str(i))
    # Look the model function up by name in the notebook globals (None if no such name exists).
    ModelType = globals().get(SimulationConfigInput["ModelType"], None)
    # Presumably keeps only the config entries that ModelType accepts -- confirm in utils.
    ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInput)
    # Call the model function with the filtered arguments (presumably fits on df_Train -- confirm).
    Model = ModelType(**ModelArgsFiltered)
    # Store the model in the config so it is available when the selector arguments are filtered.
    SimulationConfigInput['Model'] = Model

    ### Test Error ###
    TestErrorOutput = TestErrorFunction(InputModel = Model, df_Test = SimulationConfigInput["df_Test"], Type = SimulationConfigInput["Type"])
    # Models whose type name contains 'TREEFARMS' (Rashomon) report their error under a different key.
    if('TREEFARMS' in str(type(Model))):
        CurrentError = TestErrorOutput["Error_Duplicate"]
    else: 
        CurrentError = TestErrorOutput["ErrorVal"]                                               # Single error value for non-Rashomon models
    # Recorded before this iteration's query is added, so ErrorVec[i] reflects i queried observations.
    ErrorVec.append(CurrentError)

    ### Sampling Procedure ###
    # Same name-based lookup and argument filtering as for the model, now for the selector.
    SelectorType = globals().get(SimulationConfigInput["SelectorType"], None)
    SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInput)
    SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)
    # Index label of the candidate row the selector recommends querying.
    QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]
    # Double brackets keep the result a one-row DataFrame so it can be concatenated below.
    QueryObservation = SimulationConfigInput["df_Candidate"].loc[[QueryObservationIndex]]
    SelectedObservationHistory.append(QueryObservationIndex)                                       # Appends to the selection history

    ### Update Train and Candidate Sets ###
    # Move the queried row: append it to the training set and drop it from the candidates by label.
    SimulationConfigInput["df_Train"] = pd.concat([SimulationConfigInput["df_Train"], QueryObservation])
    SimulationConfigInput["df_Candidate"] = SimulationConfigInput["df_Candidate"].drop(QueryObservationIndex) 

    ### Store Number of (Unique) Trees ###
    # Tree counts are only read from the selector output for Rashomon (TREEFARMS) models.
    if('TREEFARMS' in str(type(Model))):
        TreeCount["AllTreeCount"].append(SelectorFuncOutput["AllTreeCount"])          # Store number of trees
        TreeCount["UniqueTreeCount"].append(SelectorFuncOutput["UniqueTreeCount"])    # Store number of unique/duplicate trees


In [None]:
### Test Error Curve ###
# ErrorVec holds one test error per loop iteration, recorded before that
# iteration's query is added, so position k corresponds to k queried observations.
plt.figure(figsize=[10,5])
plt.plot(ErrorVec)
plt.xlabel("Iteration (number of queried observations)")
plt.ylabel("Test error")
plt.title("Test error during the manual active learning loop")
plt.show()

# Local Simulation

The function OneIterationFunction does all of the above! The for loop was written out only so that you can see what is happening internally and connect it to the active learning procedure you've been reading about in your papers.

The only thing you have to change is the parameters in SimulationConfigInput.

## Linear Regression

The following will run the active learning procedure with linear regression as the model. It will run the procedure with four different types of selectors: passive/random querying, GSx, GSy, and iGS.

In [None]:
# Collects one OneIterationFunction result per selector for the linear regression runs.
LinearRegressionActiveLearningResults = dict()

### Random/Passive Learning

In [None]:
### Input ###
# Passive (random) querying baseline for the linear regression comparison.
SimulationConfigInput = {
    "DataFileInput" : "BostonHousing",                  # Data File Input
    'Seed': 1,                                          # Seed
    'TestProportion': 0.2,                              # Test proportion
    'CandidateProportion': 0.8,                         # Candidate proportion
    'SelectorType': 'PassiveLearning',                  # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning] (all of these are for regression)
    # This baseline must use the same model as the GSx/GSy/iGS runs it is plotted against;
    # it previously used RandomForestRegressorFunction, which made the comparison unfair.
    'ModelType': 'LinearRegressionFunction',            # Options: [LinearRegressionFunction, RandomForestRegressorFunction] (all of these are for regression)
    'n_estimators': 100,                                # If using RandomForestRegressorFunction/RandomForestClassificationFunction, this is the number of trees
    'Type': 'Regression',                               # Options: [Classification, Regression]
    'UniqueErrorsInput': 0,                             # Ignore this for now (used for Rashomon)
    'regularization': 0.01,                             # Ignore this for now (used for Rashomon)
    'RashomonThresholdType': "Adder",                   # Ignore this for now (used for Rashomon)
    'RashomonThreshold': 0.05}                          # Ignore this for now (used for Rashomon)

### Run Function
# Keep the result under its own name and in the shared results dictionary used by the plot.
LinearRegressionResults_Passive = OneIterationFunction(SimulationConfigInput)
LinearRegressionActiveLearningResults["Passive"] = LinearRegressionResults_Passive


### Selector: GSx

In [None]:
### Input ###
# Single-seed run: GSx selector with the linear regression model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "GSxFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "LinearRegressionFunction",       # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
LinearRegressionActiveLearningResults["GSx"] = LinearRegressionResults_GSx = OneIterationFunction(SimulationConfigInput)

### Selector: GSy

In [None]:
### Input ###
# Single-seed run: GSy selector with the linear regression model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "GSyFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "LinearRegressionFunction",       # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
LinearRegressionActiveLearningResults["GSy"] = LinearRegressionResults_GSy = OneIterationFunction(SimulationConfigInput)


### Selector: iGS

In [None]:
### Input ###
# Single-seed run: iGS selector with the linear regression model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "iGSFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "LinearRegressionFunction",       # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
LinearRegressionActiveLearningResults["iGS"] = LinearRegressionResults_iGS = OneIterationFunction(SimulationConfigInput)

### Linear Regression Active Learning Results

In [None]:
### Set Up ###
# Error trajectory of each selector, keyed by the label used in the plot.
SimulationErrorResults = {
    Label: LinearRegressionActiveLearningResults[Label]["ErrorVec"]
    for Label in ("Passive", "GSx", "GSy", "iGS")
}

### Aesthetics ###
PlotSubtitle = "BostonHousing with Linear Regression"

# One colour per selector.
Colors = {"Passive": "black", "GSx": "green", "GSy": "orange", "iGS": "blue"}

# Every selector is drawn with a solid line.
Linestyles = dict.fromkeys(SimulationErrorResults, "solid")

# Legend entries use the selector labels unchanged.
LegendMapping = {Label: Label for Label in SimulationErrorResults}


In [None]:
### Mean Plot ###
plt.figure(figsize=[10,5])
for Label, Values in SimulationErrorResults.items():
    # Position k of n is placed at 20 + 80*k/n on the x-axis.
    x = 20 + (np.arange(len(Values)) / len(Values)) * 80

    # Fall back to matplotlib's default colour, a dotted line, and the raw label
    # whenever a lookup table is empty or lacks this label.
    color = Colors.get(Label) if Colors else None
    if Linestyles:
        linestyle = Linestyles.get(Label, ':')
    else:
        linestyle = ':'
    legend_label = LegendMapping.get(Label, Label) if LegendMapping else Label

    plt.plot(x, Values, label=legend_label, color=color, linestyle=linestyle)

# Titles, axis labels and legend for the current figure.
plt.suptitle("Active Learning Mean Error Plot")
plt.title(PlotSubtitle, fontsize=9)
plt.xlabel("Percent of labelled observations")
plt.ylabel("RMSE")
plt.legend(loc='upper right')

# Keep a handle on the finished figure.
MeanPlot = plt.gcf()

---

## Random Forests

The following will run the active learning procedure with random forests as the model. It will run the procedure with four different types of selectors: passive/random querying, GSx, GSy, and iGS.

In [None]:
# Collects one OneIterationFunction result per selector for the random forest runs.
RandomForestsActiveLearningResults = dict()

### Random/Passive Learning

In [None]:
### Input ###
# Single-seed run: passive (random) querying with the random forest model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "PassiveLearning",             # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "RandomForestRegressorFunction",  # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
RandomForestsActiveLearningResults["Passive"] = RandomForestsResults_Passive = OneIterationFunction(SimulationConfigInput)


### Selector: GSx

In [None]:
### Input ###
# Single-seed run: GSx selector with the random forest model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "GSxFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "RandomForestRegressorFunction",  # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
RandomForestsActiveLearningResults["GSx"] = RandomForestsResults_GSx = OneIterationFunction(SimulationConfigInput)

### Selector: GSy

In [None]:
### Input ###
# Single-seed run: GSy selector with the random forest model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "GSyFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "RandomForestRegressorFunction",  # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
RandomForestsActiveLearningResults["GSy"] = RandomForestsResults_GSy = OneIterationFunction(SimulationConfigInput)


### Selector: iGS

In [None]:
### Input ###
# Single-seed run: iGS selector with the random forest model.
SimulationConfigInput = {
    "DataFileInput": "BostonHousing",              # Data File Input
    "Seed": 1,                                     # Seed
    "TestProportion": 0.2,                         # Test proportion
    "CandidateProportion": 0.8,                    # Candidate proportion
    "SelectorType": "iGSFunction",                 # Options: [GSxFunction, GSyFunction, iGSFunction, PassiveLearning]
    "ModelType": "RandomForestRegressorFunction",  # Options: [LinearRegressionFunction, RandomForestRegressorFunction]
    "n_estimators": 100,                           # Number of trees when a random forest model is used
    "Type": "Regression",                          # Options: [Classification, Regression]
    "UniqueErrorsInput": 0,                        # Ignore this for now (used for Rashomon)
    "regularization": 0.01,                        # Ignore this for now (used for Rashomon)
    "RashomonThresholdType": "Adder",              # Ignore this for now (used for Rashomon)
    "RashomonThreshold": 0.05,                     # Ignore this for now (used for Rashomon)
}

### Run Function
# One call; the result is stored in the shared dictionary and under its own name.
RandomForestsActiveLearningResults["iGS"] = RandomForestsResults_iGS = OneIterationFunction(SimulationConfigInput)

### Random Forests Active Learning Results

In [None]:
### Set Up ###
# Error trajectory of each selector, keyed by the label used in the plot.
SimulationErrorResults = {
    Label: RandomForestsActiveLearningResults[Label]["ErrorVec"]
    for Label in ("Passive", "GSx", "GSy", "iGS")
}

### Aesthetics ###
PlotSubtitle = "BostonHousing with RandomForests"

# One colour per selector.
Colors = {"Passive": "black", "GSx": "green", "GSy": "orange", "iGS": "blue"}

# Every selector is drawn with a solid line.
Linestyles = dict.fromkeys(SimulationErrorResults, "solid")

# Legend entries use the selector labels unchanged.
LegendMapping = {Label: Label for Label in SimulationErrorResults}


In [None]:
### Mean Plot ###
plt.figure(figsize=[10,5])
for Label, Values in SimulationErrorResults.items():
    # Position k of n is placed at 20 + 80*k/n on the x-axis.
    x = 20 + (np.arange(len(Values)) / len(Values)) * 80

    # Fall back to matplotlib's default colour, a dotted line, and the raw label
    # whenever a lookup table is empty or lacks this label.
    color = Colors.get(Label) if Colors else None
    if Linestyles:
        linestyle = Linestyles.get(Label, ':')
    else:
        linestyle = ':'
    legend_label = LegendMapping.get(Label, Label) if LegendMapping else Label

    plt.plot(x, Values, label=legend_label, color=color, linestyle=linestyle)

# Titles, axis labels and legend for the current figure.
plt.suptitle("Active Learning Mean Error Plot")
plt.title(PlotSubtitle, fontsize=9)
plt.xlabel("Percent of labelled observations")
plt.ylabel("RMSE")
plt.legend(loc='upper right')

# Keep a handle on the finished figure.
MeanPlot = plt.gcf()

# Tasks

## 1. Multiple Simulations

Ordinarily, in order to compare methods we want a wide range of simulations. In the above work, we only ran 1 simulation. Instead, we want to run 100 simulations, each with a different seed, for each method, average the results within each method, then compare them. I will walk you through this. 

1. Firstly, initialize a list for each of the four selector methods (GSxFunction, GSyFunction, iGSFunction, PassiveLearning) to store your results.
2. Within a for loop between 0 and 99, 
    - Set the seed to the current number of the loop
    - Construct 4 SimulationConfigInput
    - Run the four iteration functions for each method.
        - OneIterationFunction(SimulationConfigInput_Passive)
        - OneIterationFunction(SimulationConfigInput_GSx)
        - OneIterationFunction(SimulationConfigInput_GSy)
        - OneIterationFunction(SimulationConfigInput_iGS)
    - Append each of the four methods to their respective results.
3. Average the error results within the four methods.
4. Create an active learning plot showing the time by number of labelled observations.
5. Report the average run time of each selector method.

Note this will take **a very long time!** I expect each loop to take 2 minutes. You are essentially running 4 active learning processes 100 times (with each active learning process taking about 30 seconds). I would recommend letting this run overnight. Next week, I will show you how you can run this on the university High-Performance Computing Cluster.


In [None]:


# Initialize lists to store simulation results for each selector method.
results_passive = []
results_GSx = []
results_GSy = []
results_iGS = []

# Each selector name is paired with the list that collects its per-seed results,
# so all four methods go through exactly the same configuration and run code.
selector_results = [
    ("PassiveLearning", results_passive),
    ("GSxFunction", results_GSx),
    ("GSyFunction", results_GSy),
    ("iGSFunction", results_iGS),
]

# Run 100 simulations.
n_simulations = 100

for seed in range(n_simulations):
    # Base configuration shared across methods; only the seed changes between simulations.
    base_config = {
        "DataFileInput": "BostonHousing",
        "Seed": seed,
        "TestProportion": 0.2,
        "CandidateProportion": 0.8,
        "ModelType": "LinearRegressionFunction",  # or "RandomForestRegressorFunction" if desired
        "n_estimators": 100,
        "Type": "Regression",
        "UniqueErrorsInput": 0,
        "regularization": 0.01,
        "RashomonThresholdType": "Adder",
        "RashomonThreshold": 0.05,
    }

    for selector_name, selector_runs in selector_results:
        # Fresh dictionary per run, so nothing OneIterationFunction might write into
        # its config can carry over to another selector.
        config = {**base_config, "SelectorType": selector_name}
        selector_runs.append(OneIterationFunction(config))

# Helper function to average the error curves.
def average_error(results):
    """Average the error curves of several simulation runs position by position.

    Parameters
    ----------
    results : list of dict
        One entry per run. Each entry's "ErrorVec" is indexed with "Error" and
        read through ``.values`` (assumed to be a DataFrame with an "Error" column).

    Returns
    -------
    numpy.ndarray
        Mean over runs at each position of the error curve.
    """
    # Rows are runs, columns are positions along the curve.
    stacked_curves = np.array([run["ErrorVec"]["Error"].values for run in results])
    return stacked_curves.mean(axis=0)

# Average error curve per selector, computed in the order passive, GSx, GSy, iGS.
avg_error_passive, avg_error_GSx, avg_error_GSy, avg_error_iGS = map(
    average_error,
    (results_passive, results_GSx, results_GSy, results_iGS),
)

# Helper function to average the runtime.
def average_runtime(results):
    """Return the mean of the "ElapsedTime" entries across simulation runs.

    Parameters
    ----------
    results : list of dict
        One entry per run; each must provide an "ElapsedTime" value.

    Returns
    -------
    numpy.floating
        Arithmetic mean of the elapsed times.
    """
    elapsed = np.array([run["ElapsedTime"] for run in results])
    return elapsed.mean()

# Average runtime per selector, computed in the order passive, GSx, GSy, iGS.
avg_time_passive, avg_time_GSx, avg_time_GSy, avg_time_iGS = map(
    average_runtime,
    (results_passive, results_GSx, results_GSy, results_iGS),
)

print("Average runtime (seconds):")
for runtime_label, runtime_value in (
    ("Passive:", avg_time_passive),
    ("GSx:", avg_time_GSx),
    ("GSy:", avg_time_GSy),
    ("iGS:", avg_time_iGS),
):
    print(runtime_label, runtime_value)

# Plot the average error curves.
# The x-axis is spread evenly from 20 to 100 over the recorded positions and is
# labelled as the percentage of labelled observations.
num_iterations = len(avg_error_passive)
x_values = np.linspace(20, 100, num_iterations)

plt.figure(figsize=(10, 5))
for curve, curve_label, curve_color in (
    (avg_error_passive, "Passive", "black"),
    (avg_error_GSx, "GSx", "green"),
    (avg_error_GSy, "GSy", "orange"),
    (avg_error_iGS, "iGS", "blue"),
):
    plt.plot(x_values, curve, label=curve_label, color=curve_color)
plt.title("Active Learning Mean Error Plot")
plt.xlabel("Percent of Labelled Observations")
plt.ylabel("RMSE")
plt.legend(loc='upper right')
plt.show()


## 2. How to change things

To add your own selector technique or model technique, go to the respective directory (~/utils.Prediction for prediction models or ~/utils.Selector for the selection method) and add your own function! Then in SimulationConfigInput, change the SelectorType or ModelType input to the names of your functions. Try adding a new function (for instance, maybe a support vector machine) to the methods section and try running it!

Additionally, you (Troy) had good ideas for a new selector method. Try to see if you can implement it. Make sure the output is the same as the models/selector methods listed there.

Run the code below to see if it's working, and compare the results to what we've had.

One Iteration of everything

In [None]:
# Base configuration shared across methods.
base_config = {
    "DataFileInput": "BostonHousing",
    "Seed": 1,
    "TestProportion": 0.2,
    "CandidateProportion": 0.8,
    "ModelType": "LinearRegressionFunction",  # or "RandomForestRegressorFunction" if desired
    "n_estimators": 100,
    "Type": "Regression",
    "UniqueErrorsInput": 0,
    "regularization": 0.01,
    "RashomonThresholdType": "Adder",
    "RashomonThreshold": 0.05,
}

# One independent configuration per selector: the base settings plus its SelectorType.
config_passive   = {**base_config, "SelectorType": "PassiveLearning"}
config_GSx       = {**base_config, "SelectorType": "GSxFunction"}                     # original GSx (min distance)
config_GSxAvg    = {**base_config, "SelectorType": "GSxFunctionAverage"}              # average GSx
config_GSy       = {**base_config, "SelectorType": "GSyFunction"}                     # original GSy (min distance)
config_GSyAvg    = {**base_config, "SelectorType": "GSyFunctionAverage"}              # average GSy
config_iGS       = {**base_config, "SelectorType": "iGSFunction"}                     # original iGS (min distances multiplied)
config_iGSAvg    = {**base_config, "SelectorType": "iGSFunctionAverage"}              # average iGS (multiplication)
config_iGSStd    = {**base_config, "SelectorType": "iGSFunctionStandardized"}         # standardized iGS (min distances)
config_iGSAvgStd = {**base_config, "SelectorType": "iGSFunctionAverageStandardized"}  # standardized iGS (average distances)

# =============================================================================
# Run the active learning process once per configuration.
# (OneIterationFunction is not defined in this cell; it must already be in scope.)

result_passive   = OneIterationFunction(config_passive)
result_GSx       = OneIterationFunction(config_GSx)
result_GSxAvg    = OneIterationFunction(config_GSxAvg)
result_GSy       = OneIterationFunction(config_GSy)
result_GSyAvg    = OneIterationFunction(config_GSyAvg)
result_iGS       = OneIterationFunction(config_iGS)
result_iGSAvg    = OneIterationFunction(config_iGSAvg)
result_iGSStd    = OneIterationFunction(config_iGSStd)
result_iGSAvgStd = OneIterationFunction(config_iGSAvgStd)

# Wrap each single result in a one-element list, the shape the averaging helpers below expect.
results_passive   = [result_passive]
results_GSx       = [result_GSx]
results_GSxAvg    = [result_GSxAvg]
results_GSy       = [result_GSy]
results_GSyAvg    = [result_GSyAvg]
results_iGS       = [result_iGS]
results_iGSAvg    = [result_iGSAvg]
results_iGSStd    = [result_iGSStd]
results_iGSAvgStd = [result_iGSAvgStd]

# =============================================================================
# Helper function to compute the average error.
def average_error(results):
    """Compute the position-wise mean of the runs' error curves.

    Parameters
    ----------
    results : list of dict
        One entry per run. Each entry's "ErrorVec" is indexed with "Error" and
        read through ``.values`` (assumed to be a DataFrame with an "Error" column).

    Returns
    -------
    numpy.ndarray
        Mean over runs at each position of the error curve.
    """
    per_run_curves = []
    for run in results:
        per_run_curves.append(run["ErrorVec"]["Error"].values)
    # Stack runs as rows and collapse them with the mean.
    return np.array(per_run_curves).mean(axis=0)

# Average error curve per selector; the targets and the result lists are listed in the same order.
(
    avg_error_passive,
    avg_error_GSx,
    avg_error_GSxAvg,
    avg_error_GSy,
    avg_error_GSyAvg,
    avg_error_iGS,
    avg_error_iGSAvg,
    avg_error_iGSStd,
    avg_error_iGSAvgStd,
) = map(
    average_error,
    (
        results_passive,
        results_GSx,
        results_GSxAvg,
        results_GSy,
        results_GSyAvg,
        results_iGS,
        results_iGSAvg,
        results_iGSStd,
        results_iGSAvgStd,
    ),
)

# =============================================================================
# Helper function to average the runtime.
def average_runtime(results):
    """Return the mean of the "ElapsedTime" entries over a list of runs."""
    elapsed = []
    for run in results:
        elapsed.append(run["ElapsedTime"])
    return np.mean(elapsed)

# Mean elapsed time per selector: apply average_runtime to each list of runs.
# The order of the targets must match the order of the result lists below.
(
    avg_time_passive,
    avg_time_GSx,
    avg_time_GSxAvg,
    avg_time_GSy,
    avg_time_GSyAvg,
    avg_time_iGS,
    avg_time_iGSAvg,
    avg_time_iGSStd,
    avg_time_iGSAvgStd,
) = map(
    average_runtime,
    (
        results_passive,
        results_GSx,
        results_GSxAvg,
        results_GSy,
        results_GSyAvg,
        results_iGS,
        results_iGSAvg,
        results_iGSStd,
        results_iGSAvgStd,
    ),
)

# Report the timings; the labels are padded so the values line up.
print("Average runtime (seconds):")
for _label, _seconds in (
    ("Passive:                ", avg_time_passive),
    ("GSx (original):         ", avg_time_GSx),
    ("GSx (average):          ", avg_time_GSxAvg),
    ("GSy (original):         ", avg_time_GSy),
    ("GSy (average):          ", avg_time_GSyAvg),
    ("iGS (original):         ", avg_time_iGS),
    ("iGS (average):          ", avg_time_iGSAvg),
    ("iGS (standardized min): ", avg_time_iGSStd),
    ("iGS (average standardized):", avg_time_iGSAvgStd),
):
    print(_label, _seconds)

# =============================================================================
# Overview figure: every selector's mean error curve on one set of axes.
# The x-axis is the percentage of labelled observations, evenly spaced from
# 20 to 100 with one point per entry of the averaged error curve.
# NOTE(review): the 20 is hard-coded; presumably it is the initial labelled
# share implied by CandidateProportion = 0.8 -- confirm.
num_iterations = len(avg_error_passive)
x_values = np.linspace(20, 100, num_iterations)

plt.figure(figsize=(10, 5))
for _errors, _label, _color in (
    (avg_error_passive,   "Passive",                    "black"),
    (avg_error_GSx,       "GSx (original)",             "green"),
    (avg_error_GSxAvg,    "GSx (average)",              "lime"),
    (avg_error_GSy,       "GSy (original)",             "orange"),
    (avg_error_GSyAvg,    "GSy (average)",              "gold"),
    (avg_error_iGS,       "iGS (original)",             "blue"),
    (avg_error_iGSAvg,    "iGS (average)",              "cyan"),
    (avg_error_iGSStd,    "iGS (standardized min)",     "red"),
    (avg_error_iGSAvgStd, "iGS (average standardized)", "purple"),
):
    plt.plot(x_values, _errors, label=_label, color=_color)
plt.xlabel("Percent of Labelled Observations")
plt.ylabel("RMSE")
plt.title("Active Learning Mean Error Plot")
plt.legend(loc='upper right')
plt.show()


In [None]:


# Pairwise comparison figures, drawn in this order:
#   GSx (original vs average), GSy (original vs average),
#   iGS (original vs average), iGS (original vs the two standardized variants).
# Each entry is (figure title, ((error curve, legend label, colour), ...)).
# Relies on x_values computed in the previous cell.
for _title, _curves in (
    ("GSx vs GSx (average)", (
        (avg_error_GSx,    "GSx (original)", "green"),
        (avg_error_GSxAvg, "GSx (average)",  "lime"),
    )),
    ("GSy vs GSy (average)", (
        (avg_error_GSy,    "GSy (original)", "orange"),
        (avg_error_GSyAvg, "GSy (average)",  "gold"),
    )),
    ("iGS vs iGS (average)", (
        (avg_error_iGS,    "iGS (original)", "blue"),
        (avg_error_iGSAvg, "iGS (average)",  "cyan"),
    )),
    ("iGS vs Standardized Variants", (
        (avg_error_iGS,       "iGS (original)",             "blue"),
        (avg_error_iGSStd,    "iGS (standardized min)",     "red"),
        (avg_error_iGSAvgStd, "iGS (average standardized)", "purple"),
    )),
):
    plt.figure(figsize=(8,6))
    for _errors, _label, _color in _curves:
        plt.plot(x_values, _errors, label=_label, color=_color)
    plt.xlabel("Percent of Labelled Observations")
    plt.ylabel("RMSE")
    plt.title(_title)
    plt.legend()
    plt.show()


In [None]:
# Per-selector accumulators; each receives one result per simulated seed.
results_passive   = []
results_GSx       = []
results_GSxAvg    = []
results_GSy       = []
results_GSyAvg    = []
results_iGS       = []
results_iGSAvg    = []
results_iGSStd    = []
results_iGSAvgStd = []

# Number of simulations; the loop index is also used as the "Seed" setting.
n_simulations = 100

for seed in range(n_simulations):
    # Settings shared by every selector in this run.
    base_config = dict(
        DataFileInput="BostonHousing",
        Seed=seed,
        TestProportion=0.2,
        CandidateProportion=0.8,
        ModelType="LinearRegressionFunction",  # or "RandomForestRegressorFunction" if desired
        n_estimators=100,
        Type="Regression",
        UniqueErrorsInput=0,
        regularization=0.01,
        RashomonThresholdType="Adder",
        RashomonThreshold=0.05,
    )

    # One configuration per selector: the shared settings plus "SelectorType".
    config_passive   = {**base_config, "SelectorType": "PassiveLearning"}
    config_GSx       = {**base_config, "SelectorType": "GSxFunction"}                     # original GSx (min distance)
    config_GSxAvg    = {**base_config, "SelectorType": "GSxFunctionAverage"}              # average GSx
    config_GSy       = {**base_config, "SelectorType": "GSyFunction"}                     # original GSy (min distance)
    config_GSyAvg    = {**base_config, "SelectorType": "GSyFunctionAverage"}              # average GSy
    config_iGS       = {**base_config, "SelectorType": "iGSFunction"}                     # original iGS (min distances multiplied)
    config_iGSAvg    = {**base_config, "SelectorType": "iGSFunctionAverage"}              # average iGS (multiplication)
    config_iGSStd    = {**base_config, "SelectorType": "iGSFunctionStandardized"}         # standardized iGS (min distances)
    config_iGSAvgStd = {**base_config, "SelectorType": "iGSFunctionAverageStandardized"}  # standardized iGS (average distances)

    # Run the active learning process once per configuration.
    # OneIterationFunction is not defined in this notebook; it has to come from
    # the project's own modules.
    result_passive   = OneIterationFunction(config_passive)
    result_GSx       = OneIterationFunction(config_GSx)
    result_GSxAvg    = OneIterationFunction(config_GSxAvg)
    result_GSy       = OneIterationFunction(config_GSy)
    result_GSyAvg    = OneIterationFunction(config_GSyAvg)
    result_iGS       = OneIterationFunction(config_iGS)
    result_iGSAvg    = OneIterationFunction(config_iGSAvg)
    result_iGSStd    = OneIterationFunction(config_iGSStd)
    result_iGSAvgStd = OneIterationFunction(config_iGSAvgStd)

    # Record this seed's outcome only after all nine selectors have finished.
    results_passive.append(result_passive)
    results_GSx.append(result_GSx)
    results_GSxAvg.append(result_GSxAvg)
    results_GSy.append(result_GSy)
    results_GSyAvg.append(result_GSyAvg)
    results_iGS.append(result_iGS)
    results_iGSAvg.append(result_iGSAvg)
    results_iGSStd.append(result_iGSStd)
    results_iGSAvgStd.append(result_iGSAvgStd)

In [None]:

# =============================================================================
# Helper function to compute the average error.
def average_error(results):
    """Average the per-run error curves element-wise.

    Every run is read as ``result["ErrorVec"]["Error"].values``; "ErrorVec" is
    assumed to be a DataFrame with a column "Error". The curves are stacked
    one run per row and averaged along axis 0.
    Assumes all runs yield curves of equal length -- TODO confirm.
    """
    per_run_curves = np.array([run["ErrorVec"]["Error"].values for run in results])
    return np.mean(per_run_curves, axis=0)

# Mean error curve per selector, averaged over all simulated runs.
# The order of the targets must match the order of the result lists below.
(
    avg_error_passive,
    avg_error_GSx,
    avg_error_GSxAvg,
    avg_error_GSy,
    avg_error_GSyAvg,
    avg_error_iGS,
    avg_error_iGSAvg,
    avg_error_iGSStd,
    avg_error_iGSAvgStd,
) = [
    average_error(runs)
    for runs in (
        results_passive,
        results_GSx,
        results_GSxAvg,
        results_GSy,
        results_GSyAvg,
        results_iGS,
        results_iGSAvg,
        results_iGSStd,
        results_iGSAvgStd,
    )
]

# =============================================================================
# Helper function to average the runtime.
def average_runtime(results):
    """Mean of ``result["ElapsedTime"]`` across the given runs."""
    return np.mean([run["ElapsedTime"] for run in results])

# Mean elapsed time per selector, averaged over all simulated runs.
# The order of the targets must match the order of the result lists below.
(
    avg_time_passive,
    avg_time_GSx,
    avg_time_GSxAvg,
    avg_time_GSy,
    avg_time_GSyAvg,
    avg_time_iGS,
    avg_time_iGSAvg,
    avg_time_iGSStd,
    avg_time_iGSAvgStd,
) = [
    average_runtime(runs)
    for runs in (
        results_passive,
        results_GSx,
        results_GSxAvg,
        results_GSy,
        results_GSyAvg,
        results_iGS,
        results_iGSAvg,
        results_iGSStd,
        results_iGSAvgStd,
    )
]

# Print one line per selector; the labels are padded so the values line up.
print("Average runtime (seconds):")
for _label, _seconds in (
    ("Passive:                ", avg_time_passive),
    ("GSx (original):         ", avg_time_GSx),
    ("GSx (average):          ", avg_time_GSxAvg),
    ("GSy (original):         ", avg_time_GSy),
    ("GSy (average):          ", avg_time_GSyAvg),
    ("iGS (original):         ", avg_time_iGS),
    ("iGS (average):          ", avg_time_iGSAvg),
    ("iGS (standardized min): ", avg_time_iGSStd),
    ("iGS (average standardized):", avg_time_iGSAvgStd),
):
    print(_label, _seconds)

# =============================================================================
# Overview figure: all nine mean error curves on shared axes.
# x_values spreads the points evenly from 20 to 100 (percent of labelled
# observations), one point per entry of the averaged error curve.
# NOTE(review): the 20 is hard-coded; presumably it is the initial labelled
# share implied by CandidateProportion = 0.8 -- confirm.
num_iterations = len(avg_error_passive)
x_values = np.linspace(20, 100, num_iterations)

plt.figure(figsize=(10, 5))
for _errors, _label, _color in (
    (avg_error_passive,   "Passive",                    "black"),
    (avg_error_GSx,       "GSx (original)",             "green"),
    (avg_error_GSxAvg,    "GSx (average)",              "lime"),
    (avg_error_GSy,       "GSy (original)",             "orange"),
    (avg_error_GSyAvg,    "GSy (average)",              "gold"),
    (avg_error_iGS,       "iGS (original)",             "blue"),
    (avg_error_iGSAvg,    "iGS (average)",              "cyan"),
    (avg_error_iGSStd,    "iGS (standardized min)",     "red"),
    (avg_error_iGSAvgStd, "iGS (average standardized)", "purple"),
):
    plt.plot(x_values, _errors, label=_label, color=_color)
plt.xlabel("Percent of Labelled Observations")
plt.ylabel("RMSE")
plt.title("Active Learning Mean Error Plot")
plt.legend(loc='upper right')
plt.show()


# Pairwise comparison figures, drawn in this order:
#   GSx (original vs average), GSy (original vs average),
#   iGS (original vs average), iGS (original vs the two standardized variants).
# Each entry is (figure title, ((error curve, legend label, colour), ...)).
for _title, _curves in (
    ("GSx vs GSx (average)", (
        (avg_error_GSx,    "GSx (original)", "green"),
        (avg_error_GSxAvg, "GSx (average)",  "lime"),
    )),
    ("GSy vs GSy (average)", (
        (avg_error_GSy,    "GSy (original)", "orange"),
        (avg_error_GSyAvg, "GSy (average)",  "gold"),
    )),
    ("iGS vs iGS (average)", (
        (avg_error_iGS,    "iGS (original)", "blue"),
        (avg_error_iGSAvg, "iGS (average)",  "cyan"),
    )),
    ("iGS vs Standardized Variants", (
        (avg_error_iGS,       "iGS (original)",             "blue"),
        (avg_error_iGSStd,    "iGS (standardized min)",     "red"),
        (avg_error_iGSAvgStd, "iGS (average standardized)", "purple"),
    )),
):
    plt.figure(figsize=(8,6))
    for _errors, _label, _color in _curves:
        plt.plot(x_values, _errors, label=_label, color=_color)
    plt.xlabel("Percent of Labelled Observations")
    plt.ylabel("RMSE")
    plt.title(_title)
    plt.legend()
    plt.show()

# Plot for the averaged iGS variant against the standardized variants
# (average vs standardized min vs average standardized).
plt.figure(figsize=(8,6))
# This curve is avg_error_iGSAvg, so it carries the "iGS (average)" label used
# for it in the other figures (it is not the original iGS curve).
plt.plot(x_values, avg_error_iGSAvg, label="iGS (average)", color="cyan")
plt.plot(x_values, avg_error_iGSStd, label="iGS (standardized min)", color="red")
plt.plot(x_values, avg_error_iGSAvgStd, label="iGS (average standardized)", color="purple")
plt.xlabel("Percent of Labelled Observations")
plt.ylabel("RMSE")
plt.title("iGSAvg vs Standardized Variants")
plt.legend()
plt.show()




In [None]:
import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

# Build the {selector name -> error curve} mapping handed to
# WilcoxonRankSignedTest. Each result_* is assumed to be a dictionary returned
# from OneIterationFunction.
# NOTE(review): result_* hold a single run only (after the simulation loop
# above, the last seed); the results_* lists are not used here -- confirm this
# is the intended input for the test.
SimulationErrorResults = {
    selector: np.array(run["ErrorVec"]["Error"].values)
    for selector, run in (
        ('passive', result_passive),
        ('GSx', result_GSx),
        ('GSxAvg', result_GSxAvg),
        ('GSy', result_GSy),
        ('GSyAvg', result_GSyAvg),
        ('iGS', result_iGS),
        ('iGSAvg', result_iGSAvg),
        ('iGSStd', result_iGSStd),
        ('iGSAvgStd', result_iGSAvgStd),
    )
}

wilcox_results = WilcoxonRankSignedTest(SimulationErrorResults)
print(wilcox_results)


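# Additional Datasets and Models

The remaining cells extend the comparison to more regression datasets and model types.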


In [None]:
from sklearn.datasets import load_diabetes, fetch_california_housing, fetch_openml
import pandas as pd
### Import Packages ###

import os
import math as math
import pandas as pd
import random as random

### Append Path ###
import sys
sys.path.append('..')

### Local Packages ###
from utils.Prediction import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Main import *

# Every frame built below ends up with its label in a column named "target".

# 1. Diabetes Dataset (Regression)
diabetes = load_diabetes()
df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

# 2. California Housing Dataset (Regression)
# The bundled frame already contains the label as "MedHouseVal"; it is renamed
# in place, so california.frame itself is modified.
california = fetch_california_housing(as_frame=True)
df_california = california.frame
df_california.rename(columns={"MedHouseVal": "target"}, inplace=True)

# 3. Boston Housing Dataset (Regression)
# Note: Boston Housing is deprecated for ethical reasons.
boston = fetch_openml(name="boston", version=1, as_frame=True)
df_boston = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    target=boston.target
)

# 4. Ames Housing Dataset (Regression)
# Fetched by dataset ID to avoid URL issues with spaces in the name.
ames = fetch_openml(data_id=42165, as_frame=True)
df_ames = pd.DataFrame(ames.data).assign(target=ames.target)


# Registry of all loaded frames, keyed by dataset name.
datasets = dict(
    Diabetes=df_diabetes,
    CaliforniaHousing=df_california,
    BostonHousing=df_boston,
    AmesHousing=df_ames,
)

# To inspect one of the datasets (e.g., Ames Housing):
# datasets["AmesHousing"].head()


In [None]:
import os
# Debug aid: show the current working directory. The relative "dataset_files"
# path used when pickling the datasets below is resolved against it.
print(os.getcwd())



In [None]:
# Re-declare `datasets` with only Diabetes active, replacing the four-entry
# dictionary built when the data was loaded. Uncomment an entry to include
# that frame in the processing and experiments below.
datasets = {
    "Diabetes": df_diabetes,
    # "CaliforniaHousing": df_california,
    # "BostonHousing": df_boston,
    # "AmesHousing": df_ames,
}

def drop_non_numeric_columns(df, label_col="Y"):
    """Return a new frame with the numeric feature columns plus the label.

    Columns other than ``label_col`` are kept only if their dtype is numeric.
    If ``label_col`` exists in ``df`` it is re-attached as the last column
    whatever its dtype; if it does not exist, only the numeric columns are
    returned. ``df`` itself is not modified.
    """
    # Boolean mask over the columns: everything except the label is a feature.
    is_feature = df.columns != label_col
    kept = df.loc[:, is_feature].select_dtypes(include=["number"]).copy()

    # Put the label back, bypassing the numeric filter.
    if label_col in df.columns:
        kept[label_col] = df[label_col]

    return kept

import os

# Normalise every dataset to numeric features plus a label column named "Y".
# The rename must happen BEFORE the numeric filter: drop_non_numeric_columns
# only protects the column named by label_col, so a label still called
# "target" would be treated as a feature and dropped if it were non-numeric.
# Iterate over a snapshot of the items because the dict is updated in the loop.
for dataset_name, df in list(datasets.items()):
    labelled_df = df.rename(columns={"target": "Y"})  # no-op when "target" is absent
    datasets[dataset_name] = drop_non_numeric_columns(labelled_df, label_col="Y")

# Make sure the output folder exists before writing into it.
os.makedirs("dataset_files", exist_ok=True)

# Persist each prepared dataset as dataset_files/<name>.pkl.
for name, df in datasets.items():
    filename = os.path.join("dataset_files", f"{name}.pkl")
    df.to_pickle(filename)
    print(f"Saved {name} dataset as {filename}")



# 2. Model and selector names handed to run_active_learning_experiments.
model_types = [
    "LinearRegressionFunction", "RandomForestRegressorFunction",
    "XGBoostRegressorFunction", "NeuralNetworkFunction",
]

# Passive baseline first, then the GSx, GSy and iGS families.
selector_types = (
    ["PassiveLearning"]
    + ["GSxFunction", "GSxFunctionAverage"]
    + ["GSyFunction", "GSyFunctionAverage"]
    + [
        "iGSFunction",
        "iGSFunctionAverage",
        "iGSFunctionStandardized",
        "iGSFunctionAverageStandardized",
    ]
)

# The experiment runner receives these two project functions as arguments.
from utils.Main import OneIterationFunction, TrainTestCandidateSplit

# Run the experiments over the datasets, models and selectors listed above.
all_results = run_active_learning_experiments(
    datasets=datasets,
    model_types=model_types,
    selector_types=selector_types,
    OneIterationFunction=OneIterationFunction,
    TrainTestCandidateSplit=TrainTestCandidateSplit,
    n_simulations=2,          # small value for a quick run; set 100+ for full experiments
    test_proportion=0.2,
    candidate_proportion=0.8,
    plot=False,
    do_wilcoxon=False,
    round_wilcoxon=4,
    output_dir="results"
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def manual_active_learning_plot(results_summary, dataset_name, model_name, candidate_proportion=0.8):
    """
    Plot the average error curve of every selector recorded in
    results_summary for one (dataset_name, model_name) pair.

    Parameters
    ----------
    results_summary : dict
        Mapping keyed by (dataset, model, selector) tuples whose values expose
        the average error curve under "avg_error" (e.g. the dictionary
        returned by run_active_learning_experiments).
    dataset_name : str
        The dataset key to plot (e.g. "BostonHousing").
    model_name : str
        The model key to plot (e.g. "LinearRegressionFunction").
    candidate_proportion : float
        Sets where the x-axis starts: 100 * (1 - candidate_proportion) percent.

    Returns
    -------
    None. Shows a matplotlib figure, or prints a message and returns early
    when no entry matches the requested dataset and model.
    """

    # Keep only the curves belonging to the requested dataset and model.
    curves_by_selector = {
        selector: entry["avg_error"]
        for (ds, model, selector), entry in results_summary.items()
        if ds == dataset_name and model == model_name
    }

    # Nothing to draw: say so and stop.
    if not curves_by_selector:
        print(f"No results found for dataset={dataset_name}, model={model_name}")
        return

    plt.figure(figsize=(10, 5))

    # With the default candidate_proportion of 0.8 the axis starts at 20%.
    start_percent = 100*(1 - candidate_proportion)
    for selector, curve in curves_by_selector.items():
        # Spread this curve's points evenly over [start_percent, 100].
        percent_labelled = np.linspace(start_percent, 100, len(curve))
        plt.plot(percent_labelled, curve, label=selector)

    plt.title(f"Active Learning Mean Error Plot\n{dataset_name} with {model_name}")
    plt.xlabel("Percent of labelled observations")
    plt.ylabel("RMSE")
    plt.legend(loc='upper right')
    plt.show()


# EXAMPLE USAGE:
# Draw one figure per model for the Diabetes dataset, using the summary
# returned by run_active_learning_experiments (all_results).
for _model_name in (
    "LinearRegressionFunction",
    "RandomForestRegressorFunction",
    "XGBoostRegressorFunction",
    "NeuralNetworkFunction",
):
    manual_active_learning_plot(
        results_summary=all_results,
        dataset_name="Diabetes",
        model_name=_model_name,
        candidate_proportion=0.8,  # or whatever you used in run_active_learning_experiments
    )

