In [None]:
# Import the standard-library, numerical and sklearn modules necessary for the following sections
import os
import statistics

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, LeaveOneOut, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.utils import resample

In [None]:
def resampling(X_data, y_data, n_samples, noise, random_state=None):
    """
    Bootstrap a dataset to n_samples rows and add Gaussian noise to the y data.

    Inputs:
        X_data: array (or list) of feature vectors
        y_data: array (or list) of outputs, aligned with X_data
        n_samples: desired final data size
        noise: standard deviation of the zero-mean Gaussian added to y
        random_state: optional int seed. None (the default) keeps drawing
            from numpy's global random state; an int makes both the
            resampling and the noise reproducible.

    Outputs: resampled feature vectors (same container type that
    sklearn.utils.resample returns for X_data) and a float numpy array of
    resampled, noise-perturbed outputs.
    """
    # Resample n_samples (X, y) pairs; the pairing between X and y is preserved
    X_resample, y_resample = resample(X_data, y_data, n_samples=n_samples, random_state=random_state)

    # resample() hands lists back as lists, which have no .size/.shape and
    # cannot be added to an array of noise element-wise without conversion
    y_resample = np.asarray(y_resample, dtype=float)

    # np.random (the module) exposes the global state through the same
    # .normal() call a dedicated RandomState instance offers
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    noise_array = rng.normal(0., noise, size=y_resample.shape)

    return X_resample, y_resample + noise_array

In [None]:
def scaleData(X_train,X_test,X_tot):
    """
    Standardize feature vectors so that the individual features more or less
    look like standard normally distributed data: zero mean and unit variance.

    The scaler is fitted on X_train only; the same fitted transform is then
    applied to X_train, X_test and X_tot.

    Returns: (X_train_scaled, X_test_scaled, X_tot_scaled)
    """
    fitted_scaler = preprocessing.StandardScaler().fit(X_train)
    return tuple(fitted_scaler.transform(features) for features in (X_train, X_test, X_tot))

In [None]:

def splitBootScale(x,y,bootSize = 800,trainSize = 0.8, noise = 0.5, random_state = 85):
    """
    Split data into train and test sets (80/20 by default), bootstrap each
    side so that the two together hold bootSize samples (800 by default) and
    scale the resulting feature vectors.

    Inputs:
        x, y: feature vectors and outputs
        bootSize: total number of bootstrapped samples (train + test)
        trainSize: fraction of the data (and of bootSize) used for training
        noise: standard deviation of the Gaussian noise added to the
            bootstrapped outputs (passed to resampling)
        random_state: seed of the train/test split

    Returns: X_train_bootScaled, X_test_bootScaled, y_train_boot,
    y_test_boot, x_scaled (all of x transformed by the scaler fitted on the
    bootstrapped training features)
    """
    testSize = 1-trainSize
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = trainSize, test_size = testSize, random_state=random_state)
    print(len(X_train), len(X_test))

    bootTrainSize = round(bootSize*trainSize)
    print(bootTrainSize)
    # Take the complement rather than rounding a second time: two independent
    # round() calls can add up to bootSize - 1 or bootSize + 1
    bootTestSize = bootSize - bootTrainSize
    print(bootTestSize)

    X_train_boot, y_train_boot = resampling(X_train, y_train, bootTrainSize, noise)
    X_test_boot, y_test_boot = resampling(X_test, y_test, bootTestSize, noise)
    X_train_bootScaled, X_test_bootScaled, x_scaled = scaleData(X_train_boot,X_test_boot,x)
    return X_train_bootScaled, X_test_bootScaled, y_train_boot, y_test_boot, x_scaled

In [None]:
'''
Three functions that are necessary to convert lists into strings and vice versa. For example,
converting the list [1,2,3] to the string "[1,2,3]". This is necessary in order to legibly store long
vectors in a .csv file using the Pandas module.

'''


def str2float(string):
    """
    Convert a comma-separated string of numbers into a list of floats.

    Example: "1, 2.5,3" -> [1.0, 2.5, 3.0]

    A blank string stands for an empty vector and yields []. Any other
    token that float() cannot parse still raises ValueError.
    """
    # "".split(',') is [''], and float('') raises; treat blank input as "no values"
    if not string.strip():
        return []
    return [float(token) for token in string.split(',')]

def stringToList(vectors):
    """
    Convert an iterable of bracketed strings into a list of float lists.

    Example: ["[1, 2, 3]", "[0.5]"] -> [[1.0, 2.0, 3.0], [0.5]]

    Every '[' and ']' character is removed before the comma-separated values
    are parsed. A string with nothing between the brackets (e.g. "[]")
    becomes an empty list, so the strings written by listToString can be
    read back even for empty vectors.
    """
    xList = []
    for text in vectors:
        stripped = text.replace('[', '').replace(']', '')
        if not stripped.strip():
            # Nothing between the brackets: float('') would raise ValueError
            xList.append([])
        else:
            xList.append([float(token) for token in stripped.split(',')])
    return xList

def listToString(vectors):
    """
    Render each vector as a bracketed, comma-separated string.

    Example: [[1, 2, 3], [0.5]] -> ['[1, 2, 3]', '[0.5]']

    Elements are formatted with str() and joined with ', '.
    """
    return [
        '[' + ', '.join(map(str, vectors[position])) + ']'
        for position in range(len(vectors))
    ]

In [None]:
def uploadIsoFinal(path=None):
    """
    Uploads information about detected species and groups the rows by their
    'Exact' / 'Predict' flags.

    Inputs:
        path: optional location of the dataset CSV. Defaults to
            'all_files/Updated_Smiles/detectionDataset.csv' under the current
            working directory.

    The CSV must provide the columns 'Exact', 'Predict', 'smiles', 'N' and
    'mol2vecIsoSameNew' (feature vectors stored as bracketed strings).
    Rows are kept only when Exact == "Y":
        * Predict == "Y"  -> the "Both" group
        * anything else   -> the "Train" group
    NOTE(review): the group names suggest "Both" rows are used for training
    and evaluation while "Train" rows are training-only - confirm against the
    dataset description.

    Outputs (in this order):
        xListBoth:   feature vectors of the "Both" group (list of float lists)
        nBoth:       numpy array of log10 of the 'N' values of the "Both" group
        smilesBoth:  SMILES strings of the "Both" group
        xListTrain:  feature vectors of the "Train" group
        nTrain:      numpy array of log10 of the 'N' values of the "Train" group
        smilesTrain: SMILES strings of the "Train" group
    """
    if path is None:
        path = os.path.join(os.getcwd(), 'all_files/Updated_Smiles/detectionDataset.csv')
    fullUpload = pd.read_csv(path)

    exactList = list(fullUpload['Exact'])
    predictList = list(fullUpload['Predict'])
    smileList = list(fullUpload['smiles'])
    nList = list(fullUpload['N'])
    # Feature vectors are stored as strings such as "[0.1, 0.2]"; parse them
    xList = stringToList(list(fullUpload['mol2vecIsoSameNew']))

    xListBoth, nBoth, smilesBoth = [], [], []
    xListTrain, nTrain, smilesTrain = [], [], []
    for exact, predict, vector, density, smiles in zip(exactList, predictList, xList, nList, smileList):
        if exact != "Y":
            # Rows without Exact == "Y" belong to neither group
            continue
        if predict == "Y":
            xListBoth.append(vector)
            nBoth.append(density)
            smilesBoth.append(smiles)
        else:
            xListTrain.append(vector)
            nTrain.append(density)
            smilesTrain.append(smiles)

    nBoth = np.log10(np.asarray(nBoth, dtype=float))
    nTrain = np.log10(np.asarray(nTrain, dtype=float))

    return xListBoth, nBoth, smilesBoth, xListTrain, nTrain, smilesTrain


In [None]:
# Determine best hyperparameter combination through GridSearchCV

def best_params(X, y, model, parameters, scoring, cv=None, n_jobs=None):
    """
    Perform a grid search and return the best hyperparameters.

    The model is wrapped in a Pipeline of ('scaler', StandardScaler) followed
    by ('model', model), so the keys of `parameters` must use the 'model__'
    prefix. The scaler is not fitted here: GridSearchCV fits a fresh copy of
    the whole pipeline on every training fold, which keeps the held-out fold
    out of the scaling statistics.

    Inputs:
        X, y: dataset used for the search
        model: unfitted estimator
        parameters: dictionary (param_grid) of hyperparameter candidates
        scoring: scoring function name passed to GridSearchCV
        cv: optional cross-validation spec; None uses GridSearchCV's default
        n_jobs: optional number of parallel jobs; None uses GridSearchCV's default

    Returns: search.best_params_ (dictionary of the best combination)

    error_score='raise' makes any failing parameter combination abort the
    search with its exception instead of being scored as NaN.
    """
    pipeline = Pipeline([('scaler', preprocessing.StandardScaler()), ('model', model)])
    search = GridSearchCV(pipeline, scoring=scoring, param_grid=parameters,
                          error_score='raise', cv=cv, n_jobs=n_jobs)
    search.fit(X, y)
    return search.best_params_

In [None]:
# Hyperparameter dictionaries from Kelvin's github, umda/notebooks/estimator_training/model_hparams.yml
# Every key carries the 'model__' prefix because the grids are searched through
# a Pipeline whose estimator step is named 'model' (see best_params).

# Random forest
# NOTE(review): the 13 in 'model__min_samples_leaf' is out of order and may be
# a typo for "1, 3" - confirm against the source yml before changing it.
RFR_parameters = {'model__n_estimators': [10, 20, 50, 80, 100, 125, 150, 200],
                  'model__max_depth': [None, 5, 10, 15, 20],
                  'model__max_leaf_nodes': [None, 5, 10, 15, 20,40],
                  'model__min_samples_leaf': [0.1, 0.3, 0.5, 13, 5, 10, 15, 20, 25, 35],
                  'model__max_features': [0.1, 0.2, 0.5, 0.7, 1.]}

# Ridge regression
RR_parameters = {'model__alpha': [1e-2, 5e-2, 1e-1, 5e-1, 1.]}

# Support vector regression (C is sampled log-uniformly between 10^1.5 and 10^2)
SVR_parameters = {'model__C': 10**np.linspace(1.5, 2., 20),
                  'model__epsilon': [1e-3, 1e-2, 1e-1, 1.],
                  'model__gamma' : ["auto", 0.05, 0.1],
                  'model__tol': [1e-5]}

# K nearest neighbors
KNN_parameters = {'model__n_neighbors': [2, 4, 10, 15, 30, 50, 70],
                  'model__metric': ["cosine", "euclidean"],
                  'model__weights': ["uniform", "distance"]}

# Gradient boosting (learning rates 1e-3, 1e-2, 1e-1, 1, 10)
GBR_parameters = {'model__learning_rate': 10 ** np.linspace(-3.0, 1.0, 5),
                  'model__n_estimators':   [5, 10, 30, 50, 80, 100, 125, 150, 200],
                  'model__subsample': [0.2, 0.4,  0.6, 0.8, 1.],
                  'model__max_depth': [1, 2, 3, 4, 5, 6]}

# Bayesian ridge: a single candidate per key, so the "search" only evaluates this one setting
BR_parameters = {'model__tol': [1e-7],
                 'model__alpha_1': [1e5],
                 'model__alpha_2': [1e5],
                 'model__lambda_1': [1e5],
                 'model__lambda_2': [1e5],
                 'model__alpha_init': [100.],
                 'model__lambda_init': [ 100.]}

# Candidate values for the Gaussian process kernel components
aList = [10,100,300,500]  # NOTE(review): kept for compatibility; no kernel term below uses it
lsList = [1,3,5,7]        # RBF length scales
sigList = [1e-7, 1e-5, 1e-3]  # DotProduct sigma_0 values
nlList = [0.1, 0.3, 0.5, 0.7]  # WhiteKernel noise levels


# The kernel list iterates only over the values the kernel expression actually
# uses. Looping over aList as well would repeat each of the 48 distinct kernels
# four times and quadruple the grid search without adding any new candidate.
GPR_parameters = {'model__alpha': [1e-10, 3e-5, 1e-3, 0.02, 10.],
                  'model__n_restarts_optimizer': [3, 5, 10,15,20],
                  'model__kernel': [RBF(length_scale=ls) + WhiteKernel(noise_level=nl) + DotProduct(sigma_0=sig)
                                    for ls in lsList for sig in sigList for nl in nlList]}

In [None]:

def splitBootScaleFinal(trainSize = 0.8, testSize = 0.2, bootSize = 800, noise = 0.5, random_state = 33):
    """
    Splits data 80/20 into train and test sets, bootstraps to a total of 800 samples and scales
    the resulting feature vectors. Ensures some molecules are in the train set because of their
    isotopic composition.

    For example, since there is only one species that is substituted with 36-sulfur, we ensured that
    this molecule was present in the training set. These training-only molecules are the
    "Train" group returned by uploadIsoFinal; only its "Both" group is split.

    Inputs:
        trainSize, testSize: fractions passed to train_test_split and used to size the bootstraps
        bootSize: total number of bootstrapped samples (train + test)
        noise: standard deviation of the Gaussian noise added by resampling
        random_state: seed of the train/test split

    Returns:

    X_train_scaled: Scaled, bootstrapped feature vectors in training set
    y_train_boot: bootstrapped column densities in training set
    X_test_scaled:  Scaled, bootstrapped feature vectors in testing set
    y_test_boot: bootstrapped column densities in testing set
    X_tot_scaled: scaled, non-bootstrapped molecular feature vectors
    nBoth: non-bootstrapped column densities
    X_trainFinal: non-bootstrapped feature vectors in training set
    y_trainFinal: non-bootstrapped column densities in training set
    X_testFinal: non-bootstrapped feature vectors in testing set
    y_testFinal: non-bootstrapped column densities in testing set

    The four *Final lists cover the split ("Both") molecules only, in dataset
    order, and are not scaled.
    """
    xListBoth, nBoth, smilesBoth, xListTrain, nTrain, smilesTrain = uploadIsoFinal()
    bootTrainSize = round(bootSize*trainSize)
    bootTestSize = round(bootSize*testSize)

    # Split row positions instead of the rows themselves. The shuffle depends
    # only on the number of samples and the seed, so the partition is the one
    # train_test_split(xListBoth, nBoth, ...) would give, but membership is
    # now known per row and never has to be recovered by comparing vectors
    # (which double-counts molecules that share a feature vector).
    positions = np.arange(len(xListBoth))
    train_idx, test_idx = train_test_split(positions, train_size = trainSize, test_size = testSize, random_state = random_state)

    X_test = [xListBoth[i] for i in test_idx]
    y_test = nBoth[test_idx]

    # Training set = training side of the split + the training-only molecules
    X_train = np.asarray([xListBoth[i] for i in train_idx] + list(xListTrain))
    y_train = np.concatenate([nBoth[train_idx], nTrain])
    print(len(X_train), len(X_test))

    X_train_boot, y_train_boot = resampling(X_train, y_train, bootTrainSize, noise)
    X_test_boot, y_test_boot = resampling(X_test, y_test, bootTestSize, noise)

    # Non-bootstrapped train/test partition of the split molecules, in dataset order
    in_test = np.zeros(len(xListBoth), dtype=bool)
    in_test[test_idx] = True
    X_trainFinal, y_trainFinal = [], []
    X_testFinal, y_testFinal = [], []
    for idx, vector in enumerate(xListBoth):
        if in_test[idx]:
            X_testFinal.append(vector)
            y_testFinal.append(nBoth[idx])
        else:
            X_trainFinal.append(vector)
            y_trainFinal.append(nBoth[idx])

    # Fit the scaler on the bootstrapped training features only
    scaler = preprocessing.StandardScaler().fit(X_train_boot)
    X_train_scaled = scaler.transform(X_train_boot)
    X_test_scaled = scaler.transform(X_test_boot)
    X_tot_scaled = scaler.transform(xListBoth)

    return X_train_scaled, y_train_boot, X_test_scaled, y_test_boot, X_tot_scaled, nBoth, X_trainFinal, y_trainFinal, X_testFinal, y_testFinal

In [None]:

def optimizeParam(X_train_sb, y_train_sb):
    """
    Function that returns the optimized hyperparameters of the models to be trained.

    Each model is tuned with best_params using 'neg_mean_squared_error' as
    the scoring function and its module-level parameter grid; the winning
    combination is printed as soon as its search finishes.

    Inputs: X and y training data used to optimize hyperparameters
    Returns: Optimized hyperparameters, in the order
        GBR, RR, SVR, KNN, BR, RFR, GPR
    """
    # (key, message, factory) in search order. The factories defer building the
    # estimator and looking up its grid until that model's turn comes.
    stages = (
        ('RR', 'Optimized Ridge parameters:',
         lambda: (Ridge(), RR_parameters)),
        ('SVR', 'Optimized SVR parameters:',
         lambda: (SVR(), SVR_parameters)),
        ('KNN', 'Optimized KNN parameters:',
         lambda: (KNeighborsRegressor(), KNN_parameters)),
        ('BR', 'Optimized Bayesian Ridge parameters:',
         lambda: (BayesianRidge(), BR_parameters)),
        ('RFR', 'Optimized Random Forest parameters:',
         lambda: (RandomForestRegressor(), RFR_parameters)),
        ('GBR', 'Optimized GBR parameters:',
         lambda: (GradientBoostingRegressor(), GBR_parameters)),
        ('GPR', 'Optimized GPR parameters:',
         lambda: (GaussianProcessRegressor(kernel = RBF() + WhiteKernel() + DotProduct(), random_state = 72),
                  GPR_parameters)),
    )

    tuned = {}
    for key, message, build in stages:
        estimator, grid = build()
        tuned[key] = best_params(X_train_sb, y_train_sb, estimator,
                                 scoring = 'neg_mean_squared_error', parameters=grid)
        print(message, tuned[key])

    return (tuned['GBR'], tuned['RR'], tuned['SVR'], tuned['KNN'],
            tuned['BR'], tuned['RFR'], tuned['GPR'])

In [None]:
# Build the train/test data, then optimize the hyperparameters of every model
# on the non-bootstrapped training set
(X_train_scaled, y_train_boot, X_test_scaled, y_test_boot, X_tot_scaled,
 nBoth, X_trainFinal, y_trainFinal, X_testFinal, y_testFinal) = splitBootScaleFinal()

(GBR_best_params, RR_best_params, SVR_best_params, KNN_best_params,
 BR_best_params, RFR_best_params, GPR_best_params) = optimizeParam(X_trainFinal, y_trainFinal)


In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, ConstantKernel as C, RationalQuadratic
import numpy as np


'''
These functions train and validate the models used for this project. For each:

Inputs: bootstrapped molecular feature vectors and column densities for both the train and test data,
the optimized hyperparameters and the combined non-bootstrapped train and test feature vectors and column 
densities (xTot, yTot)

'''

def runRFR(X_train_boot, y_train_boot,X_test, y_test, RFR_best_params,xTot,yTot):
    """
    Train a random forest on the bootstrapped training data, score it on the
    full non-bootstrapped dataset and draw the observed-vs-predicted plot.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        RFR_best_params: dictionary with the 'model__*' random forest keys
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, where the train/test pieces come from an 80/20 split of
    (xTot, yTot) with random_state=85 and the two metrics are computed on
    all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = RandomForestRegressor(
        n_estimators=RFR_best_params['model__n_estimators'],
        max_depth=RFR_best_params['model__max_depth'],
        max_features=RFR_best_params['model__max_features'],
        max_leaf_nodes=RFR_best_params['model__max_leaf_nodes'],
        min_samples_leaf=RFR_best_params['model__min_samples_leaf'],
        random_state = 5,
    )
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)
    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal = result.predict(X_test_eval)
    trainPredFinal = result.predict(X_train_eval)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('Random Forest Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2

def runRR(X_train_boot, y_train_boot,X_test, y_test, RR_best_params,xTot,yTot):
    """
    Train a ridge regression on the bootstrapped training data, score it on
    the full non-bootstrapped dataset and draw the observed-vs-predicted plot.

    Also prints two diagnostics: the index of every negative prediction
    ("INVALID INDEX") and the index of the largest squared error ("ARGMAX").

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        RR_best_params: dictionary with the key 'model__alpha'
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, where the train/test pieces come from an 80/20 split of
    (xTot, yTot) with random_state=85 and the two metrics are computed on
    all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = Ridge(alpha=RR_best_params['model__alpha'], random_state = 9)
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Diagnostics: flag negative predictions and locate the worst squared error
    for i in np.flatnonzero(np.asarray(detectedPred) < 0):
        print("INVALID INDEX")
        print(i)
    squaredErrors = (np.asarray(detectedPred) - np.asarray(yTot))**2
    print("ARGMAX")
    print(np.argmax(squaredErrors))

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal = result.predict(X_test_eval)
    trainPredFinal = result.predict(X_train_eval)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('Ridge Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2


def runSVR(X_train_boot, y_train_boot,X_test, y_test, SVR_best_params,xTot,yTot):
    """
    Train a support vector regressor on the bootstrapped training data, score
    it on the full non-bootstrapped dataset and plot observed vs predicted
    values, with the train and test molecules drawn as separate series.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        SVR_best_params: dictionary with 'model__C', 'model__epsilon',
            'model__tol' and, optionally, 'model__gamma'. The tuned gamma is
            applied when present; without the key SVR's default 'scale' is used.
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, where the train/test pieces come from an 80/20 split of
    (xTot, yTot) with random_state=85 and the two metrics are computed on
    all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = SVR(
        C=SVR_best_params['model__C'],
        epsilon=SVR_best_params['model__epsilon'],
        # gamma is part of the tuning grid; leaving it out would train a
        # different model from the one the search selected
        gamma=SVR_best_params.get('model__gamma', 'scale'),
        tol=SVR_best_params['model__tol'],
    )
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal = result.predict(X_test_eval)
    trainPredFinal = result.predict(X_train_eval)

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(y_train_eval, trainPredFinal, color = 'blue', alpha=0.5, label = 'Train')
    plt.scatter(y_test_eval, testPredFinal, color = 'black', alpha=0.5, label = 'Test')
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.legend()
    plt.title('Support Vector Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2

    
def runBR(X_train_boot, y_train_boot,X_test, y_test, BR_best_params,xTot,yTot):
    """
    Train a Bayesian ridge regression on the bootstrapped training data,
    score it on the full non-bootstrapped dataset and draw the
    observed-vs-predicted plot.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        BR_best_params: dictionary with the 'model__*' Bayesian ridge keys
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, BR_trainSD, BR_testSD. The train/test pieces come from an
    80/20 split of (xTot, yTot) with random_state=85, the two SD arrays are
    the predictive standard deviations on those pieces (train first), and
    the two metrics are computed on all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = BayesianRidge(
        tol=BR_best_params['model__tol'],
        alpha_1=BR_best_params['model__alpha_1'],
        alpha_2=BR_best_params['model__alpha_2'],
        lambda_1=BR_best_params['model__lambda_1'],
        lambda_2=BR_best_params['model__lambda_2'],
        alpha_init=BR_best_params['model__alpha_init'],
        lambda_init=BR_best_params['model__lambda_init'],
    )
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal, BR_testSD = result.predict(X_test_eval, return_std=True)
    trainPredFinal, BR_trainSD = result.predict(X_train_eval, return_std=True)

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('Bayesian Ridge Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2, BR_trainSD, BR_testSD


def runKNN(X_train_boot, y_train_boot,X_test, y_test, KNN_best_params,xTot,yTot):
    """
    Train a k-nearest-neighbors regressor on the bootstrapped training data,
    score it on the full non-bootstrapped dataset and draw the
    observed-vs-predicted plot.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        KNN_best_params: dictionary with 'model__n_neighbors',
            'model__metric' and 'model__weights'
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, where the train/test pieces come from an 80/20 split of
    (xTot, yTot) with random_state=85 and the two metrics are computed on
    all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = KNeighborsRegressor(
        n_neighbors=KNN_best_params['model__n_neighbors'],
        metric=KNN_best_params['model__metric'],
        weights=KNN_best_params['model__weights'],
    )
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal = result.predict(X_test_eval)
    trainPredFinal = result.predict(X_train_eval)

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('K Nearest Neighbors Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2

def runGBR(X_train_boot, y_train_boot,X_test, y_test, GBR_best_params,xTot,yTot):
    """
    Train a gradient boosting regressor on the bootstrapped training data,
    score it on the full non-bootstrapped dataset and draw the
    observed-vs-predicted plot.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        GBR_best_params: dictionary with 'model__max_depth',
            'model__n_estimators', 'model__learning_rate' and 'model__subsample'
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, where the train/test pieces come from an 80/20 split of
    (xTot, yTot) with random_state=85 and the two metrics are computed on
    all of (xTot, yTot).

    The regressor is built without a random_state, unlike the random forest
    and ridge models in this notebook.

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    model = GradientBoostingRegressor(
        max_depth=GBR_best_params['model__max_depth'],
        n_estimators=GBR_best_params['model__n_estimators'],
        learning_rate = GBR_best_params['model__learning_rate'],
        subsample= GBR_best_params['model__subsample'],
    )
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal = result.predict(X_test_eval)
    trainPredFinal = result.predict(X_train_eval)

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('Gradient Boosting Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2

def runGPR(X_train_boot, y_train_boot,X_test, y_test, GPR_best_params,xTot,yTot):
    """
    Train a Gaussian process regressor on the bootstrapped training data,
    score it on the full non-bootstrapped dataset and draw the
    observed-vs-predicted plot.

    Inputs:
        X_train_boot, y_train_boot: bootstrapped training features / targets
        X_test, y_test: accepted for interface compatibility; not used
        GPR_best_params: accepted for interface compatibility; not used -
            the kernel and the remaining settings are fixed below
        xTot, yTot: non-bootstrapped features / targets to evaluate on

    Returns: y_train, trainPredFinal, y_test, testPredFinal, detectedMSE,
    detectedR2, testSD, trainSD. The train/test pieces come from an 80/20
    split of (xTot, yTot) with random_state=85, the two SD arrays are the
    predictive standard deviations on those pieces (test first, the opposite
    order from runBR), and the two metrics are computed on all of (xTot, yTot).

    NOTE(review): that split matches splitBootScale (seed 85) but not
    splitBootScaleFinal (seed 33) - confirm which partition callers expect.
    """
    # The kernel definition and its starting values were tuned by hand until
    # the test score looked reasonable; they do not come from the grid search.
    kernel = RationalQuadratic(alpha=347, length_scale=4.22) + DotProduct(sigma_0=1.61e-05) + WhiteKernel(noise_level=0.478)

    model = GaussianProcessRegressor(alpha=1e-10, kernel=kernel,normalize_y=True, random_state=55, n_restarts_optimizer= 20)
    result = model.fit(X_train_boot, y_train_boot)

    detectedPred = result.predict(xTot)

    # Separate names so the split does not shadow the X_test / y_test parameters
    X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(xTot, yTot, train_size = 0.8, random_state = 85)
    testPredFinal, testSD = result.predict(X_test_eval, return_std=True)
    trainPredFinal, trainSD = result.predict(X_train_eval, return_std=True)

    detectedMSE = mean_squared_error(yTot,detectedPred)
    detectedR2 = r2_score(yTot,detectedPred)

    # End points of the dashed y = x reference line and anchor of the annotations
    lineStart = 13
    lineEnd = 20.5

    plt.figure()
    plt.scatter(yTot, detectedPred, color = 'blue', alpha=0.5)
    plt.plot(np.arange(lineStart,lineEnd), np.arange(lineStart, lineEnd), ls="--", alpha=0.4, color="k")
    plt.title('Gaussian Process Regression')
    # Raw strings: the labels contain LaTeX backslashes
    plt.xlabel(r"Observed column density ($\log_{10}$ cm$^{-2}$)")
    plt.ylabel(r"Predicted Column Density ($\log_{10}$ cm$^{-2}$)")
    plt.annotate("$R^2$ = {:.3f}".format(detectedR2), (lineStart + 0.25, lineEnd - 2), fontsize = "small")
    plt.annotate("MSE = {:.3f}".format(detectedMSE), (lineStart + 0.25, lineEnd - 1.5), fontsize = "small")
    plt.show()

    return y_train_eval, trainPredFinal, y_test_eval, testPredFinal, detectedMSE, detectedR2, testSD, trainSD

In [None]:

'''
This function creates the train/test performance graphic seen in the text.

Inputs: optimized hyperparameters
Returns: None
'''

def getPredictionGraph(RR_best_params, SVR_best_params, KNN_best_params, BR_best_params, RFR_best_params, GBR_best_params, GPR_best_params):
    

    kernel = RBF(length_scale=7) + WhiteKernel(noise_level=0.3, noise_level_bounds = (1e-10,1e5)) + DotProduct(sigma_0=0.001, sigma_0_bounds = (1e-10,1e5))
    GPR_model = GaussianProcessRegressor(alpha=1e-10, kernel=kernel,normalize_y=True, random_state=32, n_restarts_optimizer= 20)
    LR_model = LinearRegression()
    RFR_model = RandomForestRegressor(n_estimators=RFR_best_params['model__n_estimators'], max_depth=RFR_best_params['model__max_depth'], max_features=RFR_best_params['model__max_features'], max_leaf_nodes=RFR_best_params['model__max_leaf_nodes'], min_samples_leaf=RFR_best_params['model__min_samples_leaf'], random_state = 39)
    RR_model = Ridge(alpha=RR_best_params['model__alpha'], random_state = 15)
    SVR_model = SVR(C=SVR_best_params['model__C'], epsilon=SVR_best_params['model__epsilon'], tol=SVR_best_params['model__tol'])
    BR_model = BayesianRidge(tol=BR_best_params['model__tol'], alpha_1=BR_best_params['model__alpha_1'], alpha_2=BR_best_params['model__alpha_2'], lambda_1=BR_best_params['model__lambda_1'], lambda_2=BR_best_params['model__lambda_2'], alpha_init=BR_best_params['model__alpha_init'], lambda_init=BR_best_params['model__lambda_init'])
    KNN_model = KNeighborsRegressor(n_neighbors=KNN_best_params['model__n_neighbors'], metric=KNN_best_params['model__metric'], weights=KNN_best_params['model__weights'])
    GBR_model = GradientBoostingRegressor(max_depth=GBR_best_params['model__max_depth'], n_estimators=GBR_best_params['model__n_estimators'], learning_rate = GBR_best_params['model__learning_rate'], subsample= GBR_best_params['model__subsample'])

    GPR_y_train, GPR_train_pred, GPR_y_test, GPR_test_pred, mse_GPR, r2_GPR, GPR_testSD, GPR_trainSD = runPredictionsSD(GPR_model)
    LR_y_train, LR_train_pred, LR_y_test, LR_test_pred, mse_LR, r2_LR = runPredictions(LR_model)
    RR_y_train, RR_train_pred, RR_y_test, RR_test_pred, mse_RR, r2_RR = runPredictions(RR_model)
    BR_y_train, BR_train_pred, BR_y_test, BR_test_pred, mse_BR, r2_BR, BR_testSD, BR_trainSD = runPredictionsSD(BR_model)
    KNN_y_train, KNN_train_pred, KNN_y_test, KNN_test_pred, mse_KNN, r2_KNN = runPredictions(KNN_model)
    RFR_y_train, RFR_train_pred, RFR_y_test, RFR_test_pred, mse_RFR, r2_RFR = runPredictions(RFR_model)
    SVR_y_train, SVR_train_pred, SVR_y_test, SVR_test_pred, mse_SVR, r2_SVR = runPredictions(SVR_model)
    GBR_y_train, GBR_train_pred, GBR_y_test, GBR_test_pred, mse_GBR, r2_GBR = runPredictions(GBR_model)
    
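    # Neural network data (from PyTorch): hard-coded observed values and predictions.
    # NOTE(review): the observed lists (NN_yTest, NN_yTrain) are much longer than the
    # prediction lists (NN_yTestPred has 16 values); the metric and scatter calls below
    # need equal lengths -- confirm that the pasted lists come from the same run.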
    
    NN_yTest = [15.919154394339389, 15.849458625797741, 16.935144192672173, 16.69216422828302, 16.49136465623307, 16.692602284122202, 16.016758426548687, 17.299666487382407, 16.810816417560304, 16.961536388698566, 14.63028651851707, 16.984374500364794, 14.02813622568437, 16.336188601372427, 16.868407011652817, 17.358516318270418, 15.77225877967236, 16.193113555005787, 16.63323865878684, 16.292997806313842, 16.703122887404838, 16.338418050422728, 16.716775097593153, 16.840269785908628, 15.186859669132943, 16.041662544134205, 15.535941181223532, 18.21812197452038, 17.21296050941837, 15.830691104059609, 15.027629321398312, 17.507809646467443, 16.116577259214345, 16.57379473583805, 16.30051782081765, 16.45380700361194, 15.390878578017247, 15.576976451098542, 13.882386025927604, 16.153977214693228, 16.16028732298799, 13.89253363326706, 14.262345383231319, 17.090710511518516, 15.67770210799634, 16.4970289428013, 16.048294930157834, 14.96925779867635, 16.043874129978615, 16.474407258735578, 16.7665541211576, 15.939877962045918, 16.223736623079056, 18.54229513427555, 15.859380837209564, 14.59258964952934, 15.606391200397153, 17.084152467947003, 14.261878023371107, 14.816976311836315, 17.033470449317825, 15.34137836149357, 14.065706426035764, 15.254693407948242, 16.712355346816672, 14.413701811220728, 15.794554313963536, 15.604395088689378, 14.232533443300357, 17.5457030768914, 16.207863760487317, 17.168478107239327, 16.82940014047372, 16.83010619416633, 16.44919935221628, 14.624724925722711, 16.450558458803854, 17.69452729841302, 15.65016514744818, 18.679275066905497, 13.447115187074612, 16.398720477686403, 17.32926134201897, 17.27866341984103, 15.555336610466695, 14.690721518600986, 16.41630848506638, 14.465796212810801, 17.63808905914515, 14.38605253319364, 14.265268530060908, 17.53393338453062, 14.446148833603555, 16.244807183056146, 16.36885084060647, 16.989054394916714, 15.340933135824256, 16.543854075985248, 18.814579518711717, 16.736198877020545, 15.57391383323158, 
14.016145671100473, 17.1303865295953, 15.381909793806965, 16.750653911144585, 16.593414950445176, 14.180767375404585, 14.50620430729082, 18.080296282677548, 16.597407678285084, 14.21509211315657, 13.673886243358675, 16.657453219742745, 16.02734963942873, 16.194703030051507, 14.260211101778232, 16.26763697146533, 14.179468923578979, 16.12821982808926, 16.64099905704884, 16.94172055011879, 16.66384491954264, 15.432739107434942, 15.181517096354881, 17.248096600692133, 16.37554066983149, 17.639966743503336, 17.548426646376274, 18.43403125089425, 14.174248761811159, 17.042092858205873, 16.982147675825313, 14.743926173030165, 15.777483327421374, 16.52670297149753, 14.917847271034127, 13.435206068072372, 18.0063496166239, 16.5141542386674, 16.377365724975235, 17.024146003050994, 14.695493511363079, 16.53133545815386, 15.338353322857225, 16.05590065288647, 17.005790143489026, 16.818232530583124, 17.43675287051839, 17.14027093670495, 16.361817962978026, 14.614372250230119, 16.646985478766556, 16.33609881172558, 16.692322189193366, 15.539219827257837, 17.711267408524556, 16.2058674344233, 15.020268536258088, 16.78639268781219, 16.164493315670644]
    NN_yTestPred = [17.489508, 16.725586, 16.086483, 14.581125, 16.288195, 15.672956, 15.463974, 15.7293, 16.63516, 15.186163, 16.962688, 15.88216, 16.501038, 16.383446, 15.631968, 15.989613]
    NN_yTrain = [16.04296796507631, 14.797989464006212, 13.825788725868012, 16.292690731707868, 15.319705158311242, 15.773231935026526, 17.01638171196896, 15.478307407063106, 14.301291633724754, 16.63192411306399, 14.131376146340143, 14.398060082096437, 13.826056328837918, 13.78966747451144, 14.511782833779522, 17.9532321655568, 15.230559664926226, 14.257536989478504, 15.84589438198596, 16.479271517199138, 16.703102703993743, 15.636166034915675, 16.068908281661866, 13.955039780482844, 15.971480035110623, 14.588869266676092, 14.792512498088673, 15.25826291282955, 16.273632607267725, 14.604946177817858, 15.640221038953703, 14.423406097839026, 16.622075211177894, 15.753168354935873, 14.826632758747223, 15.082738291952, 15.938054684314146, 14.278735734929292, 15.874764002913185, 15.045635678792195, 13.885292891819482, 16.052655944895047, 15.191299124379732, 14.542374916465521, 17.06542359246281, 15.757764120708078, 15.055319662941859, 15.721632993851395, 17.555543129989946, 16.241118658707634, 16.038813045669265, 17.0935426309041, 16.29010357930149, 14.125521215924095, 15.174346337162724, 18.256981395768282, 14.83148160222744, 16.228076483014775, 14.570009346824241, 14.62971315340539, 17.200349214044028, 15.094240305679238, 19.693780562157187, 16.037490526322646, 13.381747867705448, 13.77227143309252, 14.327733615720419, 14.858024347644777, 16.036849249018605, 15.065543839113516, 14.708858026538326, 13.283011996317192, 17.374962646572726, 15.815160923819084, 16.27191687068542, 17.574367269382392, 16.57332613075548, 14.501233341532718, 13.278349898643002, 15.779404884468088, 15.027981858904564, 15.586838502811798, 15.486086207581337, 13.955020241450931, 15.085529570368166, 15.943804994529962, 14.652745735976778, 16.45937080293184, 14.48846580986218, 17.48422272730813, 14.172626242384753, 17.415252554209058, 14.475785342688617, 15.740127255506437, 16.67883629338479, 14.424592119986055, 17.687941825830517, 16.618155129620664, 14.958787934025917, 15.756147875195653, 
15.6160575972938, 15.17562841839745, 16.56348572999676, 16.31737310816609, 17.578966630071555, 14.604661967639384, 15.287233510167582, 18.12361016514816, 15.680774770141808, 15.062651237223632, 14.316181262820958, 15.175613493893595, 14.884476211395166, 15.727838465007856, 15.203138219697017, 16.21234462591355, 14.820978185654726, 14.1989619308112, 15.255439068941143, 14.765423296821051, 16.246846260031234, 15.431710717376705, 16.064852926547047, 14.996990190477868, 16.17546647853207, 14.981564949174166, 15.864830530695626, 15.186437187375118, 16.16308135155581, 14.35896059079867, 16.308048533248808, 17.126238311775722, 14.513317528651207, 13.452087531982269, 16.815185189484904, 15.890707444647466, 17.560864701443027, 14.937133455528683, 14.594879901199187, 16.146440501590778, 17.076460392370503, 18.78390041837801, 15.885352650320188, 14.704170257402168, 15.800504039700952, 15.495660771571647, 16.917859257962284, 15.82626328679579, 15.94851332153808, 14.253622744569332, 15.017156546875318, 16.195483786361518, 16.013138329598874, 16.621220657938096, 15.083289417275509, 16.620510868037858, 14.75452700145815, 16.269103390079394, 14.568773505907135, 16.994795897874344, 19.07281922299677, 16.180670643838866, 18.895631580841602, 16.26029493028836, 15.535739641945712, 17.553832224927596, 17.152940868908733, 15.868374018380848, 16.673108672071262, 14.787173954482242, 14.897416231530567, 17.036393045108177, 15.844791976302691, 14.060894465281246, 15.627285670558376, 16.356512354778694, 16.747144399468308, 14.71836173579452, 15.653840185741375, 13.862282112175848, 14.94365624044603, 16.556784346849636, 15.128568474298353, 15.574471068866579, 17.422287876435142, 18.720748051603618, 16.361702387611274, 14.540091822185786, 15.584906006681935, 15.64267099707467, 18.76981995049567, 15.612582238127185, 14.668914084783196, 17.019788497745857, 15.283053559932675, 16.21361987742765, 15.348621759242247, 15.08115322720438, 15.137470124507836, 16.516300441827024, 15.527953813674403, 
14.345100737620443, 15.686196664143457, 14.302813991945865, 14.044018316700434, 16.48895858865718, 13.59161719380332, 16.62438010571819, 14.222503405127895, 15.546031233228545, 16.354937319723923, 19.725993532175092, 17.032063555243088, 19.399052816035685, 16.811920568260017, 14.872322334929965, 15.663807851854035, 13.501772117441396, 18.496310372604324, 14.826661021446743, 16.103499605069885, 14.708344283815265, 15.66701773309946, 15.86172706262227, 15.751767808471119, 17.172483586649303, 16.01997142904198, 16.37886017972945, 15.275267947907352, 14.161282217533653, 12.964022883611921, 14.361847038462264, 17.407336006108757, 16.43488708298186, 14.57917503217969, 15.270858841486547, 15.185499940706208, 15.7515547039472, 13.99692983363709, 16.043585181428817, 19.56935307363436, 16.452731012385158, 16.32151791930264, 14.521151498076293, 15.599044610952566, 15.231036103114926, 13.046081043305994, 14.176090418869455, 13.984017430909983, 16.784162120472693, 18.330967375402935, 15.053503391014889, 15.56200252613594, 14.948695947523856, 17.535442619877887, 14.587763282403674, 14.457273887503453, 13.808963845811654, 16.80576730772114, 15.66684194920294, 15.431682634566576, 14.936044152938342, 15.214194228978954, 14.904777652482508, 15.733162851564778, 14.614378882591948, 14.793115898915927, 14.381449493301352, 15.835170804176208, 16.40549804790385, 16.057312339977234, 16.005138915985953, 15.08433819566353, 15.157250787134213, 15.556033439047663, 14.852122681613752, 13.777264431721544, 14.281026995599705, 17.160147080472083, 15.409048751778899, 16.098863058150055, 17.370088766522816, 16.686389114995663, 16.827314110711203, 16.7895866186575, 17.054322215372583, 17.80990109508749, 15.21176979468209, 14.845252802699573, 14.925413840690412, 16.718430771066966, 17.161850203267807, 17.718859087515717, 14.596059856059261, 16.740904030799786, 14.548610574483897, 16.041062913459292, 18.721380204765115, 14.515803588174451, 17.050232109376957, 14.444418963428966, 14.14237745298709, 
13.476003784321424, 16.474732714809942, 12.95694495371676, 16.66829914854487, 16.233373282979287, 15.722474771128562, 14.607475774574942, 15.458013668533475, 15.84356768629229, 13.90394657043056, 16.150774908529918, 17.076119527278745, 15.169199716892793, 16.42914950594655, 16.31400022374709, 14.255000639451767, 16.575371983216733, 15.362744834619987, 17.171606825649413, 12.514723170673143, 15.036612030959372, 16.08291656921453, 17.277504030794624, 13.425550695869237, 17.406751618346078, 15.506071435672418, 13.660649822030258, 15.194897478927365, 16.33038085739223, 13.810472653562545, 14.86380795382715, 17.13700495690826, 16.590231203573595, 13.736226086048752, 16.647075947084737, 16.813093682801878, 16.315678590718225, 14.749457636467534, 15.228405864644966, 13.730813542012285, 15.71980936162017, 15.842982638942843, 16.315282941174015, 14.197351302087293, 15.343563163609112, 17.074759865819416, 16.062297851655345, 15.186618824796648, 16.033148695919653, 16.942973994480806, 14.292356862422476, 15.12071165448362, 12.75734694520874, 15.931876888533212, 15.002872219230904, 15.196974046818312, 13.700706010218827, 15.06140976911117, 14.946283202545173, 15.24262858167617, 15.810411732485367, 16.591368631593998, 16.112331148052192, 14.110441001271816, 15.325965532518602, 15.019514412404574, 16.24336239780054, 15.857398986166736, 16.890865684493928, 14.80885408957603, 15.966108588308577, 15.540909705097858, 14.674986197011114, 15.59373784646181, 15.840958765622037, 14.852804645805307, 16.04426325505863, 13.399015441441078, 15.924494864337648, 16.196627710685092, 17.545584528899976, 17.244972487188203, 16.40601786812049, 14.239817479914384, 14.872370595322534, 13.443886496812874, 16.54828568545381, 17.020377489236125, 15.691560749113233, 15.376124953139223, 16.111739835584544, 14.609042910947334, 13.968006767874392, 15.480327468437306, 16.645475287841947, 16.970223264847483, 16.554251026226783, 14.7150274371201, 14.728757137359654, 17.72286522202069, 19.993969736665598, 
14.770408122142452, 15.143007803959227, 17.132126990051404, 14.517269817037, 15.613132573628995, 14.839538712894997, 13.926737082068556, 15.956154672188653, 14.692280670783294, 16.337236286732438, 14.974945543247186, 14.292938586235094, 14.226759156309397, 14.732003684774313, 15.100419320302436, 17.970054767879144, 14.88494623113498, 17.51150254105305, 17.399888904933963, 14.886472793989466, 14.728853022692334, 15.719683703347622, 14.992605428036825, 12.954785068328555, 16.1094397218599, 17.095041046834194, 15.827170640746365, 15.392330931995424, 14.799559842691172, 17.50924375139007, 16.978354880878122, 14.829221755121985, 14.06494075533201, 16.33656039144905, 14.375726772768893, 14.931671589725788, 13.906050854983215, 15.203845409060595, 14.905743943465021, 15.814247947173776, 14.7090323983936, 15.385273893078251, 15.181319440791121, 16.352956769384893, 13.87962546947242, 15.311474614355573, 14.762211438115344, 15.957153953549954, 15.654708549166799, 15.728190792944472, 14.621234689729311, 15.879518928248586, 18.76300994543742, 14.4962630173607, 15.981145979275638, 17.32974030425735, 15.379601288124613, 15.97586220971258, 14.804880888184405, 15.430791020033835, 14.477576694539664, 13.357308349664523, 13.578343499140292, 15.812709048790726, 15.624572615871674, 17.203998905727957, 16.15158066763902, 17.510419609255568, 17.250458033856212, 15.28408191062794, 14.857770720862575, 15.427108809139948, 15.634958612276215, 14.308138393605176, 15.258980715456985, 16.596430924991417, 14.720117992534238, 14.041809697571798, 13.934190884452208, 16.246312945716205, 16.461693325569033, 14.967981322821055, 16.488114946405542, 16.060771925786455, 16.576142717375138, 14.986046344174769, 16.291684288309252, 16.26589204048685, 15.152521155481875, 14.931654001004292, 14.329959419984064, 14.311707379151832, 15.950873134236204, 17.394691149547274, 15.1110270369933, 19.292992728962783, 16.644478591098007, 14.823451947079572, 15.098048948303084, 15.56202297628825, 14.153101036535231, 
14.46224805588875, 15.16026799051127, 15.134569622163477, 15.887259219805072, 15.10183405134772, 13.817813720500535, 19.785068140911754, 14.625849863670549, 15.89030001512909, 14.36410988452941, 16.183843479021164, 16.295432437644667, 15.21840663813749, 16.488541492689777, 17.3011659292718, 13.774719448510538, 17.045220449684585, 16.556635244676652, 17.685976332868954, 17.53198588788242, 18.484268874734, 15.756968944876894, 14.183316194418998, 16.5666712311908, 15.118630975627187, 13.967193456093419, 14.84193206179571, 14.378812586249575, 15.185081137402896, 17.477739859198316, 14.329741989033959, 15.822293131840832, 12.106162564564016, 14.386681260180213, 14.975264354799428, 13.37864085917884, 15.128866143204373, 16.631889890584933, 15.595576012556855, 14.39694345108398, 16.345838872341993, 14.457329014167508, 15.26650521589831, 15.462378512192505, 15.075995651759282, 16.193520098194234, 17.039182624065383, 15.416863568464953, 15.233866285582126, 16.232021509562106, 14.05493252422988, 15.027423003215603, 16.417727957561834, 14.825365855015432, 18.429659176458205, 15.969973345722915, 15.096746768609925, 14.251242197834939, 13.897098209157551, 14.714637106626087, 14.002477438079085, 15.174451546379341, 15.318541943967794, 15.792812643705636, 16.039754902335623, 15.750431610354228, 16.638061762883222, 14.64994131817154, 15.353395643961868, 17.911422096833615, 14.500278273361815, 15.736462124365037, 17.408459200823597, 15.057802209940368, 13.835187674077487, 16.377040066954564, 15.119656286543988, 15.491492689824971, 14.471828492967152, 14.665839240627195, 16.735586656335208, 16.41530225848595, 13.912076067382918, 13.903736220381523, 16.07942435113168, 16.953879814545616, 14.807290278304556, 17.101042588250426, 16.413060861824864, 17.796289202724786, 14.460117091026477, 15.426133697011178, 16.649717917461544, 15.707678853956104, 13.093264903128226, 17.18110231605795, 15.803395798148058, 15.125879050927649, 15.581994910633062, 13.784851658789528, 16.459669129445533, 
15.56409812451676, 13.655744084839439, 14.816619662786767, 14.345278958741922, 14.648577228988973, 14.981828468479023, 18.85688277649595, 14.718652344589088, 16.05874973329996, 14.05037943802057, 16.19524299448189, 13.031849219701796, 16.043851796158894, 17.246500835192556, 15.022565534811205, 14.75652768025172, 16.780650501460297, 16.122869227525854, 14.925837896045664, 14.923779703548277, 16.549662818363505, 15.783231191244106, 15.632091491440029, 15.451088847882282, 17.055295625318994, 14.62420294329175, 15.418883955798918, 15.136766915136779, 14.221726099562689, 15.363675733682582, 14.879887001228896, 14.765177755115038, 16.078047727905904, 13.959123127447771, 15.591815397110125]
    NN_yTrainPred = [18.845308, 18.065077, 17.1111, 16.25874, 15.240286, 16.876133, 15.775425, 16.511442, 16.624157, 15.1688175, 15.877937, 15.821308, 16.274254, 16.714586, 14.447393, 16.757126, 14.425144, 16.880219, 17.461481, 15.736553, 15.508495, 15.109901, 14.750093, 14.755886, 14.291061, 15.919634, 16.23969, 14.946749, 15.5934105, 15.421957, 14.597017, 14.869045, 15.435856, 16.942339, 16.12819, 14.434273, 14.65575, 14.791714, 14.162077, 15.111671, 14.53401, 16.035194, 16.153631, 14.973793, 14.280375, 13.909029, 15.984831, 14.970785, 16.51126, 17.17737, 16.009817, 15.179459, 15.966677, 16.01131, 14.962791, 15.101761, 14.338171, 14.341158, 14.48621, 15.494148, 15.433002, 15.348953]
    
    
    totalReport = NN_yTrain + NN_yTest
    totalPredict = NN_yTrainPred + NN_yTestPred

    mse_NN = mean_squared_error(totalReport,totalPredict)

    r2_NN = r2_score(totalReport,totalPredict)
    

    fig, axes =  plt.subplots(2, 4, figsize=(25,10))


    # Plot Ridge Regression scatterplots
    axes[0][0].scatter(RR_y_train, RR_train_pred, c='#F8A802', label='Train')
    axes[0][0].scatter(RR_y_test, RR_test_pred, c='#010101', label='Test')
    axes[0][0].title.set_text('Ridge Regression')
    axes[0][0].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[0][0].text(0.7, 0.18, f"MSE: {mse_RR:.3f}", transform=axes[0][0].transAxes)
    axes[0][0].text(0.7, 0.1, f"$R^2$: {r2_RR:.3f}", transform=axes[0][0].transAxes)
    axes[0][0].legend()

    #Plot Bayesian Ridge Regression scatterplots
    axes[0][1].errorbar(BR_y_train, BR_train_pred, yerr = BR_trainSD, fmt="o", c='#EF1006', label='Train')
    axes[0][1].errorbar(BR_y_test, BR_test_pred, yerr = BR_testSD, fmt="o", c='#010101', label='Test')
    axes[0][1].title.set_text('Bayesian Ridge Regression')
    axes[0][1].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[0][1].text(0.7, 0.18, f"MSE: {mse_BR:.3f}", transform=axes[0][1].transAxes)
    axes[0][1].text(0.7, 0.1, f"$R^2$: {r2_BR:.3f}", transform=axes[0][1].transAxes)
    axes[0][1].legend()

    #Plot Support Vector Regression
    axes[0][2].scatter(SVR_y_train, SVR_train_pred, c='#FB05AF', label='Train')
    axes[0][2].scatter(SVR_y_test, SVR_test_pred, c='#010101', label='Test')
    axes[0][2].title.set_text('Support Vector Regression')
    axes[0][2].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[0][2].text(0.7, 0.18, f"MSE: {mse_SVR:.3f}", transform=axes[0][2].transAxes)
    axes[0][2].text(0.7, 0.1, f"$R^2$: {r2_SVR:.3f}", transform=axes[0][2].transAxes)
    axes[0][2].legend()

    #Plot K-Nearest Neighbors Regression
    axes[0][3].scatter(KNN_y_train, KNN_train_pred, c='#4305FB', label='Train')
    axes[0][3].scatter(KNN_y_test, KNN_test_pred, c='#010101', label='Test')
    axes[0][3].title.set_text('K-Nearest Neighbors Regression')
    axes[0][3].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[0][3].text(0.7, 0.18, f"MSE: {mse_KNN:.3f}", transform=axes[0][3].transAxes)
    axes[0][3].text(0.7, 0.1, f"$R^2$: {r2_KNN:.3f}", transform=axes[0][3].transAxes)
    axes[0][3].legend()

    #Plot Random Forest Regression 
    axes[1][0].scatter(RFR_y_train, RFR_train_pred, c='#11A7D3', label='Train')
    axes[1][0].scatter(RFR_y_test, RFR_test_pred, c='#010101', label='Test')
    axes[1][0].title.set_text('Random Forest Regression')
    axes[1][0].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[1][0].text(0.7, 0.18, f"MSE: {mse_RFR:.3f}", transform=axes[1][0].transAxes)
    axes[1][0].text(0.7, 0.1, f"$R^2$: {r2_RFR:.3f}", transform=axes[1][0].transAxes)
    axes[1][0].legend()

    # Plot Gradient Boosting Regression
    axes[1][1].scatter(GBR_y_train, GBR_train_pred, c='#22CA39', label='Train')
    axes[1][1].scatter(GBR_y_test, GBR_test_pred, c='#010101', label='Test')
    axes[1][1].title.set_text('Gradient Boosting Regression')
    axes[1][1].title.set_text('Gradient Boosting Regression')
    axes[1][1].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[1][1].text(0.7, 0.18, f"MSE: {mse_GBR:.3f}", transform=axes[1][1].transAxes)
    axes[1][1].text(0.7, 0.1, f"$R^2$: {r2_GBR:.3f}", transform=axes[1][1].transAxes)
    axes[1][1].legend()

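    # Plot Gaussian Process Regression
    # NOTE(review): `trainSD` / `testSD` used as yerr below are not assigned in this
    # function; the values unpacked from runPredictionsSD(GPR_model) above are named
    # `GPR_trainSD` / `GPR_testSD` -- confirm and rename, otherwise this relies on
    # leftover notebook globals.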
    axes[1][2].errorbar(GPR_y_train, GPR_train_pred, fmt="o", yerr = trainSD, c='#1C6970', label='Train')
    axes[1][2].errorbar(GPR_y_test, GPR_test_pred, fmt="o",yerr = testSD, c='#010101', label='Test')
    axes[1][2].title.set_text('Gaussian Process Regression')
    axes[1][2].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[1][2].text(0.7, 0.18, f"MSE: {mse_GPR:.3f}", transform=axes[1][2].transAxes)
    axes[1][2].text(0.7, 0.1, f"$R^2$: {r2_GPR:.3f}", transform=axes[1][2].transAxes)
    axes[1][2].legend()


    # Plot Neural Network
    axes[1][3].scatter(NN_yTrain, NN_yTrainPred, c='#22CA39', label='Train')
    axes[1][3].scatter(NN_yTest, NN_yTestPred, c='#010101', label='Test')
    axes[1][3].title.set_text('Neural Network')
    axes[1][3].plot(np.arange(12,21), np.arange(12, 21), ls="--", alpha=0.4, color="k")
    axes[1][3].text(0.7, 0.18, f"MSE: {mse_NN:.3f}", transform=axes[1][3].transAxes)
    axes[1][3].text(0.7, 0.1, f"$R^2$: {r2_NN:.3f}", transform=axes[1][3].transAxes)
    axes[1][3].legend()

    fig.text(0.5, 0.04, 'Observed column density ($\log_{10}$ cm$^{-2}$)', ha='center')
    fig.text(0.07, 0.5, 'Predicted column density ($\log_{10}$ cm$^{-2}$)', va='center', rotation='vertical')
