In [37]:
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

# model
from sklearn.cross_decomposition import PLSRegression

# math
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from math import sqrt, isnan
from sklearn.metrics import r2_score

# Root folder for the project: the composition, LOQ, spectra and uncertainty
# tables, the pickled models, and every output file are addressed as `fp + ...`
# (the outlier-limit table is the exception and is read from a separate drive).
# The trailing backslash is required because paths are built by concatenation.
fp = "G:\\My Drive\\Darby Work\\Ytsma and Dyar 2021 (LOD paper)\\"

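Settings used here: 0-750 model, ChemLIBS instrument, Mars atmosphere, IQR outlier limits, Braga sensitivity method. The 250-1000 model is also scored below, but the LOQs and the saved figures come from the 0-750 model.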
#### Compositions

In [20]:
# Sample compositions. Test samples are the rows whose 'Random Number'
# is greater than 750; that helper column is dropped once it has been used.
comps = (
    pd.read_csv(fp+'tables\\TableS1_MHC_sample_compositions.csv')
    .loc[lambda table: table['Random Number'] > 750]
    .drop(columns='Random Number')
    .reset_index(drop=True)
)

#### LOQs

In [25]:
# Training results: keep only the rows for the configuration tested in this
# notebook, then build an element -> LOQ look-up from them.
loq_table = pd.read_csv(fp+'train_results.csv')
loq_table = loq_table[
    loq_table['num_range'].eq('0-750')
    & loq_table['normalization'].eq('norm3')
    & loq_table['outlier_defn'].eq('iqr_q3')
    & loq_table['atmosphere'].eq('Mars')
    & loq_table['instrument'].eq('CL')
    & loq_table['sens_method'].eq('braga')
]
loqs = dict(zip(loq_table['element'], loq_table['loq']))

#### IQR outlier limits

In [19]:
# Element -> IQR (Q3) outlier limit. A NaN limit is handled downstream as
# "no limit for this element".
limits_table = pd.read_csv('Z:\\Millennium Set\\NEW_OUTLIER_LIMITS.csv')
outlier_limits = {
    elem: limit
    for elem, limit in zip(limits_table['element'],
                           limits_table['iqr_q3_outlier_limit'])
}

#### Spectra

In [6]:
# Spectra table; the test loop below picks its columns by sample name.
spectra_path = fp + 'datasets\\CL_Mars_norm3_spectra_matched.csv'
spectra = pd.read_csv(spectra_path)

### Test models

In [33]:
# Element -> uncertainty of the 'Actual' (reference) composition values
unc_path = fp + "BureauVeritasuncertainties_forpaper.csv"
uncert_table = pd.read_csv(unc_path)
uncert = uncert_table.set_index('element')['uncertainty'].to_dict()

In [49]:
def _score_predictions(pred_true):
    """Summarise a predicted-vs-true table.

    Parameters
    ----------
    pred_true : pandas.DataFrame
        Frame with (at least) the columns 'actual' and 'pred'.

    Returns
    -------
    tuple
        ``(n, rmsep, adj_r2)``: the number of rows, the root-mean-squared
        error of prediction, and the adjusted R2.  ``rmsep`` is NaN when the
        frame is empty; ``adj_r2`` is NaN whenever the adjusted-R2 denominator
        is not positive (too few rows), instead of raising or returning a
        meaningless number.
    """
    n = len(pred_true)
    if n == 0:
        # sklearn's metrics raise on empty input; record "no data" instead.
        return n, np.nan, np.nan

    rmsep = sqrt(mean_squared_error(pred_true.actual, pred_true.pred))

    # NOTE(review): the "number of predictors" is taken from the width of the
    # frame (columns - 1), exactly as this notebook has always done.  That is
    # probably not the intended predictor count -- confirm before reusing.
    n_predictors = pred_true.shape[1] - 1
    dof = n - n_predictors - 1
    if dof <= 0:
        return n, rmsep, np.nan

    r2 = r2_score(pred_true.actual, pred_true.pred)
    adj_r2 = 1 - (1 - r2) * (n - 1) / dof
    return n, rmsep, adj_r2


# One entry per (model range, element); turned into the results table at the end.
n_all_list = []
n_loq_list = []
rmsep_all_list = []
rmsep_loq_list = []
r2_all_list = []    # holds the *adjusted* R2 for all predictions
r2_loq_list = []    # holds the *adjusted* R2 for predictions >= LOQ
elem_list = []
loq_list = []
n_range_list = []

# Plot styling is identical for every figure, so it is defined once here.
ddash = (0, (10, 2))
c_p = "black"
c_g = "darkgrey"    # not used by this cell
c_e = "#bfbfbf"
linew = 0.5
line_c = 'white'
lwidth = 1.5
cap = 2.5
xsize = 20
klen = 5
tsize = 16

for n_range in ['0-750', '250-1000']:
    for elem_col in comps.columns[1:]:

        # column headers look like "<element> <units>"; the first token is the key
        element = elem_col.split()[0]
        out_lim = outlier_limits[element]

        # get model
        # NOTE: pickle.load can execute arbitrary code from the file -- only
        # load model files that you created or otherwise trust.
        model_path = fp+'python_models\\norm3\\CL_Mars\\'+n_range+'\\iqr_q3\\'+element+'_model.asc'
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)

        # get test samples: everything with a value when there is no outlier
        # limit, otherwise only samples at or below the limit (NaNs drop out
        # of the comparison automatically)
        if isnan(out_lim):
            test_comps = comps[~comps[elem_col].isna()].reset_index(drop=True)[['Sample Name', elem_col]]
        else:
            test_comps = comps[comps[elem_col] <= out_lim].reset_index(drop=True)[['Sample Name', elem_col]]

        test_names = test_comps['Sample Name']
        y_test = test_comps[elem_col]
        # spectra are stored one sample per column; the model wants one per row
        X_test = spectra[test_names].to_numpy().T

        # run predictions
        test_pred = model.predict(X_test)

        test_pred_true = pd.DataFrame({
            'sample' : test_names,
            'actual' : y_test,
            'pred' : test_pred.flatten().tolist()
        })

        if len(element) > 2:
            # remove any above 100 wt%
            test_pred_true = test_pred_true[(test_pred_true.pred < 100)].copy(deep=True)
        # remove below zero
        test_pred_true = test_pred_true[(test_pred_true.pred >= 0)].copy(deep=True)

        # Metrics for all remaining predictions
        n_all, rmsep_all, adj_r2_all = _score_predictions(test_pred_true)

        # Metrics for predictions at or above the LOQ only
        loq = loqs[element]
        above_loq = test_pred_true[test_pred_true.pred >= loq].reset_index(drop=True)
        n_loq, rmsep_loq, adj_r2_loq = _score_predictions(above_loq)

        elem_list.append(element)
        n_range_list.append(n_range)
        loq_list.append(loq)
        n_all_list.append(n_all)
        n_loq_list.append(n_loq)
        rmsep_all_list.append(rmsep_all)
        rmsep_loq_list.append(rmsep_loq)
        r2_all_list.append(adj_r2_all)
        r2_loq_list.append(adj_r2_loq)

        if n_range == '0-750':
            #-----------MAKE PLOT-----------#
            xlab = "True "+elem_col
            ylab = "Predicted "+elem_col
            xdata = test_pred_true.actual
            ydata = test_pred_true.pred
            xerr = uncert[element]

            fig, ax = plt.subplots(1, figsize = (6,6))
            # plot data
            ax.scatter(xdata, ydata, c=c_p, linewidth = linew, edgecolors = line_c, zorder=2)
            # error bars
            ax.errorbar(x=xdata, y=ydata, c=c_e, xerr=xerr, fmt = '.', capsize=cap, zorder=1)

            # 1:1 line
            y_bot, y_top = ax.get_ylim() # get current auto-axes
            x_bot, x_top = ax.get_xlim()
            # make sure LOQ not below edge
            top = max(y_top, x_top, loq+(0.1*loq))
            bot = min(y_bot, x_bot, 0)
            # pin the axes to the data-driven limits so the reference lines
            # drawn next do not rescale the view
            ax.set_xlim(x_bot, x_top)
            ax.set_ylim(y_bot, y_top)
            ax.plot([bot,top], [bot,top], c=c_p, linewidth=lwidth, linestyle = ddash, label = "1:1")

            # plot lines
            ax.axhline(y=0, xmin=0, xmax=1, c=c_p, linewidth=1)
            ax.axhline(y=loq, xmin = 0, xmax = 1, c='darkred', linewidth=lwidth, label = "LOQ")

            # format
            ax.set_xlabel(xlab, fontsize = xsize)
            ax.set_ylabel(ylab, fontsize = xsize)
            ax.tick_params(length=klen, labelsize=tsize)

            fig.tight_layout()
            fig.savefig(fp + "figures\\test pred true\\"+element+"_pred_true.eps", dpi=600)
            # close each figure so a long element list does not accumulate open figures
            plt.close(fig)

results = pd.DataFrame({
    'element':elem_list,
    'n_range':n_range_list,
    'loq':loq_list,
    'n_all':n_all_list,
    'n_loq':n_loq_list,
    'rmsep_all':rmsep_all_list,
    'rmsep_loq':rmsep_loq_list,
    'adj_r2_all':r2_all_list,
    'adj_r2_loq':r2_loq_list
})

results.to_csv(fp+'test_results.csv', index=False)

In [71]:
# Average the two model ranges per element and attach the standard deviations.
# 'n_range' is a string label, not a measurement: it is dropped before
# aggregating because recent pandas versions raise a TypeError when asked for
# the mean/std of a non-numeric column instead of silently skipping it.
by_element = results.drop(columns='n_range').groupby(['element'], as_index=False)
avg = by_element.mean()

# 'loq' is already reported in `avg`, so it is left out of the spread table;
# every remaining metric column gets an '_sd' suffix.
std = by_element.std().drop(columns='loq')
std = std.rename(columns={col: col + '_sd' for col in std.columns if col != 'element'})

avg_results = pd.merge(avg, std, on='element')
avg_results.to_csv(fp+'average_test_results.csv', index=False)