In [3]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import r2_score

In [6]:
# generate comps
# Builds `comps`: one row per sample, a string 'Sample' column plus numeric columns
# for everything else, combined from the "LANL" and "MHC" sheets of one workbook.
comps_path = "C:\\Users\\ytsma22c\\Google Drive\\Darby Work\\LOD paper\\tables\\TableS1_sample_compositions.xlsx"
lanl_comps = pd.read_excel(comps_path, sheet_name = "LANL")
mhc_comps = pd.read_excel(comps_path, sheet_name = "MHC")
# no `on=` is given, so the outer merge joins on every column name the two sheets share
comps = pd.merge(mhc_comps, lanl_comps, how = "outer") # merge comps
# keep only the first whitespace-delimited token of each header
comps.columns = comps.columns.map(lambda x: x.split()[0])
# de-duplication happens after the header truncation above, so it looks up the shortened 'Sample' name
comps = comps.drop_duplicates(subset = 'Sample') # remove duplicates
comps['Sample'] = comps['Sample'].astype(str)
comps = comps.sort_values(by='Sample')
# NaN cells become empty strings here and are then passed through pd.to_numeric below
# NOTE(review): presumably to_numeric turns the blanks back into NaN - confirm this round trip is needed
comps = comps.replace(np.nan, "", regex=True)
cols = comps.columns.drop('Sample')
comps[cols] = comps[cols].apply(pd.to_numeric) # make columns numeric

# make dictionary of spectrum names to sample names
# `mhc_key` maps each value of the CSV's 'pkey' column to the 'Sample' value on the same row;
# the name is first bound to the raw DataFrame and then rebound to the resulting dict
key_path = "C:\\Users\\ytsma22c\\Google Drive\\Darby Work\\LOD paper\\ChemLIBS_spectrum_no_to_name.csv"
mhc_key = pd.read_csv(key_path)
mhc_key = pd.Series(mhc_key.Sample.values, index=mhc_key.pkey).to_dict()

In [4]:
# THIS CAN BE TURNED INTO ITS OWN FUNCTION
# Sensitivity values per environment, nested as
#   instrument -> atmosphere -> {'braga': value, 'metals': value}
sensitivity = {
    'LANL': {
        'Mars': {'braga': 0.000245701982436336, 'metals': 0.0000919578459982072},
    },
    'ChemLIBS': {
        'Mars': {'braga': 0.00028197645372017, 'metals': 0.000117639483634021},
        'Earth': {'braga': 0.000444867993101603, 'metals': 0.0000931716553399781},
        'Vacuum': {'braga': 0.000417553099169204, 'metals': 0.000231951753003813},
    },
}

In [5]:
# function that calculates LBDQs from model and instrument sensitivity
def get_lbdq(folder, file_list, braga_sens, metals_sens):
    """Compute LOB / LOD / LOQ values per element from model-coefficient files.

    Every name in `file_list` containing "coeff" is read as a CSV (first line
    skipped). The frames are stacked and transposed; the first row of the
    transposed table supplies the element labels and the remaining rows are
    the coefficients. For each element the Euclidean norm of its coefficient
    column is scaled by a fixed factor and by each sensitivity value.

    Parameters
    ----------
    folder : str
        Directory containing the files; a trailing separator is optional.
    file_list : iterable of str
        File names inside `folder`; names without "coeff" are ignored.
    braga_sens, metals_sens : float
        Sensitivity values used for the ``*_Braga`` and ``*_metals`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per element with columns ``element``, ``vector``,
        ``LOB_Braga``, ``LOB_metals``, ``LOD_Braga``, ``LOD_metals``,
        ``LOQ_Braga``, ``LOQ_metals``.

    Raises
    ------
    ValueError
        If `file_list` contains no "coeff" file.
    """
    print("LBDQ:")

    # read models
    coeff_frames = []
    for file in tqdm(file_list):
        if "coeff" in file:
            # os.path.join gives the same path whether or not `folder`
            # already ends with a separator
            path = os.path.join(folder, file)
            coeff_frames.append(pd.read_csv(path, skiprows=[0]))

    if not coeff_frames:
        raise ValueError("No 'coeff' files found in " + str(folder))

    # one column per model after transposing
    coeffs = pd.concat(coeff_frames).T

    # adapt to different element naming b/w datasets:
    # "Composition: <elem>" in one, "<elem> <something>" in the other
    labels = coeffs.iloc[0]
    if labels.str.contains('Composition:').any():
        coeffs.columns = labels.map(lambda x: x.split(': ')[1])
    else:
        coeffs.columns = labels.map(lambda x: x.split()[0])

    # the label row is not data
    coeffs = coeffs.drop(coeffs.index[0])

    # calculate regression vectors: square root of sum of squares per element
    vector_list = coeffs.pow(2).sum().pow(.5)

    df = pd.DataFrame({'element': coeffs.columns,
                       'vector': vector_list
    }).reset_index(drop=True)

    # each limit is factor * sensitivity * vector
    for limit, factor in zip(['LOB', 'LOD', 'LOQ'], [1.645, 3.3, 10]):
        df[limit + "_Braga"] = factor * braga_sens * df['vector']
        df[limit + "_metals"] = factor * metals_sens * df['vector']

    # change col formats: everything numeric except the element label
    cols = df.columns.drop('element')
    df[cols] = df[cols].apply(pd.to_numeric)
    df['element'] = df['element'].astype(str)

    return df

In [40]:
# function that calculates RMSEPs
def _loq_metrics(data, loq):
    """Return (mean Actual, R2, RMSEP) over the rows of `data` whose Pred exceeds `loq`."""
    above = data[data.Pred > loq]
    avg = above['Actual'].mean()
    # r2_score raises on zero samples; report NaN like the other two metrics do
    r2 = r2_score(above.Actual, above.Pred) if len(above) > 0 else np.nan
    rmsep = (above.Actual - above.Pred).pow(2).mean() ** 0.5
    return avg, r2, rmsep


def get_rmsep(folder, file_list, comps, lbdq):
    """Compute mean concentration, R2 and RMSEP per element from test-prediction files.

    Every name in `file_list` containing "test" is read as a CSV with three
    columns (treated as pkey, Actual, Pred). Predictions are filtered, spectrum
    names are converted to sample names, the Actual column is replaced with the
    value from `comps`, and metrics are computed twice: once over predictions
    above the element's ``LOQ_Braga`` and once above its ``LOQ_metals``.

    Relies on the module-level `mhc_key` dict to rename pkeys that contain
    'Spectrum'.

    Parameters
    ----------
    folder : str
        Directory containing the files; a trailing separator is optional.
    file_list : iterable of str
        File names inside `folder`; names without "test" are ignored.
    comps : pandas.DataFrame
        Has a 'Sample' column and one column per element.
    lbdq : pandas.DataFrame
        Has 'element', 'LOQ_Braga' and 'LOQ_metals' columns.

    Returns
    -------
    pandas.DataFrame
        One row per test file with columns ``element``, ``Avg_Braga``,
        ``Avg_metals``, ``RMSEP_Braga``, ``RMSEP_metals``, ``R2_Braga``,
        ``R2_metals``. A metric is NaN when no prediction exceeds the LOQ.
    """
    print("RMSEP:")

    result_columns = ["element", "Avg_Braga", "Avg_metals", "RMSEP_Braga",
                      "RMSEP_metals", "R2_Braga", "R2_metals"]
    records = []
    missing_samples = set()

    # key the compositions by sample name so values are looked up by name,
    # not by row position
    comps_by_sample = comps.drop_duplicates(subset='Sample').set_index('Sample')

    for file in tqdm(file_list):
        if "test" not in file:
            continue

        data = pd.read_csv(os.path.join(folder, file))

        # get element from the header of the second column
        header = data.columns[1]
        if "Composition:" in header:
            element = header.split()[1]
        else:
            element = header.split()[0]

        # format columns
        data.columns = ['pkey', 'Actual', 'Pred']
        # row 0 is discarded before Pred is cast to float
        # (presumably a second header line - confirm against the files)
        data = data.drop([0])
        data['Pred'] = data['Pred'].astype(float)

        # for these three oxides keep only predictions strictly below 100
        if element in ['SiO2', 'MnO', 'Na2O']:
            data = data[data.Pred < 100]

        # keep only strictly positive predictions
        data = data[data.Pred > 0].reset_index(drop=True)

        # rename ChemLIBS Spectrum names with sample names
        if data.pkey.str.contains('Spectrum').any():
            data = data.replace({'pkey': mhc_key})

        # format LANL spectra names to sample names
        else:
            data['pkey'] = data['pkey'].map(lambda x: x.split("_")[1])
            data['pkey'] = data['pkey'].map(lambda x: str(x).upper())

        # one row per sample, ordered by sample name
        data = (data[['pkey', 'Actual', 'Pred']]
                .drop_duplicates(subset='pkey')
                .sort_values(by='pkey')
                .reset_index(drop=True))

        # subselect relevant reference values
        ref = lbdq[lbdq.element == element].reset_index(drop=True)

        # note and remove samples that don't have composition info
        has_comps = data.pkey.isin(comps_by_sample.index)
        if not has_comps.all():
            missing_samples.update(data.pkey[~has_comps])
            data = data[has_comps].reset_index(drop=True)

        # add in Actual concentrations, matched on sample name
        data['Actual'] = data['pkey'].map(comps_by_sample[element])

        # samples whose composition for this element is blank are dropped
        data = data.dropna()

        avg_braga, r2_braga, rmsep_braga = _loq_metrics(data, ref['LOQ_Braga'].iloc[0])
        avg_metal, r2_metal, rmsep_metal = _loq_metrics(data, ref['LOQ_metals'].iloc[0])

        records.append({
            "element": element,
            "Avg_Braga": avg_braga,
            "Avg_metals": avg_metal,
            "RMSEP_Braga": rmsep_braga,
            "RMSEP_metals": rmsep_metal,
            "R2_Braga": r2_braga,
            "R2_metals": r2_metal,
        })

    df = pd.DataFrame(records, columns=result_columns)

    # give list of samples without comps (sorted so the message is reproducible)
    no_comps_list = sorted(missing_samples, key=str)
    if len(no_comps_list) > 0: print("Sample(s)", str(no_comps_list), "have no composition info and were removed")

    return df

In [17]:
# function that returns values per environment
def get_results(instrument, atmosphere, n_range):
    """Build the combined LBDQ + RMSEP table for one instrument / atmosphere / range.

    Looks up the two sensitivity values for the environment in the module-level
    `sensitivity` dict, lists the model folder for that environment and range,
    runs `get_lbdq` and `get_rmsep` on it (the latter with the module-level
    `comps` table), and outer-merges the two results on 'element'. A
    'num_range' column holding `n_range` is inserted at position 2.
    """
    print('Calculating for', instrument, atmosphere, n_range)

    env_sens = sensitivity[instrument][atmosphere]
    braga_sens, metals_sens = env_sens['braga'], env_sens['metals']

    # the trailing separator matters: the helpers receive this as a path prefix
    folder = (
        "C:\\Users\\ytsma22c\\Google Drive\\Darby Work\\LOD paper\\"
        + instrument
        + " calculations\\models\\"
        + atmosphere
        + "\\"
        + n_range
        + "\\"
    )
    model_files = os.listdir(folder)

    # limits first, because the RMSEP step filters on the LOQ columns
    lbdq_table = get_lbdq(folder, model_files, braga_sens, metals_sens)
    rmsep_table = get_rmsep(folder, model_files, comps, lbdq_table)

    combined = pd.merge(lbdq_table, rmsep_table, how='outer', on='element')
    combined.insert(loc=2, column='num_range', value=n_range)
    return combined

In [46]:
# Run every environment over both model ranges and build two tables:
#   full_detail_results - one row per element per range per environment
#   full_avg_results    - per-element mean and standard deviation across the ranges
envs = [['LANL', 'Mars'],['ChemLIBS', 'Mars'],['ChemLIBS', 'Earth'],['ChemLIBS', 'Vacuum']]
n_ranges = ['0-750', '250-1000']

avg_tables = []
detail_tables = []

for env in envs:
    instrument, atmosphere = env

    # calculate results per model
    per_range = [get_results(instrument, atmosphere, n_range) for n_range in n_ranges]

    # get aggregate results
    detail_results = pd.concat(per_range, ignore_index=True).drop(columns = 'vector')

    # Aggregate THIS environment's detail table. 'num_range' is a string label,
    # so it is excluded before taking mean / std (non-numeric columns make
    # groupby reductions raise in recent pandas).
    by_element = detail_results.drop(columns='num_range').groupby('element')
    avg = by_element.mean().reset_index()
    stdev = by_element.std().add_suffix('_sd').reset_index()
    avg_results = pd.merge(avg, stdev, how='outer', on='element')

    # add environment information
    detail_results.insert(loc=1, column='instrument', value=instrument)
    detail_results.insert(loc=2, column='atmosphere', value=atmosphere)
    avg_results.insert(loc=1, column='instrument', value=instrument)
    avg_results.insert(loc=2, column='atmosphere', value=atmosphere)

    avg_tables.append(avg_results)
    detail_tables.append(detail_results)

# update full tables: concatenate once, after all environments are done
full_avg_results = pd.concat(avg_tables, ignore_index=True)
full_detail_results = pd.concat(detail_tables, ignore_index=True)

Calculating for LANL Mars 0-750
LBDQ:


  0%|          | 0/37 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/37 [00:00<?, ?it/s]

Calculating for LANL Mars 250-1000
LBDQ:


  0%|          | 0/37 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/37 [00:00<?, ?it/s]

Calculating for ChemLIBS Mars 0-750
LBDQ:




  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Calculating for ChemLIBS Mars 250-1000
LBDQ:


  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Calculating for ChemLIBS Earth 0-750
LBDQ:


  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Sample(s) ['M6', 'TR2660A', '33D3', 'TT1758D11', 'MIX660', 'EP3156', 'T12CT212A'] have no composition info and were removed
Calculating for ChemLIBS Earth 250-1000
LBDQ:


  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Sample(s) ['M9A4', 'MIX383', '6D23', 'T12CT1302'] have no composition info and were removed
Calculating for ChemLIBS Vacuum 0-750
LBDQ:


  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Sample(s) ['M6', 'TR2660A', '33D3', 'TT1758D11', 'MIX660', 'EP3156', 'T12CT212A'] have no composition info and were removed
Calculating for ChemLIBS Vacuum 250-1000
LBDQ:


  0%|          | 0/28 [00:00<?, ?it/s]

RMSEP:


  0%|          | 0/28 [00:00<?, ?it/s]

Sample(s) ['M9A4', 'MIX383', '6D23', 'T12CT1302'] have no composition info and were removed


In [47]:
# export
# Write the averaged and the detailed result tables to CSV, without the row index.
full_path = "C:\\Users\\ytsma22c\\Google Drive\\Darby Work\\LOD paper\\averaged_LOD_RMSEP_results.csv"
detail_path = "C:\\Users\\ytsma22c\\Google Drive\\Darby Work\\LOD paper\\detailed_LOD_RMSEP_results.csv"
for table, destination in ((full_avg_results, full_path), (full_detail_results, detail_path)):
    table.to_csv(destination, index=False)