In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

folder = 'H:\\My Drive\\PROJECTS\\PSI 2022-2025\\XRF fundamentals vs. MVA'

## FP

In [3]:
# ACTUAL VALUES
meta = pd.read_csv(folder+'\\data\\meta_both.csv')
#format
meta.drop(columns='Sample_Name',inplace=True)
to_drop = [x for x in meta.columns if 'Folds' in x]
meta.drop(columns=to_drop, inplace=True)
# get elements
element_list = list(meta.columns[1:])
element_list.remove('Hg')
# continue formatting
new = [x+'_actual' for x in meta.columns[1:]]
new.insert(0,'pkey')
meta.columns=new

# PREDICTED VALUES
fp = pd.read_csv(folder+'\\instrument_predictions.csv')
# format
fp = fp.groupby('pellet_name', as_index=False).mean()
to_drop = [x for x in fp.columns if '+/-' in x]
fp.drop(columns=to_drop, inplace=True)
fp.columns = [x.split(' ')[0] for x in fp.columns]
new = [x+'_pred' for x in fp.columns[1:]]
new.insert(0,'pkey')
fp.columns=new

# merge
fp_full = meta.merge(fp)
#format
cols = list(fp_full.columns[1:])
cols.sort()
cols.insert(0,'pkey')
fp_full = fp_full[cols]

rmsep_list=[]
r2_list=[]
adj_r2_list=[]
n_list=[]

for element in element_list:
    
    p = f'{element}_pred'
    t = f'{element}_actual'
    temp = fp_full[(~fp_full[p].isna())&(~fp_full[t].isna())].copy()
    n = len(temp)
    n_list.append(n)
    
    # RMSEP
    rmsep = sqrt(mean_squared_error(temp[t], temp[p]))
    rmsep_list.append(rmsep)
    
    # R2
    r2 = r2_score(temp[t], temp[p])
    adj_r2 = 1 - (1-r2)*(len(temp) - 1) / (len(temp) - (temp.shape[1] - 1) - 1)
    r2_list.append(r2)
    adj_r2_list.append(adj_r2)
    
fp_results = pd.DataFrame({
    'element':element_list,
    'n_test':n_list,
    'rmsep':rmsep_list,
    'r2':r2_list,
    'adj_r2':adj_r2_list
})
fp_results.to_csv(folder+'\\FP_RMSEP_results.csv', index=False)

## MVA

In [15]:
# using PLS because best %RMSEP results
model = 'PLS'
# using both filters bc lowest errors
filt = 'both'

sens_df = pd.read_csv(folder+'\\sensitivities.csv')
sens = sens_df[sens_df['filter']==filt]['median'][0]

info = pd.read_csv(folder+'\\data\\dataset_summary.csv')
element_list = info.element.values

loq_list=[]
rmsep_list=[]
r2_list=[]
adj_r2_list=[]

pt_full = meta[['pkey']] # to make cumulative pred/true

for element in element_list:
    
    # LOQ
    coeff = pd.read_csv(f'{folder}\\models\\{filt}\\{element}_{model}_coefs.csv')
    vector = pow(pow(coeff['coef'], 2).sum(),0.5)  #square root of sum of squares
    loq = 10 * vector * sens
    loq_list.append(loq)
    
    # adjusted RMSEP
    p = f'{element}_pred'
    t = f'{element}_actual'
    p_t = pd.read_csv(f'{folder}\\models\\{filt}\\{element}_{model}_test_pred_true.csv')
    p_t = p_t[p_t[p]>loq] # remove those below LOQ
    pt_full = pt_full.merge(p_t, how='left') # add to full table
    
    rmsep = sqrt(mean_squared_error(p_t[t], p_t[p]))
    rmsep_list.append(rmsep)
    
    # R2
    r2 = r2_score(p_t[t], p_t[p])
    adj_r2 = 1 - (1-r2)*(len(p_t) - 1) / (len(p_t) - (p_t.shape[1] - 1) - 1)
    r2_list.append(r2)
    adj_r2_list.append(adj_r2)
    
mva_results = pd.DataFrame({
    'element':element_list,
    'loq':loq_list,
    'rmsep':rmsep_list,
    'r2':r2_list,
    'adj_r2':adj_r2_list
})
mva_results.to_csv(folder+'\\MVA_LOQ_RMSEP_results.csv', index=False)
pt_full.to_csv(folder+'\\MVA_test_predictions.csv', index=False)

## compare

In [4]:
# format to compare
cols=['MVA_'+x for x in mva_results.columns[1:]]
cols.insert(0,'element')
mva_results.columns=cols

cols=['FP_'+x for x in fp_results.columns[1:]]
cols.insert(0,'element')
fp_results.columns=cols

df = mva_results.merge(fp_results)

In [5]:
df.head()

Unnamed: 0,element,MVA_loq,MVA_rmsep,MVA_r2,MVA_adj_r2,FP_n_test,FP_rmsep,FP_r2,FP_adj_r2
0,SiO2,0.052299,4.44539,0.886155,0.885564,1938,13.532021,-0.067573,-0.099941
1,TiO2,0.009115,0.538444,0.676496,0.674748,1894,0.648947,0.656393,0.645726
2,Al2O3,0.017246,3.139423,0.151046,0.146494,1934,3.5612,0.307327,0.28628
3,Fe2O3,0.010774,1.473955,0.906742,0.906251,1927,2.111104,0.849487,0.844896
4,MgO,0.017401,2.732759,0.608159,0.605639,1931,4.348753,0.592624,0.580227
