In [1]:
import pandas as pd
from tqdm.notebook import tqdm

# get version
print("pandas version",pd.__version__)

fp = "G:\\My Drive\\Darby Work\\XRF fundamentals vs. MVA\\"

pandas version 1.3.2


In [2]:
spectra_path = "Z:\\data_pXRF\\MHC_Olympus_spectra.csv"
spectra = pd.read_csv(spectra_path)

metadata_path = "Z:\\data_pXRF\\MHC_Olympus_metadata.csv"
metadata = pd.read_csv(metadata_path)

In [3]:
# select samples with olympus predictions
pred_samples = list(metadata[
    metadata['Olympus-Predicted'] == 'yes'
]['pkey'])

# add wave column
pred_samples.insert(0, 'wave')


pred_spectra = spectra.filter(
    items = pred_samples
)

In [5]:
o1_spectra = pred_spectra.filter(regex=('(_1$)|(wave)'), axis=1)
o2_spectra = pred_spectra.filter(regex=('(_2$)|(wave)'), axis=1)

## Sensitivity

In [4]:
noise_path = fp+"Braga noise regions.csv"
noise = pd.read_csv(noise_path)

In [6]:
print("Datasets cleaned:")
count = 0
for df in tqdm([o1_spectra, o2_spectra]):
   
    if count == 0:
        noise_temp = noise[noise['Filter'] == 1].reset_index(drop=True)
    else: noise_temp = noise[noise['Filter'] == 2].reset_index(drop=True)
    count += 1
        
    print("Rows cleaned:") 
    for row in tqdm(df.index):
        nm = df['wave'][row]
        # remove rows below first region
        if nm < noise_temp['Start'][0]:
            df.drop(row, axis = 'index', inplace=True)
        # remove rows after last region
        elif nm > noise_temp['Stop'][len(noise_temp)-1]:
            df.drop(row, axis = 'index', inplace=True)
        # remove rows between the regions
        for region in range(len(noise_temp)-1):
            if (nm > noise_temp['Stop'][region]) & (nm < noise_temp['Start'][region+1]):
                df.drop(row, axis = 'index', inplace=True)

Datasets cleaned:


  0%|          | 0/2 [00:00<?, ?it/s]

Rows cleaned:


  0%|          | 0/2042 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Rows cleaned:


  0%|          | 0/2042 [00:00<?, ?it/s]

In [7]:
o1_sens = round(o1_spectra.set_index('wave').std(axis=1).mean(),3)
o2_sens = round(o2_spectra.set_index('wave').std(axis=1).mean(),3)

In [8]:
print("Filter 1 sensitivity:", o1_sens, 
      "\nFilter 2 sensitivity:",o2_sens)

Filter 1 sensitivity: 261.024 
Filter 2 sensitivity: 159.383


### Function to calculate LBDQs from model and instrument sensitivity

In [None]:
def get_lbdq(folder, file_list):
    
    print("LBDQ:")
    
    for filter_n in ['O1', 'O2']:
        
        # get sensitivity value
        sensitivity = o1_sens if filter_n == 'O1' else o2_sens
        
        coeffs = []
        ftype = filter_n + "_coeff"

        # read models
        for file in tqdm(file_list):
            if ftype in file:       
                path = folder + file
                data = pd.read_csv(path, skiprows = [0])
                coeffs.append(data)

        # convert to dataframe
        coeffs = pd.concat(coeffs).T

        # adapt to different element naming b/w datasets
        coeffs.columns = coeffs.iloc[0].map(lambda x: x.split()[0])

        coeffs = coeffs.drop(coeffs.index[0])

        # calculate regression vectors
        vector_list = coeffs.pow(2).sum().pow(.5)  #square root of sum of squares

        # populate lists
        elem_list = coeffs.columns
        filt_list = list(filter_n * len(elem_list))

        df = pd.DataFrame({'element' : elem_list,
                           'vector' : vector_list,
                           'filter' : filt_list
        }).reset_index(drop = True)

        # calculate values
        types = ['LOB', 'LOD', 'LOQ']
        factors = [1.645, 3.3, 10]

        for i in range(len(types)):
            df[types[i]] = factors[i] * sensitivity * df['vector']

        # change col formats
        cols = df.columns.drop('element')
        df[cols] = df[cols].apply(pd.to_numeric)
        df['element'] = df['element'].astype(str)

        return df

### Function to calculate RMSEPs

In [None]:
majors = ['Al2O3', 'CaO', 'Fe2O3', 'MgO', 'MnO', 'P2O5', 'SiO2', 'TiO2']

def get_rmsep(folder, file_list, comps, filter_n lbdq):
    
    print("RMSEP:")
    
    elem_list = []
    avg_list = []
    rmsep_list = []
    r2_list = []
    
    ftype = filter_n + "_test"
    
    for file in tqdm(file_list):
        if ftype in file:       
            path = (folder + file)
            data = pd.read_csv(path)
            
            # get element
            element = data.columns[1].split()[0]
            elem_list.append(element)
            
            # format columns
            data.columns = ['pkey', 'Actual', 'Pred']
            data = data.drop([0])
            data.Pred = data.Pred.astype(float)  
            
            # remove predictions above 100 for majors
            if element in majors:
                data = data[data.Pred < 100]
                
            # remove all predictions below 0
            data = data[data.Pred > 0].reset_index(drop=True).sort_index(axis=1)
                   
            # order columns
            data = data[['pkey', 'Actual', 'Pred']].drop_duplicates(subset = 'pkey').sort_values(by='pkey').reset_index(drop=True)
               
            # subselect relevant reference values
            ref = lbdq[lbdq.element == element].reset_index(drop=True)

            # add in Actual concentrations
            temp_comps = comps[comps.pkey.isin(data.pkey)].reset_index(drop=True) 
            # note and remove samples that don't have composition info
            no_comps = data[~data.pkey.isin(temp_comps.pkey)]
            if len(no_comps) > 0:
                no_comps_list.append(list(no_comps.pkey)) # add to list
                data = pd.concat([data, no_comps]).drop_duplicates(keep=False).reset_index(drop=True)
            
            data['Actual'] = temp_comps[temp_comps['pkey'] == data['pkey']][element]
            
            # remove NaN Actual values....which idk why they'd be there
            data = data.dropna()
            
            # calculate values
            loq = ref['LOQ'].iloc[0]
            # select just predictions above the LOQ
            data = data[data.Pred > loq].reset_index(drop=True)
            # get average concentration
            avg = data['Actual'].mean()
            avg_list.append(avg)
            # get R2
            r2 = r2_score(data.Actual, data.Pred)
            r2_list.append(r2)
            # get RMSE-P
            data['sqerror'] = (data.Actual - data.Pred).pow(2)
            rmsep = data['sqerror'].mean() ** 0.5
            rmsep_list.append(rmsep)

    
    df = pd.DataFrame({
        "element" : elem_list,
        "Avg" : avg_braga_list,
        "RMSEP" : rmsep_braga_list,
        "R2" : r2_braga_list,
    })
    
    # give list of samples without comps
    no_comps_list = [item for sublist in no_comps_list for item in sublist]
    no_comps_list = list(set(no_comps_list))
    if len(no_comps_list) > 0: print("Sample(s)", str(no_comps_list), "have no composition info and were removed")
    
    return df

In [None]:
def get_results(regression, n_range):
    
    print('Calculating for', regression, n_range)

    folder = fp+"\\models\\"+regression+"\\"+n_range+"\\"
    file_list = os.listdir(folder)

    # calculate lbdq
    lbdq = get_lbdq(folder, file_list)
    # calculate rmsep with lbdq results
    rmsep = get_rmsep(folder, file_list, comps, lbdq)
    # merge results
    df = pd.merge(lbdq, rmsep, how='outer', on='element')
    df.insert(loc=2, column='num_range', value=n_range)
    # return full results
    return df 