In [1]:
import os
import pickle
from math import sqrt, isnan

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# model
from sklearn.cross_decomposition import PLSRegression

# math
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Root folder for every input read and every output written by this notebook.
# NOTE: absolute, machine-specific path -- point it at your own copy of the data.
fp = "G:\\My Drive\\Darby Work\\Ytsma and Dyar 2021 (LOD paper)\\"

#### Compositions

In [10]:
# Load the sample composition tables from the `tables` folder under `fp`:
# Table S1 holds the MHC sample compositions, Table S2 the LANL ones.
mhc_comps = pd.read_csv(fp+'tables\\TableS1_MHC_sample_compositions.csv')
lanl_comps = pd.read_csv(fp+'tables\\TableS2_LANL_sample_compositions.csv')

#### Datasets (baseline removal applied)

In [3]:
dp = fp + 'datasets//'


def _read_spectra(file_name):
    """Read one spectra CSV from the datasets folder `dp`."""
    return pd.read_csv(dp+file_name)


# same 205 standards
cc_mars = _read_spectra('CC_norm3_spectra.csv')
cl_mars_cc = _read_spectra('CL_Mars_norm3_spectra_CC_matched.csv')
cc_mars_unnorm = _read_spectra('CC_unnorm_spectra.csv')
cl_mars_cc_unnorm = _read_spectra('CL_Mars_unnorm_spectra_CC_matched.csv')

# same 2607 standards
cl_mars = _read_spectra('CL_Mars_norm3_spectra_matched.csv')
cl_earth = _read_spectra('CL_Earth_norm3_spectra_matched.csv')
cl_vac = _read_spectra('CL_Vacuum_norm3_spectra_matched.csv')
cl_mars_unnorm = _read_spectra('CL_Mars_unnorm_spectra_matched.csv')
cl_earth_unnorm = _read_spectra('CL_Earth_unnorm_spectra_matched.csv')
cl_vac_unnorm = _read_spectra('CL_Vacuum_unnorm_spectra_matched.csv')

#### Sensitivities

In [4]:
# Instrument sensitivity table. The modelling loop filters it on the
# `instrument`, `atmosphere`, `normalization` and `method` columns and reads
# the matching `sensitivity` value.
sensitivities = pd.read_csv(fp+'instrument_sensitivities.csv')

#### Outlier limits
Calculated as Q3 + 1.5 × IQR over the entire MHC dataset, or taken as the highest natural sample for doped elements.

In [5]:
# per-element upper concentration limits used to exclude outliers from training
outlier_limits = pd.read_csv('Z:\\Millennium Set\\NEW_OUTLIER_LIMITS.csv')

# element -> limit lookups, one per outlier definition
iqr_outliers = {
    elem: limit
    for elem, limit in zip(outlier_limits.element, outlier_limits.iqr_q3_outlier_limit)
}
dope_outliers = {
    elem: limit
    for elem, limit in zip(outlier_limits.element, outlier_limits.highest_natural_for_doped)
}

#### Make models per element

In [6]:
# One row per dataset: (normalized spectra, unnormalized spectra, atmosphere, instrument).
# The rows are unzipped into the parallel lists that the modelling loop walks through.
dataset_table = [
    (cl_mars,    cl_mars_unnorm,    'Mars',   'CL'),
    (cl_earth,   cl_earth_unnorm,   'Earth',  'CL'),
    (cl_vac,     cl_vac_unnorm,     'Vacuum', 'CL'),
    (cc_mars,    cc_mars_unnorm,    'Mars',   'CC'),
    (cl_mars_cc, cl_mars_cc_unnorm, 'Mars',   'CL_CC'),
]
normed_dfs, unnorm_dfs, atms, insts = (list(column) for column in zip(*dataset_table))

# elements (oxides and trace elements) that get their own model
elements = ['MnO', 'Na2O', 'SiO2', 'Li', 'Ni', 'Pb', 'Rb', 'Sr', 'Zn']

In [None]:
# PLS parameters
n_folds = 5           # maximum number of cross-validation folds
max_components = 30   # largest number of PLS components tried

# Result accumulators: one entry is appended to every list for each
# dataset / number range / element / outlier definition / sensitivity method.
norm_list = []
atm_list = []
inst_list = []
n_range_list = []
element_list = []
outlier_list = []
sens_list = []
n_train_list = []
rmsecv_list = []
component_list = []
loq_list = []

# NOTE: only the normalized ('norm3') datasets are modelled here. The outer
# loop over ['unnorm', 'norm3'] was disabled as a temporary change while
# fixing an error; restore it to also run `unnorm_dfs`.
norm = 'norm3'
datasets = normed_dfs

# `datasets`, `atms` and `insts` are parallel lists, so walk them together.
for dataset, atm, inst in tqdm(zip(datasets, atms, insts), total=len(datasets),
                               desc='dataset', leave=False):

    # select correct composition file
    comps = lanl_comps if 'CC' in inst else mhc_comps

    # the 'CL_CC' dataset is looked up under the 'CL' instrument sensitivity
    sens_inst = 'CL' if inst == 'CL_CC' else inst

    for n_range in tqdm(['0-750', '250-1000'], desc='n_range', leave=False):

        # filter the compositions by their 'Random Number' column
        if n_range == '0-750':
            all_comps = comps[comps['Random Number'] <= 750].copy(deep=True)
        elif n_range == '250-1000':
            all_comps = comps[comps['Random Number'] >= 250].copy(deep=True)

        for element in tqdm(elements, desc='element', leave=False):

            # first composition column whose name contains the element string
            elem_col = [i for i in all_comps.columns if element in i][0]

            for outlier in tqdm(['iqr_q3', 'high_natl'], desc='outlier', leave=False):

                # folder the model and its coefficients are written to;
                # create it so a fresh run does not fail on a missing directory
                outpath = "{}python_models\\{}\\{}_{}\\{}\\{}\\".format(fp,norm,inst,atm,n_range,outlier)
                os.makedirs(outpath, exist_ok=True)

                out = iqr_outliers if outlier=='iqr_q3' else dope_outliers
                out_lim = out[element]

                # no limit defined (NaN): keep every sample with a value;
                # otherwise keep samples at or below the limit
                if isnan(out_lim):
                    keep = ~all_comps[elem_col].isna()
                else:
                    keep = all_comps[elem_col] <= out_lim
                train_comps = all_comps[keep].reset_index(drop=True)[['Sample Name', elem_col]]

                # prepare data for modelling
                y_train = train_comps[elem_col].values
                n_train = len(y_train)

                # Cap the folds for THIS model only. Overwriting the global
                # `n_folds` would silently lower the fold count for every
                # model trained afterwards.
                folds = min(n_folds, n_train)

                # spectra are stored one sample per column; transpose so that
                # each row is one sample's spectrum
                X_train = dataset[list(train_comps['Sample Name'])].to_numpy().T

                # cross validation: mean RMSE over the folds for each number
                # of components (keyed by component count so that equal
                # scores cannot overwrite each other)
                cv_rmse = {}
                for n_components in range(2, max_components + 1):
                    temp_pls = PLSRegression(n_components=n_components, scale=False)
                    fold_rmse = -cross_val_score(
                        temp_pls, X_train, y_train, cv=folds, scoring='neg_root_mean_squared_error'
                    )
                    cv_rmse[n_components] = fold_rmse.mean()

                # select parameters of model with lowest rmsecv
                component = min(cv_rmse, key=cv_rmse.get)
                rmsecv = cv_rmse[component]
                model = PLSRegression(n_components=component, scale=False)

                model.fit(X_train, y_train)
                # context manager so the file handle is closed after writing
                with open(outpath+element+'_model.asc', 'wb') as model_file:
                    pickle.dump(model, model_file, protocol=0)

                coeff = pd.DataFrame(model.coef_)
                coeff.to_csv(outpath+element+'_coeffs.csv', index=False)

                # Euclidean length of the regression vector. Taking the norm
                # of the whole array is independent of whether `coef_` is
                # laid out as (n_features, 1) or (1, n_features), which
                # differs between scikit-learn versions.
                coeff_norm = np.linalg.norm(model.coef_)

                for sens in ['braga', 'metals']:

                    sensitivity = sensitivities[
                        (sensitivities.instrument == sens_inst) &
                        (sensitivities.atmosphere == atm) &
                        (sensitivities.normalization == norm) &
                        (sensitivities.method == sens)
                    ]['sensitivity'].iloc[0]

                    # calculate LOQ = 10 * sensitivity * |regression vector|
                    loq = 10 * sensitivity * coeff_norm

                    # descriptive
                    norm_list.append(norm)
                    atm_list.append(atm)
                    inst_list.append(inst)
                    n_range_list.append(n_range)
                    element_list.append(element)
                    outlier_list.append(outlier)
                    sens_list.append(sens)
                    # unique values
                    n_train_list.append(n_train)
                    rmsecv_list.append(rmsecv)
                    component_list.append(component)
                    loq_list.append(loq)

dataset:   0%|          | 0/5 [00:00<?, ?it/s]

n_range:   0%|          | 0/2 [00:00<?, ?it/s]

element:   0%|          | 0/9 [00:00<?, ?it/s]

outlier:   0%|          | 0/2 [00:00<?, ?it/s]

outlier:   0%|          | 0/2 [00:00<?, ?it/s]

outlier:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Assemble the per-model results table: one row per entry of the
# accumulator lists filled by the modelling loop.
result_columns = {
    'element': element_list,
    'instrument': inst_list,
    'atmosphere': atm_list,
    'normalization': norm_list,
    'outlier_defn': outlier_list,
    'sens_method': sens_list,
    'num_range': n_range_list,
    'n_train': n_train_list,
    'rmsecv': rmsecv_list,
    'components': component_list,
    'loq': loq_list,
}
results = pd.DataFrame(result_columns)

results.to_csv(fp+'train_results_020422.csv', index=False)

In [None]:
# Group on everything except the number range, so each group holds the
# models trained on the different 'num_range' subsets of one configuration.
r = results.groupby(['element',
                     'normalization',
                     'outlier_defn',
                     'atmosphere',
                     'instrument',
                     'sens_method'], as_index=False)

# Aggregate only the numeric metrics. 'num_range' is a string column, and
# averaging it raises a TypeError on pandas >= 2.0 (older versions dropped
# it silently), so select the metric columns explicitly.
metric_cols = ['n_train', 'rmsecv', 'components', 'loq']

avg = r[metric_cols].mean()
std = r[metric_cols].std()

In [None]:
# The first six columns of `std` are the grouping keys; every column after
# them is a metric and gets an '_sd' suffix so it does not collide with the
# averages when the two tables are merged on the shared key columns.
n_key_cols = 6
key_cols = list(std.columns[:n_key_cols])
std.columns = key_cols + [name + '_sd' for name in std.columns[n_key_cols:]]

avg_results = pd.merge(avg, std)
avg_results.to_csv(fp+'average_train_results_020422.csv', index=False)