In [1]:
# --------------------------------------------------------------------------

# ----------------- use sample_selection functions

# -------------------------------------------------------------------------


# ¡¡¡ --- !!! # ---> modules and data cases

# --- system modules

import sys
import datetime
import os


# Root folder of the project (code under /methodology, data under /data,
# results under /experiments). The historical hard-coded location is kept as
# the default; set the SAMPLE_SELECTION_BASE_DIR environment variable to run
# the notebook from a different checkout without editing this cell.
base_dir = os.environ.get("SAMPLE_SELECTION_BASE_DIR", "/sample_selection_simpls")

# --- data handling modules

import numpy as np
import pandas as pd
import scipy.io as sp_io
import scipy as sp

# --- visualization modules

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- my modules

# Make the project's own modules importable by putting their folders on sys.path.
methods_dir = base_dir + '/methodology'
for _module_folder in ('/model_building', '/read_data', '/sample_selection'):
    # insert(0, ...) pushes each folder to the front, so the folder listed
    # last ends up being searched first.
    sys.path.insert(0, methods_dir + _module_folder)
from class_chemometrics_data import chemometrics_data
from class_sample_selection import sample_selection
import simpls_module




# ¡¡¡ --- !!! # ---> base working directory and available data cases


# ************************************ init --- user 
# Available data cases: key -> [case folder name, prepared-data file stem].
cases_dict = {
    "d0001": ["d0001_corn", "d0001_data_prepared_01"],
}
# ************************************ end --- user 


print("--------- imports loaded ----------")

# Identifier of the experiment folder that results are written to.
experimentID = "exp001"


# ¡¡¡ --- !!! # ---> data


# ************************************ init --- user 
caseID_key = "d0001"
# ************************************ end --- user 

# Resolve the chosen case into its folder, file stem and data sub-path
# (the sub-path is relative to base_dir and keeps its leading slash).
_case_entry = cases_dict[caseID_key]
case_dir = _case_entry[0]
dname = _case_entry[1]
data_dir = "".join(['/data/', case_dir, '/data_prepared/'])





# ************************************ init --- user
# Load the prepared .mat file of the selected case through the project's
# chemometrics_data reader. The options are collected first so the
# user-tunable switches are visible in one place.
_mat_file = base_dir + data_dir + dname + '.mat'
_load_options = dict(
    data_identifier = data_dir + dname,
    include_val = False,
    include_test = True,
    include_unlabeled = False,
    y_all_range = False,
    y_range = np.array([0]),
    obs_all_cal = True,
    shuffle = False,
)
data_class = chemometrics_data(_mat_file, **_load_options)
# ************************************ end --- user



# Quick report: the reader's ncal and K attributes, then the shape of the
# test block it returns.
print(data_class.ncal, data_class.K)
_xtest = data_class.get_test()["xtest"]
print(_xtest.shape)


print("".join(["--------- data loaded for ", data_class.data_identifier, "----------"]))




--------- imports loaded ----------
56 700
(24, 700)
--------- data loaded for /data/d0001_corn/data_prepared/d0001_data_prepared_01!*moisture*!----------


In [2]:
# --- factors
#
# Levels of every factor that is crossed in the experimental design.

# Value stored with every run under the "total_lv" key.
total_lv = 15

# Names of the sample selection methods to compare.
method_name = ["random","ks","duplex","puchwein","clustering","simplisma", "successive_projections", "shenkwest","honigs"]

# Number of principal components: 1..14, plus data_class.K as a special level
# (the selection loop treats npc == data_class.K as "no dimension reduction").
npc_range = [*range(1, 15), data_class.K]

# Requested subset sizes: 20 up to (but not including) data_class.ncal.
sample_sizes = list(range(20, data_class.ncal))


In [8]:
# --- making design
#
# Full-factorial design: one dict per (npc, method, sample size) combination.
# npc varies slowest and sample size fastest, so a run's position in the list
# follows that nesting.

design = [
    {"npc": pc,
     "method_name": method,
     "sample_size": ss,
     "total_lv": total_lv}
    for pc in npc_range
    for method in method_name
    for ss in sample_sizes
]

# To inspect the design as a table: pd.DataFrame(design)


In [10]:
# --- all iterations for sample selection
#
# Runs every configuration in `design` through the matching sample selection
# method and collects the runs that pass the size check into `df_output`
# (one row per retained run, indexed by the run's position in `design`).

# NOTE: this rebinds the name `datetime` (imported as a module in the first
# cell) to the datetime class for the rest of the session.
from datetime import datetime


def _select_samples(selector, method, n_sel, dim_red, dist_measure):
    """Run one sample selection method and return its "sample_id" result.

    Parameters
    ----------
    selector : sample_selection
        Selection object already prepared for the current run.
    method : str
        One of the method names used in the design ("random", "ks", "duplex",
        "puchwein", "clustering", "simplisma", "successive_projections",
        "shenkwest", "honigs").
    n_sel : int
        Requested number of samples, passed to every method as ``Nout``.
    dim_red : bool
        Forwarded as ``dim_reduction`` to the methods that accept it.
    dist_measure : str
        Forwarded as ``distance_measure`` to the methods that accept it.

    Returns
    -------
    The "sample_id" entry of the dict returned by the chosen method.

    Raises
    ------
    ValueError
        If ``method`` is not a known method name. Without this check an
        unknown name would silently reuse the previous run's selection.
    """
    if method == "random":
        result = selector.random_sample(Nout = n_sel)
    elif method == "ks":
        result = selector.kennard_stone(Nout = n_sel, fixed_samples=None, dim_reduction=dim_red, distance_measure=dist_measure)
    elif method == "duplex":
        result = selector.duplex(Nout = n_sel, dim_reduction=dim_red, distance_measure=dist_measure)
    elif method == "puchwein":
        result = selector.puchwein(Nout = n_sel, factor_k=0.0001, dim_reduction=dim_red, distance_measure=dist_measure)
    elif method == "clustering":
        result = selector.clustering(Nout = n_sel, dim_reduction=dim_red, distance_measure=dist_measure)
    elif method == "simplisma":
        result = selector.simplisma(Nout = n_sel, fixed_samples = None, alpha_factor = 0.01, center=True)
    elif method == "successive_projections":
        result = selector.successive_projections(Nout = n_sel, fixed_samples = None, center = True)
    elif method == "shenkwest":
        result = selector.shenkwest(Nout = n_sel, rm_outlier = False, dim_reduction=dim_red, distance_measure=dist_measure)
    elif method == "honigs":
        result = selector.honigs(Nout = n_sel)
    else:
        raise ValueError("unknown sample selection method: " + repr(method))

    return result["sample_id"]


print('started: ',datetime.now())

selected_samples_dict = {}

# NOTE(review): the "random" method is not seeded here, so its selections
# presumably differ between runs — confirm whether sample_selection seeds itself.
for ii, design_setting in enumerate(design):

    # - progress indicator every 100 runs
    if ii % 100 == 0:
        print(ii)

    # - class sample selection (rebuilt for every run on the calibration block)

    my_sample_selection = sample_selection(data_class.get_cal()["xcal"], ncp = design_setting["npc"])
    my_sample_selection.get_xcal_pca_scores(first_ncp = 0)

    # - sample size

    n_sel = design_setting["sample_size"]

    # - input matrix type for sample selection function arguments:
    #   npc == data_class.K means "no dimension reduction" with euclidean
    #   distance; any other npc uses dimension reduction with mahalanobis.

    use_full_space = design_setting["npc"] == data_class.K
    dim_red = not use_full_space
    dist_measure = "euclidean" if use_full_space else "mahalanobis"

    # - select samples based on all settings depending on method

    current_samples = _select_samples(my_sample_selection,
                                      design_setting["method_name"],
                                      n_sel,
                                      dim_red,
                                      dist_measure)

    # - save result
    #   Keep the run only when the returned array sums to at least n_sel
    #   (presumably "sample_id" is a 0/1 selection mask, so this drops runs
    #   that selected fewer samples than requested — TODO confirm).
    #   A copy of the settings is stored so the dicts in `design` stay untouched.

    if current_samples.sum() >= n_sel:

        selected_samples_dict[str(ii)] = dict(design_setting,
                                              selected_samples=current_samples.flatten())


print('finished: ',datetime.now())

df_output = pd.DataFrame.from_dict(selected_samples_dict, orient="index")


finished:  2020-09-04 17:18:34.938270
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
finished:  2020-09-04 17:21:33.940205


In [12]:
# Persist the selection results. The experiment's output folder is created
# first so that a missing directory cannot make the write fail after the
# selection loop has already been computed.
output_dir = base_dir + "/experiments/" + experimentID + "/output/"
os.makedirs(output_dir, exist_ok=True)
df_output.to_pickle(output_dir + caseID_key + "_01_design_selected_samples.pkl")


In [13]:
# Sanity check on the result table: (number of retained runs, number of columns).
df_output.shape

(4150, 5)