In [1]:
# --------------------------------------------------------------------------

# ----------------- experimental design covering all conditions for unsupervised sample selection

# -------------------------------------------------------------------------


# ¡¡¡ --- !!! # ---> modules and data cases

# --- system modules

import sys
import datetime
import os


# Project root: the parent of the directory the notebook is started from.
# Built with os.path.join/os.pardir rather than string concatenation so the
# result is also well-formed when the working directory is a filesystem root.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# --- data handling modules

import numpy as np
import pandas as pd
import scipy.io as sp_io
import scipy as sp

# --- visualization modules

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- my modules

# Folder holding the project's own modules; it is placed first on sys.path so
# the local imports that follow are resolved from there.
methods_dir = "{}/methods/".format(base_dir)
sys.path.insert(0, methods_dir)
from class_sample_selection import sample_selection
import simpls_module



# ¡¡¡ --- !!! # ---> user settings: data case and response column

# # ************************************ init --- user 
caseID_key = "d02_manure"  # file name (without ".mat") looked up under <base_dir>/data/
y_id = 1 # DM -- column picked from the y matrices and from "y_labels"
# # ************************************ end --- user 


# # ¡¡¡ --- !!! # ---> data

mat_filename = "{}/data/{}.mat".format(base_dir, caseID_key)
data_mat = sp_io.loadmat(mat_filename, struct_as_record=False)

# Predictor matrices, copied out of the loaded dictionary.
xcal, xtest = (data_mat[key].copy() for key in ("xcal", "xtest"))

# Responses: only column y_id is kept; it is selected with a one-element list
# so the column axis is preserved (assuming the loader returns numpy arrays).
ycal, ytest = (data_mat[key].copy()[:, [y_id]] for key in ("ycal", "ytest"))

chemical_comp_name = data_mat["y_labels"][y_id]

print("--- data ready ---")


--- data ready ---


In [2]:
# --- factors

# Size of the calibration matrix: ncal rows, K columns.
ncal, K = xcal.shape[0], xcal.shape[1]

# Numbers of components to explore: 1..30, followed by K itself.
npc_range = list(range(1, 31)) + [K]

# Sample selection methods compared in the design.
method_name = [
    "random",
    "ks",
    "duplex",
    "puchwein",
    "clustering",
    "optfederov",
]

# Subset sizes: 30, 40, 50, ... strictly below ncal.
sample_sizes = list(range(30, ncal, 10))



In [3]:
# --- making design

# One dict per run, enumerated with npc as the outermost factor, then the
# method, then the sample size. Every method gets the full grid except
# "optfederov", which is kept only when npc <= 30 and the sample size
# exceeds npc by at least 5.
design = [
    {"npc": n_comp, "method_name": selector, "sample_size": size}
    for n_comp in npc_range
    for selector in method_name
    for size in sample_sizes
    if selector != "optfederov" or (n_comp <= 30 and size - n_comp >= 5)
]

# pd.DataFrame(design)
print("done")
design[-1]
len(design)

done


7210

In [4]:
# Spot-check of the design: show the "optfederov" runs with a sample size below 40.
df = pd.DataFrame(design)
df.loc[df["method_name"].eq("optfederov") & df["sample_size"].lt(40)]

Unnamed: 0,method_name,npc,sample_size
195,optfederov,1,30
429,optfederov,2,30
663,optfederov,3,30
897,optfederov,4,30
1131,optfederov,5,30
1365,optfederov,6,30
1599,optfederov,7,30
1833,optfederov,8,30
2067,optfederov,9,30
2301,optfederov,10,30


In [5]:
# --- all iterations for sample selection
#
# For every run in `design`: build a `sample_selection` object on `xcal`,
# select `sample_size` samples with the requested method and store the chosen
# sample ids next to the run settings. The collected results are written to
# <base_dir>/scripts_output/ as a .mat file and as a pickled DataFrame.

output_dir = base_dir + "/scripts_output/"

# Create the output folder before the loop so a missing directory cannot make
# the save step fail after all the selections have already been computed.
os.makedirs(output_dir, exist_ok=True)

# Uses the `datetime` module imported with the system modules; re-importing
# the class under the same name here would shadow that module.
print('started: ', datetime.datetime.now())

selected_samples_dict = {}

for ii, design_setting in enumerate(design):

    # - progress trace every 100 runs

    if ii % 100 == 0:
        print(ii)

    n_components = design_setting["npc"]
    method = design_setting["method_name"]

    # - class sample selection
    # NOTE(review): the object is rebuilt for every run although only `npc`
    # changes its arguments; it could probably be cached per npc if the
    # selection methods do not modify it -- TODO confirm before changing.

    my_sample_selection = sample_selection(xcal, ncp = n_components)
    my_sample_selection.get_xcal_pca_scores(first_ncp = 0)

    # - sample size

    n_sel = design_setting["sample_size"]

    # - input matrix type for sample selection function arguments
    # npc > 30 can only be the "K" entry appended to the 1..30 range: those
    # runs ask for no dimension reduction and euclidean distance, all others
    # for dimension reduction and mahalanobis distance.

    if n_components > 30:
        dim_red = False
        dist_measure = "euclidean"
    else:
        dim_red = True
        dist_measure = "mahalanobis"

    # - select samples based on all settings depending on method
    # Exclusive branches with an explicit failure: an unmatched setting must
    # not silently reuse `current_samples` left over from the previous run.
    # NOTE(review): no seed is set in this notebook, so the "random" runs are
    # presumably not repeatable -- confirm whether random_sample seeds itself.

    if method == "random":

        selection = my_sample_selection.random_sample(Nout = n_sel)

    elif method == "ks":

        selection = my_sample_selection.kennard_stone(Nout = n_sel, fixed_samples=None, dim_reduction=dim_red, distance_measure=dist_measure)

    elif method == "duplex":

        selection = my_sample_selection.duplex(Nout = n_sel, dim_reduction=dim_red, distance_measure=dist_measure)

    elif method == "puchwein":

        selection = my_sample_selection.puchwein(Nout = n_sel, factor_k=0.0001, dim_reduction=dim_red, distance_measure=dist_measure)

    elif method == "clustering":

        selection = my_sample_selection.clustering(Nout = n_sel , dim_reduction=dim_red, distance_measure=dist_measure)

    elif method == "optfederov" and n_components <= 30:

        selection = my_sample_selection.optfederov_r(Nout = n_sel, fixed_samples=None, optimality_criterion='D')

    else:

        raise ValueError(
            "no sample selection available for run {} (method_name={!r}, npc={})".format(
                ii, method, n_components
            )
        )

    current_samples = selection["sample_id"]

    # - save result
    # The run dict is the same object that lives in `design`, so the entries
    # of `design` also carry "selected_samples" after this loop.

    design_setting["selected_samples"] = current_samples.flatten()

    selected_samples_dict[str(ii)] = design_setting


print('finished: ', datetime.datetime.now())

# ¡¡¡ --- !!! ---> save output 

output_stem = output_dir + caseID_key + "_01_design_selected_samples"

# NOTE(review): the top-level keys are "0", "1", ...; names starting with a
# digit are not valid MATLAB variable names -- confirm that whatever reads
# this .mat file can cope with them.
sp_io.savemat(output_stem + ".mat", selected_samples_dict)
df_output = pd.DataFrame.from_dict(selected_samples_dict, orient="index")
df_output.to_pickle(output_stem + ".pkl")

finished:  2020-10-28 19:57:21.029551
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
finished:  2020-10-28 20:13:28.752374
