### Running the experiments, collecting the results, and saving them to CSV files.

In [None]:
# load datasets

from data import Datasets

# Benchmark datasets, each paired with its `train_size` setting.
datasets = [
    {"name": name, "train_size": train_size}
    for name, train_size in (
        ("Breast", 260),
        ("Colorectal", 170),
        ("Leukemia", 80),
        ("Liver", 140),
        ("Prostate", 100),
        ("Renal", 120),
        ("Throat", 80),
    )
]

# Attach the loaded data to every entry and print a one-line summary of it:
# dataset name, shape of X and the per-class object counts of y.
for ds in datasets:
    ds["data"] = Datasets.get(ds["name"])
    class_counts = list(ds["data"]["y"].value_counts())
    print(f'{ds["name"]}\tX.shape={ds["data"]["X"].shape}\t#objects in classes:{class_counts}')

In [None]:
# Settings shared by all experiment cells

# Numbers of objects by which the training sets differ:
# every value from 1 to 10, then the even values from 12 to 20.
ps = [*range(1, 11), *range(12, 21, 2)]

# Upper bound on the number of cross-validation steps (keeps the run time down).
max_cv_steps = 10

# Fixed random initial states, so that the results are reproducible.
random_states = [42, 16, 101]

#### Single-classifier experiments

In [None]:
# Single classifier experiment
# classifiers: LogisticRegression
# datasets: Breast, Colorectal, Leukemia, Liver, Prostate, Renal, Throat


import os

from sklearn.linear_model import LogisticRegression
from src.experiment import experiment


# L1-penalised logistic regression at three values of C
# (in scikit-learn, C is the inverse of the regularisation strength).
# NOTE(review): "threshold" and "importance_getter" are presumably consumed by
# the feature-selection step inside src.experiment — confirm there.
classifiers_lr = [
    {
        "name": "1_LR_C015",
        "estimator": LogisticRegression(C=0.15, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "2_LR_C05",
        "estimator": LogisticRegression(C=0.5, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "3_LR_C075",
        "estimator": LogisticRegression(C=0.75, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = "results/single_classifier/results_lr.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_lr, datasets, ps, max_cv_steps, random_states, verbose=True)
results_df.to_csv(filename, index=False)

In [None]:
# Single classifier experiment
# classifiers: SVM
# datasets: Breast, Colorectal, Leukemia, Liver, Prostate, Renal, Throat


import os

from sklearn.svm import LinearSVC
from src.experiment import experiment


# L1-penalised linear SVM at three values of C
# (in scikit-learn, C is the inverse of the regularisation strength).
# NOTE(review): "threshold" and "importance_getter" are presumably consumed by
# the feature-selection step inside src.experiment — confirm there.
classifiers_svm = [
    {
        "name": "1_SVM_C005",
        "estimator": LinearSVC(C=0.05, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "2_SVM_C035",
        "estimator": LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        # Was "2_SVM_C06": the duplicated "2_" prefix broke the 1_/2_/3_
        # numbering used by all other classifier lists in this notebook.
        "name": "3_SVM_C06",
        # NOTE(review): higher iteration cap than the two entries above —
        # presumably needed for convergence at this C; confirm.
        "estimator": LinearSVC(C=0.6, penalty="l1", dual=False, max_iter=150000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = "results/single_classifier/results_svm.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_svm, datasets, ps, max_cv_steps, random_states, verbose=True)
results_df.to_csv(filename, index=False)

In [None]:
# Single classifier experiment
# classifiers: CPL
# datasets: Breast, Colorectal, Leukemia, Liver, Prostate, Renal, Throat


import os

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


# CPL classifier (GenetClassifier) at three values of its C parameter.
# NOTE(review): "threshold" and "importance_getter" are presumably consumed by
# the feature-selection step inside src.experiment — confirm there.
classifiers_cpl = [
    {
        "name": "1_CPL_C06",
        "estimator": GenetClassifier(C=0.6),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "2_CPL_C08",
        "estimator": GenetClassifier(C=0.8),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "3_CPL_C092",
        "estimator": GenetClassifier(C=0.92),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = "results/single_classifier/results_cpl.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_cpl, datasets, ps, max_cv_steps, random_states, verbose=True)
results_df.to_csv(filename, index=False)

In [None]:
# Single classifier experiment
# classifiers: RandomForest
# datasets: Breast, Colorectal, Leukemia, Liver, Prostate, Renal, Throat

import os

from sklearn.ensemble import RandomForestClassifier
from src.experiment import experiment


# The same random-forest configuration three times; only the "threshold"
# value differs between the entries.
# NOTE(review): "threshold" and "importance_getter" are presumably consumed by
# the feature-selection step inside src.experiment — confirm there.
classifiers_rf = [
    {
        "name": "1_RF_t6e-3",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 6e-3,
        "importance_getter": "auto"
    },
    {
        "name": "2_RF_t3e-3",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 3e-3,
        "importance_getter": "auto"
    },
    {
        "name": "3_RF_t1e-3",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 1e-3,
        "importance_getter": "auto"
    },
]

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = "results/single_classifier/results_rf.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

# NOTE(review): unlike the other single-classifier cells, this call passes
# buffer=True — its meaning is defined in src.experiment; confirm it is intended.
results_df = experiment(classifiers_rf, datasets, ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

#### Multi-classifier experiments

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Breast

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Breast"

# One configuration per classifier family, with the values used for this dataset.
classifiers_breast = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 4.65e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.118, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.042, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.946),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_breast, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Colorectal

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Colorectal"

# One configuration per classifier family, with the values used for this dataset.
classifiers_colorectal = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 9.87e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.07, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.35, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.95),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_colorectal, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Leukemia

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Leukemia"

# One configuration per classifier family, with the values used for this dataset.
classifiers_leukemia = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 8.8e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.043, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.14, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.95),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_leukemia, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Liver

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Liver"

# One configuration per classifier family, with the values used for this dataset.
classifiers_liver = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 7.7e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.0165, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.036, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.92),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_liver, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Prostate

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Prostate"

# One configuration per classifier family, with the values used for this dataset.
classifiers_prostate = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 3.9e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.33, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.09, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.875),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_prostate, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Renal

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Renal"

# One configuration per classifier family, with the values used for this dataset.
classifiers_renal = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 6.8e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.097, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.04, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.81),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_renal, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)

In [None]:
# Multi classifiers experiment
# classifiers: CPL, RandomForest, LogisticRegression, SVM
# datasets: Throat

import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from src.cpl.classifiers import GenetClassifier
from src.experiment import experiment


dataset_name = "Throat"

# One configuration per classifier family, with the values used for this dataset.
classifiers_throat = [
    {
        "name": "RF",
        "estimator": RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=2),
        "threshold": 7.2e-3,
        "importance_getter": "auto"
    },
    {
        "name": "LR",
        "estimator": LogisticRegression(C=0.035, penalty='l1', solver='liblinear', max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "SVM",
        "estimator": LinearSVC(C=0.1, penalty="l1", dual=False, max_iter=100000),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
    {
        "name": "CPL",
        "estimator": GenetClassifier(C=0.93),
        "threshold": 1e-8,
        "importance_getter": "auto"
    },
]

# Look the dataset up by name and fail loudly when it is missing; a plain
# search loop would leave `dataset` unset, or silently keep the value assigned
# by a previously executed cell.
dataset = next((ds for ds in datasets if ds["name"] == dataset_name), None)
if dataset is None:
    raise ValueError(f"Dataset {dataset_name!r} not found in `datasets`")

# Prepare the output location before running the experiment, so that a missing
# directory cannot make the final save fail after all the computation is done.
filename = f"results/multi_classifiers/results_{dataset_name.lower()}.csv"
os.makedirs(os.path.dirname(filename), exist_ok=True)

results_df = experiment(classifiers_throat, [dataset], ps, max_cv_steps, random_states, verbose=True, buffer=True)
results_df.to_csv(filename, index=False)