In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import random

from multiprocessing import Pool
from functools import partial

from sklearn.model_selection import KFold

from scipy.stats import ttest_ind

from prettytable import PrettyTable

In [9]:
import performances_evaluation

In [10]:
import smote_cd

# Tecator dataset 

Comes from : https://www.openml.org/search?type=data&sort=runs&id=505&status=active

The article "Ace of bayes: Application of neural networks with pruning" (1993), using a Neural Network, had an RMSE of 0.52 (or 0.36 ?)

In [11]:
from scipy.io import arff

data = arff.loadarff('data/tecator.arff')
df = pd.DataFrame(data[0])

In [12]:
df

Unnamed: 0,absorbance_1,absorbance_2,absorbance_3,absorbance_4,absorbance_5,absorbance_6,absorbance_7,absorbance_8,absorbance_9,absorbance_10,...,principal_component_16,principal_component_17,principal_component_18,principal_component_19,principal_component_20,principal_component_21,principal_component_22,moisture,fat,protein
0,2.61776,2.61814,2.61859,2.61912,2.61981,2.62071,2.62186,2.62334,2.62511,2.62722,...,0.233876,0.899661,-0.042006,0.708717,-0.254173,-0.391579,-0.419313,60.5,22.5,16.7
1,2.83454,2.83871,2.84283,2.84705,2.85138,2.85587,2.86060,2.86566,2.87093,2.87661,...,-0.195980,-0.046359,-0.634996,-0.354403,0.014650,0.269542,0.096692,46.0,40.1,13.5
2,2.58284,2.58458,2.58629,2.58808,2.58996,2.59192,2.59401,2.59627,2.59873,2.60131,...,0.795466,0.762246,-1.268580,0.690588,-0.785592,-0.671237,-0.710290,71.0,8.4,20.5
3,2.82286,2.82460,2.82630,2.82814,2.83001,2.83192,2.83392,2.83606,2.83842,2.84097,...,0.616500,-0.388684,-2.068390,1.276160,-0.259437,-0.140206,-0.165963,72.8,5.9,20.7
4,2.78813,2.78989,2.79167,2.79350,2.79538,2.79746,2.79984,2.80254,2.80553,2.80890,...,-1.095540,-1.348450,0.453559,0.402624,0.031307,0.021824,0.145777,58.3,25.5,15.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,3.22518,3.23016,3.23505,3.23988,3.24476,3.24971,3.25472,3.25980,3.26498,3.27041,...,-0.897510,-1.094940,-0.047774,-1.072080,0.683963,0.743673,0.824097,70.0,6.6,23.2
236,3.30792,3.31259,3.31733,3.32205,3.32683,3.33169,3.33663,3.34170,3.34700,3.35242,...,-1.033330,-1.196250,0.991593,-0.987896,0.813800,0.919915,0.950013,69.7,7.1,23.2
237,2.66103,2.66344,2.66582,2.66820,2.67065,2.67321,2.67588,2.67864,2.68160,2.68481,...,0.413767,0.500473,-0.232210,0.160583,-0.456995,-0.425684,-0.438117,69.1,7.7,22.9
238,3.02438,3.02997,3.03550,3.04102,3.04653,3.05209,3.05769,3.06330,3.06903,3.07484,...,-0.250646,-0.298301,-0.223632,-0.436276,0.372878,0.288446,0.383060,66.3,10.4,22.4


In [13]:
X = np.array(df[df.columns[:100]])

In [14]:
Y = np.array(df[['moisture','fat','protein']])/100

# Results with GB

In [190]:
%%time
list_iter = np.arange(100)
with Pool(10) as pool:
    all_res_parallel_gb = pool.map(partial(performances_evaluation.eval_perf_gb, X=X, Y=Y, k_folds=10), list_iter)

Wall time: 15min


In [452]:
r2_compositional_tot_gb, r2_raw_tot_gb, r2_logratio_tot_gb = [],[],[]
logloss_compositional_tot_gb, logloss_raw_tot_gb, logloss_logratio_tot_gb = [], [], []
rmse_compositional_tot_gb, rmse_raw_tot_gb, rmse_logratio_tot_gb = [], [], []
accuracy_compositional_tot_gb, accuracy_raw_tot_gb, accuracy_logratio_tot_gb = [], [], []
f1_compositional_tot_gb, f1_raw_tot_gb, f1_logratio_tot_gb = [], [], []

r2_compositional_std_gb, r2_raw_std_gb, r2_logratio_std_gb = [], [], []
logloss_compositional_std_gb, logloss_raw_std_gb, logloss_logratio_std_gb = [], [], []
rmse_compositional_std_gb, rmse_raw_std_gb, rmse_logratio_std_gb = [], [], []
accuracy_compositional_std_gb, accuracy_raw_std_gb, accuracy_logratio_std_gb = [], [], []
f1_compositional_std_gb, f1_raw_std_gb, f1_logratio_std_gb = [], [], []

r2_compositional, r2_raw, r2_logratio = [], [], []
logloss_compositional, logloss_raw, logloss_logratio = [], [], []
rmse_compositional, rmse_raw, rmse_logratio = [], [], []
accuracy_compositional, accuracy_raw, accuracy_logratio = [], [], []
f1_compositional, f1_raw, f1_logratio = [], [], []

In [453]:
for i in range(len(all_res_parallel_gb)):
    r2_compositional_temp, r2_raw_temp, r2_logratio_temp, logloss_compositional_temp, logloss_raw_temp, logloss_logratio_temp, rmse_compositional_temp, rmse_raw_temp, rmse_logratio_temp, accuracy_compositional_temp, accuracy_raw_temp, accuracy_logratio_temp, f1_compositional_temp, f1_raw_temp, f1_logratio_temp = all_res_parallel_gb[i]
    r2_compositional.append(np.round(np.mean(r2_compositional_temp,axis=0),5))
    r2_raw.append(np.round(np.mean(r2_raw_temp,axis=0),5))
    r2_logratio.append(np.round(np.mean(r2_logratio_temp,axis=0),5))
    logloss_compositional.append(np.round(np.mean(logloss_compositional_temp),5))
    logloss_raw.append(np.round(np.mean(logloss_raw_temp),5))
    logloss_logratio.append(np.round(np.mean(logloss_logratio_temp),5))
    rmse_compositional.append(np.round(np.mean(rmse_compositional_temp),5))
    rmse_raw.append(np.round(np.mean(rmse_raw_temp),5))
    rmse_logratio.append(np.round(np.mean(rmse_logratio_temp),5))
    accuracy_compositional.append(np.round(np.mean(accuracy_compositional_temp),5))
    accuracy_raw.append(np.round(np.mean(accuracy_raw_temp),5))
    accuracy_logratio.append(np.round(np.mean(accuracy_logratio_temp),5))
    f1_compositional.append(np.round(np.mean(f1_compositional_temp,axis=0),5))
    f1_raw.append(np.round(np.mean(f1_raw_temp,axis=0),5))
    f1_logratio.append(np.round(np.mean(f1_logratio_temp,axis=0),5))

r2_compositional_tot_gb.append(np.round(np.mean(r2_compositional,axis=0),5))
r2_raw_tot_gb.append(np.round(np.mean(r2_raw,axis=0),5))
r2_logratio_tot_gb.append(np.round(np.mean(r2_logratio,axis=0),5))
logloss_compositional_tot_gb.append(np.round(np.mean(logloss_compositional),5))
logloss_raw_tot_gb.append(np.round(np.mean(logloss_raw),5))
logloss_logratio_tot_gb.append(np.round(np.mean(logloss_logratio),5))
rmse_compositional_tot_gb.append(np.round(np.mean(rmse_compositional),5))
rmse_raw_tot_gb.append(np.round(np.mean(rmse_raw),5))
rmse_logratio_tot_gb.append(np.round(np.mean(rmse_logratio),5))
accuracy_compositional_tot_gb.append(np.round(np.mean(accuracy_compositional),5))
accuracy_raw_tot_gb.append(np.round(np.mean(accuracy_raw),5))
accuracy_logratio_tot_gb.append(np.round(np.mean(accuracy_logratio),5))
f1_compositional_tot_gb.append(np.round(np.mean(f1_compositional,axis=0),5))
f1_raw_tot_gb.append(np.round(np.mean(f1_raw,axis=0),5))
f1_logratio_tot_gb.append(np.round(np.mean(f1_logratio,axis=0),5))

r2_compositional_std_gb.append(np.round(np.std(r2_compositional,axis=0),5))
r2_raw_std_gb.append(np.round(np.std(r2_raw,axis=0),5))
r2_logratio_std_gb.append(np.round(np.std(r2_logratio,axis=0),5))
logloss_compositional_std_gb.append(np.round(np.std(logloss_compositional),5))
logloss_raw_std_gb.append(np.round(np.std(logloss_raw),5))
logloss_logratio_std_gb.append(np.round(np.std(logloss_logratio),5))
rmse_compositional_std_gb.append(np.round(np.std(rmse_compositional),5))
rmse_raw_std_gb.append(np.round(np.std(rmse_raw),5))
rmse_logratio_std_gb.append(np.round(np.std(rmse_logratio),5))
accuracy_compositional_std_gb.append(np.round(np.std(accuracy_compositional),5))
accuracy_raw_std_gb.append(np.round(np.std(accuracy_raw),5))
accuracy_logratio_std_gb.append(np.round(np.std(accuracy_logratio),5))
f1_compositional_std_gb.append(np.round(np.std(f1_compositional,axis=0),5))
f1_raw_std_gb.append(np.round(np.std(f1_raw,axis=0),5))
f1_logratio_std_gb.append(np.round(np.std(f1_logratio,axis=0),5))

In [439]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'RESULTS ON TECATOR DATA WITH GRADIENT BOOSTING TREE'
pt.add_row(['Raw',accuracy_raw_tot_gb[0], logloss_raw_tot_gb[0], np.round(np.mean(f1_raw_tot_gb),5), rmse_raw_tot_gb[0], np.round(np.mean(r2_raw_tot_gb),5)])
pt.add_row(['Compositional',accuracy_compositional_tot_gb[0], logloss_compositional_tot_gb[0], np.round(np.mean(f1_compositional_tot_gb),5), rmse_compositional_tot_gb[0], np.round(np.mean(r2_compositional_tot_gb),5)])
pt.add_row(['Logratio',accuracy_logratio_tot_gb[0], logloss_logratio_tot_gb[0], np.round(np.mean(f1_logratio_tot_gb),5), rmse_logratio_tot_gb[0], np.round(np.mean(r2_logratio_tot_gb),5)])

print(pt)

+-------------------------------------------------------------------------+
|           RESULTS ON TECATOR DATA WITH GRADIENT BOOSTING TREE           |
+---------------+----------+---------------+----------+---------+---------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2   |
+---------------+----------+---------------+----------+---------+---------+
|      Raw      | 0.93217  |    0.86032    | 0.70088  | 0.04584 | 0.71739 |
| Compositional | 0.95737  |    0.85992    | 0.83409  | 0.04393 | 0.72921 |
|    Logratio   | 0.95687  |    0.85991    | 0.82982  | 0.04382 | 0.72978 |
+---------------+----------+---------------+----------+---------+---------+


In [454]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'STD ON TECATOR DATA WITH GRADIENT BOOSTING TREE'
pt.add_row(['Raw',accuracy_raw_std_gb[0], logloss_raw_std_gb[0], np.round(np.mean(f1_raw_std_gb),5), rmse_raw_std_gb[0], np.round(np.mean(r2_raw_std_gb),5)])
pt.add_row(['Compositional',accuracy_compositional_std_gb[0], logloss_compositional_std_gb[0], np.round(np.mean(f1_compositional_std_gb),5), rmse_compositional_std_gb[0], np.round(np.mean(r2_compositional_std_gb),5)])
pt.add_row(['Logratio',accuracy_logratio_std_gb[0], logloss_logratio_std_gb[0], np.round(np.mean(f1_logratio_std_gb),5), rmse_logratio_std_gb[0], np.round(np.mean(r2_logratio_std_gb),5)])

print(pt)

+-------------------------------------------------------------------------+
|             STD ON TECATOR DATA WITH GRADIENT BOOSTING TREE             |
+---------------+----------+---------------+----------+---------+---------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2   |
+---------------+----------+---------------+----------+---------+---------+
|      Raw      | 0.00575  |    0.00116    |  0.042   |  0.0013 | 0.02263 |
| Compositional | 0.00792  |    0.00152    |  0.0459  | 0.00163 | 0.02585 |
|    Logratio   | 0.00804  |    0.00141    | 0.04597  | 0.00167 | 0.02668 |
+---------------+----------+---------------+----------+---------+---------+


In [440]:
print(ttest_ind(accuracy_raw,accuracy_compositional))
print(ttest_ind(accuracy_raw,accuracy_logratio))

print(ttest_ind(logloss_raw,logloss_compositional))
print(ttest_ind(logloss_raw,logloss_logratio))

print(ttest_ind(rmse_raw,rmse_compositional))
print(ttest_ind(rmse_raw,rmse_logratio))

print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_compositional,axis=1)))
print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_logratio,axis=1)))

print(ttest_ind(np.mean(f1_raw,axis=1),np.mean(f1_compositional,axis=1)))
print(ttest_ind(np.mean(f1_raw,axis=1),np.mean(f1_logratio,axis=1)))

Ttest_indResult(statistic=-25.62298298982579, pvalue=8.672155738269694e-65)
Ttest_indResult(statistic=-24.867728167027167, pvalue=8.015573986360687e-63)
Ttest_indResult(statistic=2.073059172330889, pvalue=0.0394613456387005)
Ttest_indResult(statistic=2.225164276997334, pvalue=0.0271987291085655)
Ttest_indResult(statistic=9.108360461503038, pvalue=9.260034851331745e-17)
Ttest_indResult(statistic=9.482608266026887, pvalue=7.999678225365742e-18)
Ttest_indResult(statistic=-4.019823234658909, pvalue=8.274579392418546e-05)
Ttest_indResult(statistic=-4.125454311054905, pvalue=5.445328809763592e-05)
Ttest_indResult(statistic=-21.643383388029434, pvalue=4.4306211120528716e-54)
Ttest_indResult(statistic=-20.919722281480716, pvalue=4.846477892261697e-52)


# With NN

In [442]:
%%time
list_iter = np.arange(100)
with Pool(10) as pool:
    all_res_parallel_nn = pool.map(partial(performances_evaluation.eval_perf_nn, X=X, Y=Y, k_folds=10), list_iter)

Wall time: 11min 31s


In [443]:
r2_compositional_tot_nn, r2_raw_tot_nn, r2_logratio_tot_nn = [],[],[]
logloss_compositional_tot_nn, logloss_raw_tot_nn, logloss_logratio_tot_nn = [], [], []
rmse_compositional_tot_nn, rmse_raw_tot_nn, rmse_logratio_tot_nn = [], [], []
accuracy_compositional_tot_nn, accuracy_raw_tot_nn, accuracy_logratio_tot_nn = [], [], []
f1_compositional_tot_nn, f1_raw_tot_nn, f1_logratio_tot_nn = [], [], []

r2_compositional_std_nn, r2_raw_std_nn, r2_logratio_std_nn = [], [], []
logloss_compositional_std_nn, logloss_raw_std_nn, logloss_logratio_std_nn = [], [], []
rmse_compositional_std_nn, rmse_raw_std_nn, rmse_logratio_std_nn = [], [], []
accuracy_compositional_std_nn, accuracy_raw_std_nn, accuracy_logratio_std_nn = [], [], []
f1_compositional_std_nn, f1_raw_std_nn, f1_logratio_std_nn = [], [], []

r2_compositional, r2_raw, r2_logratio = [], [], []
logloss_compositional, logloss_raw, logloss_logratio = [], [], []
rmse_compositional, rmse_raw, rmse_logratio = [], [], []
accuracy_compositional, accuracy_raw, accuracy_logratio = [], [], []
f1_compositional, f1_raw, f1_logratio = [], [], []

In [444]:
for i in range(len(all_res_parallel_nn)):
    r2_compositional_temp, r2_raw_temp, r2_logratio_temp, logloss_compositional_temp, logloss_raw_temp, logloss_logratio_temp, rmse_compositional_temp, rmse_raw_temp, rmse_logratio_temp, accuracy_compositional_temp, accuracy_raw_temp, accuracy_logratio_temp, f1_compositional_temp, f1_raw_temp, f1_logratio_temp = all_res_parallel_nn[i]
    r2_compositional.append(np.round(np.mean(r2_compositional_temp,axis=0),5))
    r2_raw.append(np.round(np.mean(r2_raw_temp,axis=0),5))
    r2_logratio.append(np.round(np.mean(r2_logratio_temp,axis=0),5))
    logloss_compositional.append(np.round(np.mean(logloss_compositional_temp),5))
    logloss_raw.append(np.round(np.mean(logloss_raw_temp),5))
    logloss_logratio.append(np.round(np.mean(logloss_logratio_temp),5))
    rmse_compositional.append(np.round(np.mean(rmse_compositional_temp),5))
    rmse_raw.append(np.round(np.mean(rmse_raw_temp),5))
    rmse_logratio.append(np.round(np.mean(rmse_logratio_temp),5))
    accuracy_compositional.append(np.round(np.mean(accuracy_compositional_temp),5))
    accuracy_raw.append(np.round(np.mean(accuracy_raw_temp),5))
    accuracy_logratio.append(np.round(np.mean(accuracy_logratio_temp),5))
    f1_compositional.append(np.round(np.mean(f1_compositional_temp,axis=0),5))
    f1_raw.append(np.round(np.mean(f1_raw_temp,axis=0),5))
    f1_logratio.append(np.round(np.mean(f1_logratio_temp,axis=0),5))

r2_compositional_tot_nn.append(np.round(np.mean(r2_compositional,axis=0),5))
r2_raw_tot_nn.append(np.round(np.mean(r2_raw,axis=0),5))
r2_logratio_tot_nn.append(np.round(np.mean(r2_logratio,axis=0),5))
logloss_compositional_tot_nn.append(np.round(np.mean(logloss_compositional),5))
logloss_raw_tot_nn.append(np.round(np.mean(logloss_raw),5))
logloss_logratio_tot_nn.append(np.round(np.mean(logloss_logratio),5))
rmse_compositional_tot_nn.append(np.round(np.mean(rmse_compositional),5))
rmse_raw_tot_nn.append(np.round(np.mean(rmse_raw),5))
rmse_logratio_tot_nn.append(np.round(np.mean(rmse_logratio),5))
accuracy_compositional_tot_nn.append(np.round(np.mean(accuracy_compositional),5))
accuracy_raw_tot_nn.append(np.round(np.mean(accuracy_raw),5))
accuracy_logratio_tot_nn.append(np.round(np.mean(accuracy_logratio),5))
f1_compositional_tot_nn.append(np.round(np.mean(f1_compositional,axis=0),5))
f1_raw_tot_nn.append(np.round(np.mean(f1_raw,axis=0),5))
f1_logratio_tot_nn.append(np.round(np.mean(f1_logratio,axis=0),5))

r2_compositional_std_nn.append(np.round(np.std(r2_compositional,axis=0),5))
r2_raw_std_nn.append(np.round(np.std(r2_raw,axis=0),5))
r2_logratio_std_nn.append(np.round(np.std(r2_logratio,axis=0),5))
logloss_compositional_std_nn.append(np.round(np.std(logloss_compositional),5))
logloss_raw_std_nn.append(np.round(np.std(logloss_raw),5))
logloss_logratio_std_nn.append(np.round(np.std(logloss_logratio),5))
rmse_compositional_std_nn.append(np.round(np.std(rmse_compositional),5))
rmse_raw_std_nn.append(np.round(np.std(rmse_raw),5))
rmse_logratio_std_nn.append(np.round(np.std(rmse_logratio),5))
accuracy_compositional_std_nn.append(np.round(np.std(accuracy_compositional),5))
accuracy_raw_std_nn.append(np.round(np.std(accuracy_raw),5))
accuracy_logratio_std_nn.append(np.round(np.std(accuracy_logratio),5))
f1_compositional_std_nn.append(np.round(np.std(f1_compositional,axis=0),5))
f1_raw_std_nn.append(np.round(np.std(f1_raw,axis=0),5))
f1_logratio_std_nn.append(np.round(np.std(f1_logratio,axis=0),5))

In [445]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'RESULTS ON TECATOR DATA WITH NEURAL NETWORK'
pt.add_row(['Raw',accuracy_raw_tot_nn[0], logloss_raw_tot_nn[0], np.round(np.mean(f1_raw_tot_nn),5), rmse_raw_tot_nn[0], np.round(np.mean(r2_raw_tot_nn),5)])
pt.add_row(['Compositional',accuracy_compositional_tot_nn[0], logloss_compositional_tot_nn[0], np.round(np.mean(f1_compositional_tot_nn),5), rmse_compositional_tot_nn[0], np.round(np.mean(r2_compositional_tot_nn),5)])
pt.add_row(['Logratio',accuracy_logratio_tot_nn[0], logloss_logratio_tot_nn[0], np.round(np.mean(f1_logratio_tot_nn),5), rmse_logratio_tot_nn[0], np.round(np.mean(r2_logratio_tot_nn),5)])

print(pt)

+--------------------------------------------------------------------------+
|               RESULTS ON TECATOR DATA WITH NEURAL NETWORK                |
+---------------+----------+---------------+----------+---------+----------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2    |
+---------------+----------+---------------+----------+---------+----------+
|      Raw      | 0.90833  |    0.92784    | 0.51157  | 0.11315 | -1.23005 |
| Compositional | 0.90441  |    0.93732    | 0.51167  | 0.12246 | -1.15782 |
|    Logratio   | 0.90391  |    0.93757    | 0.51334  | 0.12242 | -1.15553 |
+---------------+----------+---------------+----------+---------+----------+


In [451]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'STD ON TECATOR DATA WITH NEURAL NETWORK'
pt.add_row(['Raw',accuracy_raw_std_nn[0], logloss_raw_std_nn[0], np.round(np.mean(f1_raw_std_nn),5), rmse_raw_std_nn[0], np.round(np.mean(r2_raw_std_nn),5)])
pt.add_row(['Compositional',accuracy_compositional_std_nn[0], logloss_compositional_std_nn[0], np.round(np.mean(f1_compositional_std_nn),5), rmse_compositional_std_nn[0], np.round(np.mean(r2_compositional_std_nn),5)])
pt.add_row(['Logratio',accuracy_logratio_std_nn[0], logloss_logratio_std_nn[0], np.round(np.mean(f1_logratio_std_nn),5), rmse_logratio_std_nn[0], np.round(np.mean(r2_logratio_std_nn),5)])

print(pt)

+-------------------------------------------------------------------------+
|                 STD ON TECATOR DATA WITH NEURAL NETWORK                 |
+---------------+----------+---------------+----------+---------+---------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2   |
+---------------+----------+---------------+----------+---------+---------+
|      Raw      |   0.0    |    0.00888    | 0.03557  | 0.00455 | 0.48424 |
| Compositional | 0.01561  |    0.01025    | 0.04446  | 0.00714 |  0.4658 |
|    Logratio   | 0.01374  |    0.01036    | 0.04357  | 0.00709 | 0.44906 |
+---------------+----------+---------------+----------+---------+---------+


In [446]:
print(ttest_ind(accuracy_raw,accuracy_compositional))
print(ttest_ind(accuracy_raw,accuracy_logratio))

Ttest_indResult(statistic=2.4962812636535463, pvalue=0.013366538888636223)
Ttest_indResult(statistic=3.1976217697509752, pvalue=0.0016133137695019174)


  print(ttest_ind(accuracy_raw,accuracy_compositional))
  print(ttest_ind(accuracy_raw,accuracy_logratio))


In [447]:
print(ttest_ind(logloss_raw,logloss_compositional))
print(ttest_ind(logloss_raw,logloss_logratio))

Ttest_indResult(statistic=-6.960603897528682, pvalue=4.858596394217011e-11)
Ttest_indResult(statistic=-7.098901374546781, pvalue=2.1983758932171904e-11)


In [448]:
print(ttest_ind(rmse_raw,rmse_compositional))
print(ttest_ind(rmse_raw,rmse_logratio))

Ttest_indResult(statistic=-10.942304998546849, pvalue=4.236479719387598e-22)
Ttest_indResult(statistic=-10.956154287288452, pvalue=3.8516983819746614e-22)


In [449]:
print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_compositional,axis=1)))
print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_logratio,axis=1)))

Ttest_indResult(statistic=-1.385454881414254, pvalue=0.1674721242509481)
Ttest_indResult(statistic=-1.4578657076340034, pvalue=0.14646184548533367)


In [450]:
print(ttest_ind(np.mean(f1_raw,axis=1),np.mean(f1_compositional,axis=1)))
print(ttest_ind(np.mean(f1_raw,axis=1),np.mean(f1_logratio,axis=1)))

Ttest_indResult(statistic=-0.018915206468209864, pvalue=0.9849277957474469)
Ttest_indResult(statistic=-0.3458458377063507, pvalue=0.7298257623857743)


# With Dirichlet

With 100 features : 45min to make 5 iterations (with 5 threads and 10-folds). We thus take the PCA to reduce the number of features.

In [26]:
X_pca = np.array(df[df.columns[100:-3]])

In [58]:
%%time
list_iter = np.arange(50,100)
with Pool(10) as pool:
    all_res_parallel_dirichlet = pool.map(partial(performances_evaluation.eval_perf_dirichlet, X=X_pca, Y=Y, k_folds=10), list_iter)

Wall time: 32min 50s


In [46]:
r2_compositional_tot_dirichlet, r2_raw_tot_dirichlet, r2_logratio_tot_dirichlet = [],[],[]
logloss_compositional_tot_dirichlet, logloss_raw_tot_dirichlet, logloss_logratio_tot_dirichlet = [], [], []
rmse_compositional_tot_dirichlet, rmse_raw_tot_dirichlet, rmse_logratio_tot_dirichlet = [], [], []
accuracy_compositional_tot_dirichlet, accuracy_raw_tot_dirichlet, accuracy_logratio_tot_dirichlet = [], [], []
f1_compositional_tot_dirichlet, f1_raw_tot_dirichlet, f1_logratio_tot_dirichlet = [], [], []

r2_compositional_std_dirichlet, r2_raw_std_dirichlet, r2_logratio_std_dirichlet = [], [], []
logloss_compositional_std_dirichlet, logloss_raw_std_dirichlet, logloss_logratio_std_dirichlet = [], [], []
rmse_compositional_std_dirichlet, rmse_raw_std_dirichlet, rmse_logratio_std_dirichlet = [], [], []
accuracy_compositional_std_dirichlet, accuracy_raw_std_dirichlet, accuracy_logratio_std_dirichlet = [], [], []
f1_compositional_std_dirichlet, f1_raw_std_dirichlet, f1_logratio_std_dirichlet = [], [], []

In [59]:
for i in range(len(all_res_parallel_dirichlet)):
    r2_compositional_temp, r2_raw_temp, r2_logratio_temp, logloss_compositional_temp, logloss_raw_temp, logloss_logratio_temp, rmse_compositional_temp, rmse_raw_temp, rmse_logratio_temp, accuracy_compositional_temp, accuracy_raw_temp, accuracy_logratio_temp, f1_compositional_temp, f1_raw_temp, f1_logratio_temp = all_res_parallel_dirichlet[i]
    r2_compositional.append(np.round(np.mean(r2_compositional_temp,axis=0),5))
    r2_raw.append(np.round(np.mean(r2_raw_temp,axis=0),5))
    r2_logratio.append(np.round(np.mean(r2_logratio_temp,axis=0),5))
    logloss_compositional.append(np.round(np.mean(logloss_compositional_temp),5))
    logloss_raw.append(np.round(np.mean(logloss_raw_temp),5))
    logloss_logratio.append(np.round(np.mean(logloss_logratio_temp),5))
    rmse_compositional.append(np.round(np.mean(rmse_compositional_temp),5))
    rmse_raw.append(np.round(np.mean(rmse_raw_temp),5))
    rmse_logratio.append(np.round(np.mean(rmse_logratio_temp),5))
    accuracy_compositional.append(np.round(np.mean(accuracy_compositional_temp),5))
    accuracy_raw.append(np.round(np.mean(accuracy_raw_temp),5))
    accuracy_logratio.append(np.round(np.mean(accuracy_logratio_temp),5))
    f1_compositional.append(np.round(np.mean([np.mean(i) for i in f1_compositional_temp]),5))
    f1_raw.append(np.round(np.mean([np.mean(i) for i in f1_raw_temp]),5))
    f1_logratio.append(np.round(np.mean([np.mean(i) for i in f1_logratio_temp]),5))

r2_compositional_tot_dirichlet.append(np.round(np.mean(r2_compositional,axis=0),5))
r2_raw_tot_dirichlet.append(np.round(np.mean(r2_raw,axis=0),5))
r2_logratio_tot_dirichlet.append(np.round(np.mean(r2_logratio,axis=0),5))
logloss_compositional_tot_dirichlet.append(np.round(np.mean(logloss_compositional),5))
logloss_raw_tot_dirichlet.append(np.round(np.mean(logloss_raw),5))
logloss_logratio_tot_dirichlet.append(np.round(np.mean(logloss_logratio),5))
rmse_compositional_tot_dirichlet.append(np.round(np.mean(rmse_compositional),5))
rmse_raw_tot_dirichlet.append(np.round(np.mean(rmse_raw),5))
rmse_logratio_tot_dirichlet.append(np.round(np.mean(rmse_logratio),5))
accuracy_compositional_tot_dirichlet.append(np.round(np.mean(accuracy_compositional),5))
accuracy_raw_tot_dirichlet.append(np.round(np.mean(accuracy_raw),5))
accuracy_logratio_tot_dirichlet.append(np.round(np.mean(accuracy_logratio),5))
f1_compositional_tot_dirichlet.append(np.round(np.mean(f1_compositional,axis=0),5))
f1_raw_tot_dirichlet.append(np.round(np.mean(f1_raw,axis=0),5))
f1_logratio_tot_dirichlet.append(np.round(np.mean(f1_logratio,axis=0),5))

r2_compositional_std_dirichlet.append(np.round(np.std(r2_compositional,axis=0),5))
r2_raw_std_dirichlet.append(np.round(np.std(r2_raw,axis=0),5))
r2_logratio_std_dirichlet.append(np.round(np.std(r2_logratio,axis=0),5))
logloss_compositional_std_dirichlet.append(np.round(np.std(logloss_compositional),5))
logloss_raw_std_dirichlet.append(np.round(np.std(logloss_raw),5))
logloss_logratio_std_dirichlet.append(np.round(np.std(logloss_logratio),5))
rmse_compositional_std_dirichlet.append(np.round(np.std(rmse_compositional),5))
rmse_raw_std_dirichlet.append(np.round(np.std(rmse_raw),5))
rmse_logratio_std_dirichlet.append(np.round(np.std(rmse_logratio),5))
accuracy_compositional_std_dirichlet.append(np.round(np.std(accuracy_compositional),5))
accuracy_raw_std_dirichlet.append(np.round(np.std(accuracy_raw),5))
accuracy_logratio_std_dirichlet.append(np.round(np.std(accuracy_logratio),5))
f1_compositional_std_dirichlet.append(np.round(np.std(f1_compositional,axis=0),5))
f1_raw_std_dirichlet.append(np.round(np.std(f1_raw,axis=0),5))
f1_logratio_std_dirichlet.append(np.round(np.std(f1_logratio,axis=0),5))

In [60]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'RESULTS ON TECATOR DATA WITH DIRICHLET MODEL'
pt.add_row(['Raw',accuracy_raw_tot_dirichlet[0], logloss_raw_tot_dirichlet[0], np.round(np.mean(f1_raw_tot_dirichlet),5), rmse_raw_tot_dirichlet[0], np.round(np.mean(r2_raw_tot_dirichlet),5)])
pt.add_row(['Compositional',accuracy_compositional_tot_dirichlet[0], logloss_compositional_tot_dirichlet[0], np.round(np.mean(f1_compositional_tot_dirichlet),5), rmse_compositional_tot_dirichlet[0], np.round(np.mean(r2_compositional_tot_dirichlet),5)])
pt.add_row(['Logratio',accuracy_logratio_tot_dirichlet[0], logloss_logratio_tot_dirichlet[0], np.round(np.mean(f1_logratio_tot_dirichlet),5), rmse_logratio_tot_dirichlet[0], np.round(np.mean(r2_logratio_tot_dirichlet),5)])

print(pt)

+-------------------------------------------------------------------------+
|               RESULTS ON TECATOR DATA WITH DIRICHLET MODEL              |
+---------------+----------+---------------+----------+---------+---------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2   |
+---------------+----------+---------------+----------+---------+---------+
|      Raw      | 0.95475  |    0.85184    | 0.84568  | 0.04759 | 0.70751 |
| Compositional | 0.93975  |    0.87708    | 0.80188  |  0.0721 | 0.32287 |
|    Logratio   | 0.93983  |    0.87762    | 0.80027  | 0.07234 |  0.3101 |
+---------------+----------+---------------+----------+---------+---------+


In [61]:
columns_names = ['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2']
rows_names = ['Raw', 'Compositional', 'Logratio']

pt = PrettyTable(['Model', 'Accuracy', 'Cross-entropy', 'F1-score', 'RMSE', 'R2'])
pt.title = 'STD ON TECATOR DATA WITH DIRICHLET MODEL'
pt.add_row(['Raw',accuracy_raw_std_dirichlet[0], logloss_raw_std_dirichlet[0], np.round(np.mean(f1_raw_std_dirichlet),5), rmse_raw_std_dirichlet[0], np.round(np.mean(r2_raw_std_dirichlet),5)])
pt.add_row(['Compositional',accuracy_compositional_std_dirichlet[0], logloss_compositional_std_dirichlet[0], np.round(np.mean(f1_compositional_std_dirichlet),5), rmse_compositional_std_dirichlet[0], np.round(np.mean(r2_compositional_std_dirichlet),5)])
pt.add_row(['Logratio',accuracy_logratio_std_dirichlet[0], logloss_logratio_std_dirichlet[0], np.round(np.mean(f1_logratio_std_dirichlet),5), rmse_logratio_std_dirichlet[0], np.round(np.mean(r2_logratio_std_dirichlet),5)])

print(pt)

+-------------------------------------------------------------------------+
|                 STD ON TECATOR DATA WITH DIRICHLET MODEL                |
+---------------+----------+---------------+----------+---------+---------+
|     Model     | Accuracy | Cross-entropy | F1-score |   RMSE  |    R2   |
+---------------+----------+---------------+----------+---------+---------+
|      Raw      | 0.00682  |    0.00278    | 0.03713  | 0.00349 | 0.04473 |
| Compositional | 0.01081  |    0.00475    |  0.0374  |  0.005  |  0.129  |
|    Logratio   | 0.01112  |    0.00569    | 0.04352  | 0.00579 | 0.17616 |
+---------------+----------+---------------+----------+---------+---------+


In [62]:
print(ttest_ind(accuracy_raw,accuracy_compositional))
print(ttest_ind(accuracy_raw,accuracy_logratio))

Ttest_indResult(statistic=10.903269328244843, pvalue=5.53984189090763e-22)
Ttest_indResult(statistic=10.137365512998015, pvalue=1.0170512912545477e-19)


In [63]:
print(ttest_ind(logloss_raw,logloss_compositional))
print(ttest_ind(logloss_raw,logloss_logratio))

Ttest_indResult(statistic=-45.37176198624103, pvalue=1.414826306224262e-106)
Ttest_indResult(statistic=-38.6211988844346, pvalue=3.9733887908369947e-94)


In [64]:
print(ttest_ind(rmse_raw,rmse_compositional))
print(ttest_ind(rmse_raw,rmse_logratio))

Ttest_indResult(statistic=-43.50301968714521, pvalue=2.749567574485415e-103)
Ttest_indResult(statistic=-38.02246008890222, pvalue=6.079386002033005e-93)


In [65]:
print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_compositional,axis=1)))
print(ttest_ind(np.mean(r2_raw,axis=1),np.mean(r2_logratio,axis=1)))

Ttest_indResult(statistic=33.665312860045546, pvalue=7.119674559250658e-84)
Ttest_indResult(statistic=23.131895193113607, pvalue=3.4691638663232745e-58)


In [66]:
print(ttest_ind(f1_raw,f1_compositional))
print(ttest_ind(f1_raw,f1_logratio))

Ttest_indResult(statistic=7.885254577757955, pvalue=2.067326969283283e-13)
Ttest_indResult(statistic=7.338862078190062, pvalue=5.4405332831564244e-12)
