In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

from functions.eval import *
from functions.cbi import *
from functions.utils import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
nruns = 10
imputer = MissForest(criterion = ('mse', 'gini'), random_state = 0)
missing_rates = np.arange(5)*0.2
test_size = 0.4

In [3]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data',
                  sep = ",")
data = data.drop(['name'], axis = 1)
X, y = data.drop(['status'], axis = 1), data['status']
print(np.array([sum(y==0), sum(y==1)]))
print(X.shape)
print(X.head())
X = X.to_numpy()
y = np.asarray(y)

[ 48 147]
(195, 22)
   MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  MDVP:Jitter(Abs)  \
0      119.992       157.302        74.997         0.00784           0.00007   
1      122.400       148.650       113.819         0.00968           0.00008   
2      116.682       131.111       111.555         0.01050           0.00009   
3      116.676       137.871       111.366         0.00997           0.00009   
4      116.014       141.781       110.655         0.01284           0.00011   

   MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  MDVP:Shimmer(dB)  ...  \
0   0.00370   0.00554     0.01109       0.04374             0.426  ...   
1   0.00465   0.00696     0.01394       0.06134             0.626  ...   
2   0.00544   0.00781     0.01633       0.05233             0.482  ...   
3   0.00502   0.00698     0.01505       0.05492             0.517  ...   
4   0.00655   0.00908     0.01966       0.06425             0.584  ...   

   MDVP:APQ  Shimmer:DDA      NHR     HNR      RPDE   

In [4]:
res_all = []
for missing_rate in missing_rates:
    res = np.array([one_run(X, y, imputer = imputer, classifier = RandomForestClassifier(),
                test_size = test_size, missing_rate = missing_rate, test_missing = True) for i in range(nruns)])
    res = np.array([np.mean(res, axis = 0), np.std(res, axis =0)]).flatten().round(3)
    print("result at missing rate", missing_rate, "\n", res)
    res_all.append(res)   

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
result at missing rate 0.0 
 [0.896 0.903 0.368 0.092 0.027 0.032 0.006 0.003]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 

Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7

In [5]:
print(res_all)
res_latex = show_latex_results(res_all, missing_rates)
print("results on the parkinson dataset when missing data presents in the test set")
print(res_latex.to_latex(index = True, 
                       formatters = {"name": str.upper},
                      float_format = "{.1f}".format))
res_latex

[array([0.896, 0.903, 0.368, 0.092, 0.027, 0.032, 0.006, 0.003]), array([8.7700e-01, 8.6900e-01, 1.5456e+01, 2.7058e+01, 2.4000e-02,
       3.8000e-02, 4.6900e+00, 5.6670e+00]), array([ 0.862,  0.858, 18.927, 26.745,  0.045,  0.043,  3.824,  2.528]), array([ 0.795,  0.813, 18.949, 22.418,  0.035,  0.039,  4.077,  5.805]), array([ 0.778,  0.769, 18.711, 27.507,  0.037,  0.06 ,  4.516,  5.776])]
results on the parkinson dataset when missing data presents in the test set
\begin{tabular}{lllll}
\toprule
{} & \multicolumn{2}{l}{accuracy} & \multicolumn{2}{l}{running time} \\
missing rate &              CBI &             IClf &               CBI &              IClf \\
\midrule
0.0               &  0.896\$\textbackslash pm\$0.027 &  0.903\$\textbackslash pm\$0.032 &   0.368\$\textbackslash pm\$0.006 &   0.092\$\textbackslash pm\$0.003 \\
20.0              &  0.877\$\textbackslash pm\$0.024 &  0.869\$\textbackslash pm\$0.038 &   15.456\$\textbackslash pm\$4.69 &  27.058\$\textbackslash pm\$5.6

Unnamed: 0_level_0,accuracy,accuracy,running time,running time
missing rate,CBI,IClf,CBI,IClf
0.0,0.896$\pm$0.027,0.903$\pm$0.032,0.368$\pm$0.006,0.092$\pm$0.003
20.0,0.877$\pm$0.024,0.869$\pm$0.038,15.456$\pm$4.69,27.058$\pm$5.667
40.0,0.862$\pm$0.045,0.858$\pm$0.043,18.927$\pm$3.824,26.745$\pm$2.528
60.00000000000001,0.795$\pm$0.035,0.813$\pm$0.039,18.949$\pm$4.077,22.418$\pm$5.805
80.0,0.778$\pm$0.037,0.769$\pm$0.06,18.711$\pm$4.516,27.507$\pm$5.776


In [6]:
res_all = []
for missing_rate in missing_rates:
    res = np.array([one_run(X, y, imputer = imputer, classifier = RandomForestClassifier(),
                test_size = test_size, missing_rate = missing_rate, test_missing = False) for i in range(nruns)])
    res = np.array([np.mean(res, axis = 0), np.std(res, axis =0)]).flatten().round(3)
    print("result at missing rate", missing_rate, "\n", res)
    res_all.append(res)   

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
result at missing rate 0.0 
 [0.904 0.895 0.368 0.092 0.037 0.045 0.006 0.003]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 

In [7]:
print(res_all)
res_latex = show_latex_results(res_all, missing_rates)
print("results on the parkinson dataset when the test set is fully observed")
print(res_latex.to_latex(index = True, 
                       formatters = {"name": str.upper},
                      float_format = "{.1f}".format))
res_latex

[array([0.904, 0.895, 0.368, 0.092, 0.037, 0.045, 0.006, 0.003]), array([ 0.863,  0.871, 16.72 , 15.741,  0.041,  0.05 ,  3.905,  4.267]), array([ 0.865,  0.862, 18.888, 13.52 ,  0.026,  0.033,  4.622,  1.636]), array([ 0.873,  0.847, 16.582, 10.715,  0.032,  0.045,  5.786,  3.629]), array([ 0.79 ,  0.799, 19.093, 13.792,  0.071,  0.05 ,  5.147,  5.38 ])]
results on the parkinson dataset when the test set is fully observed
\begin{tabular}{lllll}
\toprule
{} & \multicolumn{2}{l}{accuracy} & \multicolumn{2}{l}{running time} \\
missing rate &              CBI &             IClf &               CBI &              IClf \\
\midrule
0.0               &  0.904\$\textbackslash pm\$0.037 &  0.895\$\textbackslash pm\$0.045 &   0.368\$\textbackslash pm\$0.006 &   0.092\$\textbackslash pm\$0.003 \\
20.0              &  0.863\$\textbackslash pm\$0.041 &   0.871\$\textbackslash pm\$0.05 &   16.72\$\textbackslash pm\$3.905 &  15.741\$\textbackslash pm\$4.267 \\
40.0              &  0.865\$\textbacksla

Unnamed: 0_level_0,accuracy,accuracy,running time,running time
missing rate,CBI,IClf,CBI,IClf
0.0,0.904$\pm$0.037,0.895$\pm$0.045,0.368$\pm$0.006,0.092$\pm$0.003
20.0,0.863$\pm$0.041,0.871$\pm$0.05,16.72$\pm$3.905,15.741$\pm$4.267
40.0,0.865$\pm$0.026,0.862$\pm$0.033,18.888$\pm$4.622,13.52$\pm$1.636
60.00000000000001,0.873$\pm$0.032,0.847$\pm$0.045,16.582$\pm$5.786,10.715$\pm$3.629
80.0,0.79$\pm$0.071,0.799$\pm$0.05,19.093$\pm$5.147,13.792$\pm$5.38
