In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

from functions.eval import *
from functions.cbi import *
from functions.utils import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
nruns = 10
imputer = MissForest(criterion = ('mse', 'gini'), random_state = 0)
missing_rates = np.arange(5)*0.2
test_size = 0.4

In [3]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data',
                  sep = ",", header = None)
print(data.head())
data = data.to_numpy()
X,y = data[:, [x for x in range(data.shape[1]) if x != 6]].astype(np.float32),data[:,-1]
G = len(np.unique(y))
le2 = LabelEncoder()
y = le2.fit_transform(y)
for g in range(G):
  print(sum(y==g))

X.shape

    0   1   2   3   4    5  6
0  85  92  45  27  31  0.0  1
1  85  64  59  32  23  0.0  2
2  86  54  33  16  54  0.0  2
3  91  78  34  24  36  0.0  2
4  87  70  12  28  10  0.0  2
145
200


(345, 6)

In [4]:
res_all = []
for missing_rate in missing_rates:
    res = np.array([one_run(X, y, imputer = imputer, classifier = RandomForestClassifier(),
                test_size = test_size, missing_rate = missing_rate, test_missing = True) for i in range(nruns)])
    res = np.array([np.mean(res, axis = 0), np.std(res, axis =0)]).flatten().round(3)
    print("result at missing rate", missing_rate, "\n", res)
    res_all.append(res)   

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
result at missing rate 0.0 
 [0.7   0.704 0.392 0.102 0.031 0.031 0.019 0.007]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 

Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3

In [5]:
print(res_all)
res_latex = show_latex_results(res_all, missing_rates)
print("results on the liver dataset when missing data presents in the test set")
print(res_latex.to_latex(index = True, 
                       formatters = {"name": str.upper},
                      float_format = "{.1f}".format))
res_latex

[array([0.7  , 0.704, 0.392, 0.102, 0.031, 0.031, 0.019, 0.007]), array([0.649, 0.642, 5.379, 7.094, 0.022, 0.019, 1.778, 1.237]), array([0.587, 0.583, 5.658, 7.662, 0.046, 0.028, 1.421, 1.659]), array([0.541, 0.562, 5.715, 7.502, 0.041, 0.042, 1.594, 1.428]), array([0.554, 0.547, 5.704, 7.774, 0.025, 0.048, 1.685, 1.775])]
results on the liver dataset when missing data presents in the test set
\begin{tabular}{lllll}
\toprule
{} & \multicolumn{2}{l}{accuracy} & \multicolumn{2}{l}{running time} \\
missing rate &              CBI &             IClf &              CBI &             IClf \\
\midrule
0.0               &    0.7\$\textbackslash pm\$0.031 &  0.704\$\textbackslash pm\$0.031 &  0.392\$\textbackslash pm\$0.019 &  0.102\$\textbackslash pm\$0.007 \\
20.0              &  0.649\$\textbackslash pm\$0.022 &  0.642\$\textbackslash pm\$0.019 &  5.379\$\textbackslash pm\$1.778 &  7.094\$\textbackslash pm\$1.237 \\
40.0              &  0.587\$\textbackslash pm\$0.046 &  0.583\$\textbacksla

Unnamed: 0_level_0,accuracy,accuracy,running time,running time
missing rate,CBI,IClf,CBI,IClf
0.0,0.7$\pm$0.031,0.704$\pm$0.031,0.392$\pm$0.019,0.102$\pm$0.007
20.0,0.649$\pm$0.022,0.642$\pm$0.019,5.379$\pm$1.778,7.094$\pm$1.237
40.0,0.587$\pm$0.046,0.583$\pm$0.028,5.658$\pm$1.421,7.662$\pm$1.659
60.00000000000001,0.541$\pm$0.041,0.562$\pm$0.042,5.715$\pm$1.594,7.502$\pm$1.428
80.0,0.554$\pm$0.025,0.547$\pm$0.048,5.704$\pm$1.685,7.774$\pm$1.775


In [6]:
res_all = []
for missing_rate in missing_rates:
    res = np.array([one_run(X, y, imputer = imputer, classifier = RandomForestClassifier(),
                test_size = test_size, missing_rate = missing_rate, test_missing = False) for i in range(nruns)])
    res = np.array([np.mean(res, axis = 0), np.std(res, axis =0)]).flatten().round(3)
    print("result at missing rate", missing_rate, "\n", res)
    res_all.append(res)   

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
result at missing rate 0.0 
 [0.707 0.712 0.395 0.095 0.029 0.035 0.002 0.002]
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 

In [7]:
print(res_all)
res_latex = show_latex_results(res_all, missing_rates)
print("results on the liver dataset when the test set is fully observed")
print(res_latex.to_latex(index = True, 
                       formatters = {"name": str.upper},
                      float_format = "{.1f}".format))
res_latex

[array([0.707, 0.712, 0.395, 0.095, 0.029, 0.035, 0.002, 0.002]), array([0.687, 0.67 , 5.898, 3.664, 0.032, 0.015, 1.426, 0.886]), array([0.664, 0.624, 5.209, 4.223, 0.037, 0.037, 2.157, 0.565]), array([0.646, 0.632, 6.262, 4.115, 0.056, 0.049, 1.813, 1.322]), array([0.568, 0.565, 6.162, 2.558, 0.033, 0.046, 1.465, 1.065])]
results on the liver dataset when the test set is fully observed
\begin{tabular}{lllll}
\toprule
{} & \multicolumn{2}{l}{accuracy} & \multicolumn{2}{l}{running time} \\
missing rate &              CBI &             IClf &              CBI &             IClf \\
\midrule
0.0               &  0.707\$\textbackslash pm\$0.029 &  0.712\$\textbackslash pm\$0.035 &  0.395\$\textbackslash pm\$0.002 &  0.095\$\textbackslash pm\$0.002 \\
20.0              &  0.687\$\textbackslash pm\$0.032 &   0.67\$\textbackslash pm\$0.015 &  5.898\$\textbackslash pm\$1.426 &  3.664\$\textbackslash pm\$0.886 \\
40.0              &  0.664\$\textbackslash pm\$0.037 &  0.624\$\textbackslash pm\$

Unnamed: 0_level_0,accuracy,accuracy,running time,running time
missing rate,CBI,IClf,CBI,IClf
0.0,0.707$\pm$0.029,0.712$\pm$0.035,0.395$\pm$0.002,0.095$\pm$0.002
20.0,0.687$\pm$0.032,0.67$\pm$0.015,5.898$\pm$1.426,3.664$\pm$0.886
40.0,0.664$\pm$0.037,0.624$\pm$0.037,5.209$\pm$2.157,4.223$\pm$0.565
60.00000000000001,0.646$\pm$0.056,0.632$\pm$0.049,6.262$\pm$1.813,4.115$\pm$1.322
80.0,0.568$\pm$0.033,0.565$\pm$0.046,6.162$\pm$1.465,2.558$\pm$1.065
