In [41]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
dry_bean = fetch_ucirepo(id=602) 

In [49]:
# data (as pandas dataframes) 
beans = dry_bean.data.original
beans.head()


Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,5
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,5
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,5
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,5
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,5


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

In [46]:
#labels are characters, but should be integers for sklearn (this is a bad model, but it's just for demonstration)


label_encoder = LabelEncoder()
beans["Class"]  = label_encoder.fit_transform(beans["Class"])

# data (as pandas dataframes) 
y = beans["Class"]
X = beans.drop(columns=["Class"], axis = 1)

#split the data, train size is 10000
X_train, X_rest1, y_train, y_rest1 = train_test_split(X, y, train_size=10000, random_state=2)

#split of the test data
X_test, X_rest2, y_test, y_rest2 = train_test_split(X_rest1, y_rest1, train_size=1000, random_state=2)

#split the remaining calibration and "new" data
X_calib, X_new, y_calib, y_new = train_test_split(X_rest2, y_rest2, train_size=1000, random_state=2)

#fit the model
model = GaussianNB()
model.fit(X_train, y_train)



In [47]:
#plot the results
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

#Check accuracy
y_pred = model.predict(X_test)



In [48]:
print("Accuracy: ", np.mean(y_pred == y_test))

cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(cm, index = label_encoder.classes_, columns = label_encoder.classes_))

#rows are true, columns are predicted
#for example 34ish Cali were classified as Barbunya

Accuracy:  0.773
          BARBUNYA  BOMBAY  CALI  DERMASON  HOROZ  SEKER  SIRA
BARBUNYA        44       0    34         0     10      0     7
BOMBAY           0      37     0         0      0      0     0
CALI            19       0    87         0      7      0     0
DERMASON         0       0     0       210      0     24    12
HOROZ            5       0     9         3    132      0    20
SEKER            0       0     0        19      1    115    12
SIRA             0       0     0        10     13     22   148


# NAIVE APPROACH 
Doesn't lead to correct coverage

In [53]:
#Get probabilities from the model
predicitons = model.predict_proba(X_calib)
high_prob_predictions = np.amax(predicitons, axis = 1)
high_p_beans = np.where(high_prob_predictions >= 0.95)


#count how often the right label is hit
its_a_match = (model.predict(X_calib) == y_calib)
coverage = np.mean(its_a_match.values[high_p_beans])

print("Coverage: ", coverage)

Coverage:  0.9085072231139647


# Conformal Prediction 😎😎

In [57]:
n = len(X_calib)
n

1000

In [58]:
predicitons = model.predict_proba(X_calib)
prob_true_class = predicitons[np.arange(n), y_calib]



In [60]:

#turn into a score 


array([9.99447119e-01, 9.38064603e-01, 9.07403754e-01, 9.70339523e-01,
       9.98870571e-01, 4.18474857e-01, 9.99372113e-01, 9.03566440e-01,
       9.99999981e-01, 9.98353996e-01, 9.98187993e-01, 5.77464036e-01,
       4.40572846e-01, 7.40879140e-01, 9.99999954e-01, 9.77080112e-01,
       5.92638999e-02, 1.00000000e+00, 9.99998135e-01, 9.99981106e-01,
       9.98580375e-01, 9.99756151e-01, 9.77721612e-01, 8.12262027e-07,
       9.40942631e-01, 9.99999997e-01, 9.44207791e-01, 1.78908384e-01,
       8.64279082e-01, 1.07904746e-02, 1.00000000e+00, 7.83195241e-01,
       9.99582166e-01, 9.99999999e-01, 9.95813794e-01, 9.97818176e-01,
       9.99950744e-01, 8.21669088e-01, 6.32100398e-01, 3.18591713e-01,
       1.00000000e+00, 3.96485244e-02, 9.98646874e-01, 1.00000000e+00,
       9.62907987e-01, 1.00000000e+00, 6.88450567e-01, 1.00000000e+00,
       9.99999999e-01, 3.99127349e-02, 8.47082499e-01, 9.99999988e-01,
       1.98302064e-01, 9.99944301e-01, 9.73339782e-01, 3.99816233e-01,
      