In [41]:
import numpy as np
from sklearn.datasets import make_classification, load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import openml
from simple_model import DyadOneHotPairDataset, DyadRankingModel, create_dyads, ConformalPredictor, ConformalRankingPredictor, MCDyadOneHotPairDataset

In [42]:
dataset = openml.datasets.get_dataset(1005)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)

# Automatically identify categorical and numerical columns
categorical_features = X.select_dtypes(
    include=["object", "category"]
).columns.tolist()
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

num_classes = len(np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

# Encode labels
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Preprocessing for numerical data: Impute missing values, then scale
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Preprocessing for categorical data: Impute missing values, then one-hot encode
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train = preprocessor.fit_transform(X_train)


  return func(*args, **kwargs)


In [43]:
from simple_model import ClassifierModel, DyadRankingModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# clf = ClassifierModel(input_dim = X_train.shape[1], hidden_dim=16, output_dim=y.max()+1)
clf = RandomForestClassifier()

cp = ConformalPredictor(clf, alpha=0.05)

if not isinstance(X_train, np.ndarray):
    X_train = X_train.to_numpy()
if not isinstance(y_train, np.ndarray):
    y_train = y_train.to_numpy()
cp.fit(X_train, y_train)
pred_sets_clf = cp.predict_set(X_test)

clf.fit(X_train,y_train)

crp = ConformalRankingPredictor(num_classes=y_train.max()+1, alpha= 0.05)
crp.fit(X_train,y_train, random_state=1, use_cross_isntance_data=True, num_pairs=20000)

# pred_sets_rnk = crp.predict_set(X_test)




[(14, 18), (53, 35), (41, 19), (111, 72), (41, 44), (51, 107), (9, 42), (100, 27), (68, 110), (68, 88), (99, 8), (84, 69), (8, 47), (72, 3), (39, 86), (16, 99), (65, 44), (47, 38), (25, 22), (28, 57), (91, 74), (11, 3), (77, 96), (78, 19), (89, 29), (108, 0), (76, 11), (56, 81), (60, 85), (33, 46), (105, 81), (16, 90), (99, 105), (20, 42), (98, 32), (80, 106), (33, 97), (36, 83), (106, 3), (106, 69), (65, 60), (75, 46), (44, 97), (3, 44), (6, 45), (34, 112), (41, 38), (70, 65), (103, 111), (47, 56), (72, 25), (84, 82), (51, 30), (34, 11), (7, 54), (106, 19), (31, 77), (1, 78), (60, 21), (50, 84), (46, 39), (43, 46), (10, 93), (31, 87), (4, 38), (9, 2), (3, 42), (110, 11), (28, 61), (91, 77), (14, 45), (1, 11), (93, 47), (66, 98), (79, 50), (60, 74), (24, 109), (29, 7), (95, 17), (45, 71), (23, 107), (81, 71), (91, 2), (37, 29), (30, 71), (74, 63), (35, 82), (49, 59), (5, 18), (84, 103), (98, 110), (34, 31), (85, 36), (109, 113), (12, 35), (112, 89), (13, 98), (109, 104), (87, 65), (73,

In [44]:

if not isinstance(X_test, np.ndarray):
    X_test = X_test.to_numpy()
if not isinstance(y_test, np.ndarray):
    y_test = y_test.to_numpy()



coverage_clf = np.mean([y_test[i] in pred_sets_clf[i] for i in range(len(y_test))])
efficiency_clf = np.mean([len(pred_sets_clf[i]) for i in range(len(y_test))])

# coverage_rnk = np.mean([y_test[i] in pred_sets_rnk[i] for i in range(len(y_test))])
# efficiency_rnk = np.mean([len(pred_sets_rnk[i]) for i in range(len(y_test))])
y_test_clf = cp.model.predict(X_test)
y_test_rnk = crp.model.predict(X_test)
y_test_rf = clf.predict(X_test)

print(f"Accuracy clf {accuracy_score(y_test_clf, y_test)}")
print(f"Accuracy rf {accuracy_score(y_test_rf, y_test)}")
print(f"Accuracy rnk {accuracy_score(y_test_rnk, y_test)}")

# print(f"Coverage clf {coverage_clf} efficiency clf {efficiency_clf}")
# print(f"Coverage rnk {coverage_rnk} efficiency rnk {efficiency_rnk}")


Accuracy clf 0.6547619047619048
Accuracy rf 0.6547619047619048
Accuracy rnk 0.6976744186046512




In [45]:
y_test

array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1])