In [31]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
import torch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
import openml
from simple_model import DyadOneHotPairDataset, DyadRankingModel, create_dyads, ConformalPredictor, ConformalRankingPredictor, MCDyadOneHotPairDataset

In [32]:
# Load OpenML dataset 35 as a feature frame `X` and a target `y`.
dataset = openml.datasets.get_dataset(35)
X, y, _, _ = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)

# Automatically identify categorical and numerical columns.
# "number" matches every numeric dtype; listing only int64/float64 would leave
# columns such as uint8/int32/float32 in neither list, and the ColumnTransformer
# below silently drops any column that is not assigned to a transformer.
categorical_features = X.select_dtypes(
    include=["object", "category"]
).columns.tolist()
numerical_features = X.select_dtypes(include="number").columns.tolist()

num_classes = len(np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Encode labels.
# The encoder is fit on the full target so that the integer ids always span
# `num_classes` and transforming the test labels cannot fail when a rare class
# happens to be missing from the training split.
le = LabelEncoder()
le.fit(y)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Preprocessing for numerical data: impute missing values, then scale.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Preprocessing for categorical data: impute missing values, then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine preprocessing steps.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit the preprocessing on the training split only, then apply it to both splits.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# The transformer can hand back a sparse matrix (one-hot output); convert both
# splits to dense arrays here so every later cell sees plain ndarrays.
# The encoded labels are already ndarrays, so they need no conversion.
if not isinstance(X_train, np.ndarray):
    X_train = X_train.toarray()
if not isinstance(X_test, np.ndarray):
    X_test = X_test.toarray()



  return func(*args, **kwargs)


In [33]:
from simple_model import ClassifierModel, DyadRankingModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from math import log2, ceil

# Make sure the held-out features are dense *before* the long training runs,
# so an interrupted fit does not leave X_test unconverted for the evaluation cell.
# (y_test comes out of LabelEncoder as an ndarray and needs no conversion.)
if not isinstance(X_test, np.ndarray):
    X_test = X_test.toarray()

# Base models: a seeded random forest (for reproducible results) and the neural classifier.
rf = RandomForestClassifier(random_state=1)
clf = ClassifierModel(input_dim=X_train.shape[1], hidden_dim=16, output_dim=num_classes)

# Conformal wrappers around the two classifiers (alpha = 0.05).
cp_net = ConformalPredictor(clf, alpha=0.05)
cp_rf = ConformalPredictor(rf, alpha=0.05)

# Pair-generation settings for the ranking predictors: one without cross-instance
# pairs, then three with an increasing pair budget.
# NOTE(review): the keyword `use_cross_isntance_data` is spelled as the original
# calls spell it; presumably it matches the signature in simple_model — do not
# correct it here without changing the library as well.
n_train = len(X_train)
pair_settings = [
    {"use_cross_isntance_data": False},
    {"use_cross_isntance_data": True, "num_pairs": n_train},
    {"use_cross_isntance_data": True, "num_pairs": n_train * (num_classes - 1)},
    {
        "use_cross_isntance_data": True,
        "num_pairs": n_train * (num_classes - 1) * ceil(log2(n_train)),
    },
]

# One ranking predictor per setting; `num_classes` is used everywhere so the
# rankers and the classifier agree on the label space.
crps = [
    ConformalRankingPredictor(num_classes=num_classes, alpha=0.05)
    for _ in pair_settings
]

cp_net.fit(X_train, y_train, num_epochs=100, random_state=1, patience=8)
cp_rf.fit(X_train, y_train)

# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# expect the larger pair budgets to be slow.
for crp, settings in zip(crps, pair_settings):
    crp.fit(X_train, y_train, random_state=1, num_epochs=100, patience=8, **settings)

Stopping training.
Stopping training.


KeyboardInterrupt: 

In [30]:

# Conformal prediction sets on the held-out features for both classifier wrappers.
pred_sets_clf, pred_sets_rf = cp_net.predict_set(X_test), cp_rf.predict_set(X_test)

def evaluate_method(method, X=None, y=None):
    """Print accuracy, coverage and efficiency of a conformal predictor.

    Parameters
    ----------
    method : object
        Must provide ``predict_set(X)`` (one prediction set per sample,
        indexable by sample position) and ``model.predict(X)`` (point predictions).
    X : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True labels aligned with ``X``. Defaults to the notebook-level ``y_test``.

    Prints
    ------
    Accuracy of the underlying model's point predictions, the fraction of
    samples whose true label lies in its prediction set (coverage), and the
    mean prediction-set size (efficiency).
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    X = X_test if X is None else X
    y = y_test if y is None else y

    pred_sets = method.predict_set(X)
    y_pred = method.model.predict(X)

    n_samples = len(y)
    coverage = np.mean([y[i] in pred_sets[i] for i in range(n_samples)])
    efficiency = np.mean([len(pred_sets[i]) for i in range(n_samples)])

    # accuracy_score expects (y_true, y_pred); the value is the same either way
    # for plain accuracy, but the conventional order avoids surprises if the
    # metric is ever swapped for an asymmetric one.
    print(f"Accuracy {accuracy_score(y, y_pred)}")
    print(f"Coverage {coverage} efficiency {efficiency}")

# Report metrics for each ranking-based predictor first, then for the
# network-based conformal classifier.
for predictor in [*crps, cp_net]:
    evaluate_method(predictor)


Accuracy 0.972972972972973
Coverage 0.9054054054054054 efficiency 1.7837837837837838
Accuracy 0.972972972972973
Coverage 0.9054054054054054 efficiency 1.7837837837837838
Accuracy 0.972972972972973
Coverage 0.9054054054054054 efficiency 1.7837837837837838
Accuracy 0.972972972972973
Coverage 0.9054054054054054 efficiency 1.7837837837837838
Accuracy 1.0
Coverage 0.9594594594594594 efficiency 0.9594594594594594


In [18]:
# Display the label-encoded training targets (integer class ids produced by the LabelEncoder).
y_train

array([4, 2, 1, 2, 2, 2, 4, 3, 5, 2, 0, 4, 1, 3, 1, 1, 2, 0, 1, 0, 0, 0,
       2, 0, 3, 5, 3, 3, 4, 1, 1, 0, 2, 0, 0, 0, 0, 4, 3, 1, 4, 1, 2, 2,
       3, 4, 2, 3, 4, 4, 3, 3, 1, 0, 2, 1, 0, 0, 2, 1, 2, 3, 5, 0, 2, 4,
       0, 1, 3, 3, 1, 0, 1, 3, 0, 0, 0, 5, 4, 4, 3, 4, 0, 3, 0, 2, 0, 4,
       5, 4, 3, 2, 5, 4, 2, 0, 1, 1, 0, 0, 0, 0, 1, 3, 3, 0, 1, 0, 0, 2,
       4, 3, 0, 0, 2, 5, 0, 4, 2, 3, 5, 2, 1, 1, 1, 1, 4, 1, 0, 0, 1, 0,
       1, 4, 0, 1, 5, 0, 4, 0, 1, 2, 1, 0, 2, 0, 2, 0, 2, 1, 0, 2, 2, 1,
       4, 0, 4, 2, 3, 1, 0, 0, 3, 0, 2, 2, 1, 0, 4, 0, 1, 3, 0, 1, 2, 1,
       0, 5, 0, 4, 0, 1, 3, 0, 0, 5, 2, 0, 1, 3, 4, 0, 0, 1, 0, 5, 2, 4,
       2, 0, 5, 0, 1, 4, 0, 4, 3, 1, 3, 2, 0, 4, 2, 3, 0, 0, 4, 2, 3, 0,
       4, 2, 5, 2, 0, 5, 2, 3, 0, 1, 2, 3, 5, 1, 4, 2, 1, 0, 0, 4, 0, 2,
       1, 3, 4, 0, 3, 5, 1, 1, 3, 0, 4, 2, 0, 0, 4, 4, 5, 2, 4, 0, 2, 0,
       2, 4, 0, 0, 2, 0, 0, 0, 2, 4, 2, 1, 2, 2, 0, 2, 3, 0, 5, 3, 2, 0,
       4, 4, 2, 2, 0, 1])

In [7]:
# NOTE(review): the output recorded for this cell is a 4-element array, which does
# not match the OpenML target loaded earlier in the notebook — it appears to have
# been run against an older `y` (execution count 7). Re-run after a kernel restart
# to confirm what this displays.
y

array([0, 1, 1, 2])

In [8]:
# Both dataset classes are already imported in the notebook's first cell; this re-import is redundant.
from simple_model import MCDyadOneHotPairDataset, DyadOneHotPairDataset
# NOTE(review): num_classes=3 is hard-coded and presumably matched the small toy
# X/y this cell was first run with (execution count 8); the encoded labels shown
# earlier span 6 classes — confirm which X/y is intended before re-running.
ds = DyadOneHotPairDataset(X, y, num_classes=3)
