# Packages

In [1]:
### Import Packages ###
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# [CarEvaluation](https://github.com/ubc-systopia/treeFarms/blob/main/experiments/datasets/car_evaluation/data.csv)

In [7]:
### Process ###
# Read the raw CSV and rename the `class_1` column to `Y`.
CarEvaluation = pd.read_csv(
    "/Users/simondn/Documents/RashomonActiveLearning/Data/raw/CarEvaluation.csv"
).rename(columns={'class_1': 'Y'})

### Save ###
# Pickle the renamed frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/CarEvaluation.pkl',
    'wb',
) as file:
    pickle.dump(CarEvaluation, file)

# [Bar7](https://github.com/ubc-systopia/treeFarms/blob/main/experiments/datasets/coupon/bar-7.csv)

In [6]:
### Process ###
# The raw CSV is read and stored unchanged; no columns are renamed in this cell.
Bar7 = pd.read_csv(
    "/Users/simondn/Documents/RashomonActiveLearning/Data/raw/bar-7.csv"
)

### Save ###
# Pickle the frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/Bar7.pkl',
    'wb',
) as file:
    pickle.dump(Bar7, file)


# [Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic)

In [3]:
### Import ###
# The remote file has no header row (header=None), so column names are attached here.
BreastCancer = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
BreastCancer.columns = [
    'Sample', 'ClumpThickness', 'CellSize', 
    'CellShape', 'Adhesion', 
    'SingleEpithelialCellSize', 'BareNuclei', 'BlandChromatin', 
    'NormalNucleoli', 'Mitoses', 'Class'
]

### Process ###
# '?' entries are treated as missing; every column is then coerced to numeric,
# with anything unparseable becoming NaN.
BreastCancer = BreastCancer.replace('?', pd.NA)
BreastCancer = BreastCancer.apply(pd.to_numeric, errors='coerce')

### Threshold ###
# For each listed feature, the specific values that get their own indicator column.
# Features not listed here are dropped by the column selection below.
thresholds = {
    'ClumpThickness': [10],
    'CellSize': [1, 10],
    'CellShape': [1],
    'Adhesion': [1],
    'SingleEpithelialCellSize': [2],
    'BareNuclei': [1, 10],
    'NormalNucleoli': [1, 10]
}

### One-hot Encode ###
# Add a 0/1 column "<feature>_<value>" for every (feature, value) pair.
# A NaN never compares equal, so rows with a missing feature get 0.
one_hot_columns = []
for feature, values in thresholds.items():
    for value in values:
        new_col = f"{feature.replace(' ', '_')}_{value}"
        BreastCancer[new_col] = (BreastCancer[feature] == value).astype(int)
        one_hot_columns.append(new_col)

### Select Columns ###
# Reuse the names collected during encoding instead of rebuilding them.
selected_columns = one_hot_columns + ['Class']
# .copy() makes the subset an independent frame, so the assignment below does not
# write into a slice of the wider frame (SettingWithCopyWarning).
BreastCancer = BreastCancer[selected_columns].copy()
# Recode the class labels 2 -> 0 and 4 -> 1; any other value is left unchanged.
BreastCancer['Class'] = BreastCancer['Class'].replace({2: 0, 4: 1})
BreastCancer = BreastCancer.rename(columns={'Class': 'Y'})

Unnamed: 0,ClumpThickness_10,CellSize_1,CellSize_10,CellShape_1,Adhesion_1,SingleEpithelialCellSize_2,BareNuclei_1,BareNuclei_10,NormalNucleoli_1,NormalNucleoli_10,Y
0,0,1,0,1,1,1,1,0,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0
2,0,1,0,1,1,1,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0
4,0,1,0,1,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
694,0,1,0,1,1,0,0,0,1,0,0
695,0,1,0,1,1,1,1,0,1,0,0
696,0,0,1,0,0,0,0,0,0,1,1
697,0,0,0,0,0,0,0,0,0,0,1


In [33]:
### Save ###
# Pickle the BreastCancer frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/BreastCancer.pkl',
    'wb',
) as file:
    pickle.dump(BreastCancer, file)

# [COMPAS](https://github.com/ubc-systopia/treeFarms/tree/main/experiments/datasets/compas)

In [4]:
### Process ###
# Read the raw CSV and rename the `recidivate-within-two-years:1` column to `Y`.
COMPAS = pd.read_csv(
    "/Users/simondn/Documents/RashomonActiveLearning/Data/raw/COMPAS.csv"
).rename(columns={'recidivate-within-two-years:1': 'Y'})

### Save ###
# Pickle the renamed frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/COMPAS.pkl',
    'wb',
) as file:
    pickle.dump(COMPAS, file)


# [FICO](https://github.com/ubc-systopia/treeFarms/tree/main/experiments/datasets/fico)

In [5]:
### Process ###
# Read the raw CSV and rename the `RiskPerform` column to `Y`.
FICO = pd.read_csv(
    "/Users/simondn/Documents/RashomonActiveLearning/Data/raw/fico-binary.csv"
).rename(columns={'RiskPerform': 'Y'})

### Save ###
# Pickle the renamed frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/FICO.pkl',
    'wb',
) as file:
    pickle.dump(FICO, file)

# [MONK](https://github.com/ubc-systopia/treeFarms/tree/main/experiments/datasets)

In [3]:
### Column order ###
# `Y` first, then the listed a*_* columns. `reindex` keeps only these names;
# a name that is absent from a frame would come back as an all-NaN column.
MONK_COLUMNS = ['Y', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'a3_1', 'a4_1', 'a4_2',
                'a5_1', 'a5_2', 'a5_3', 'a6_1']


def load_monk(path):
    """Read one MONK CSV and apply the processing shared by all three files.

    Parameters
    ----------
    path : str
        Location of the comma-separated raw file; its first row is the header.

    Returns
    -------
    pandas.DataFrame
        The file's contents with `class_1` renamed to `Y`, all columns cast to
        bool, and the columns then reindexed to `MONK_COLUMNS`.
    """
    return (
        pd.read_csv(path, delimiter=",", header=0)
        .rename(columns={'class_1': 'Y'})
        .astype('bool')  # boolean dtype, not pandas "category"
        .reindex(columns=MONK_COLUMNS)
    )


### Read, rename, cast to bool, reorder ###
MONK1 = load_monk("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK1.csv")
MONK2 = load_monk("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK2.csv")
MONK3 = load_monk("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK3.csv")

In [4]:
# Show the MONK3 frame as this cell's output (bare last expression).
MONK3

Unnamed: 0,Y,a1_1,a1_2,a2_1,a2_2,a3_1,a4_1,a4_2,a5_1,a5_2,a5_3,a6_1
0,True,True,False,True,False,True,True,False,True,False,False,False
1,True,True,False,True,False,True,True,False,False,True,False,True
2,True,True,False,True,False,True,True,False,False,True,False,False
3,False,True,False,True,False,True,True,False,False,False,True,True
4,False,True,False,True,False,True,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
117,False,False,False,False,False,False,False,True,False,True,False,False
118,False,False,False,False,False,False,False,True,False,False,True,False
119,False,False,False,False,False,False,False,False,True,False,False,True
120,False,False,False,False,False,False,False,False,False,False,True,False


In [5]:
### Save files ###
# Each MONK frame is pickled to its own file; pairs are (destination, frame).
monk_outputs = (
    ('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK1.pkl', MONK1),
    ('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK2.pkl', MONK2),
    ('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK3.pkl', MONK3),
)

for out_path, frame in monk_outputs:
    with open(out_path, 'wb') as file:
        pickle.dump(frame, file)

# Iris

In [2]:
### Load Iris Data ###
# `load_iris` is imported in the notebook's import cell with the other packages.
iris = load_iris()

### Covariates ###
# The four measurement columns; the same list drives discretization and encoding.
categorical_columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]
X = pd.DataFrame(iris['data'], columns=categorical_columns)

### Response ###
# Single-column frame holding the target as a categorical `Y`.
y = pd.DataFrame({"Y": iris["target"]}).astype({"Y": "category"})

### Discretize data ###
# Quantile-cut every covariate into len(labels) bins, labelled 1, 2, 3.
labels = [1,2,3]
for column in categorical_columns:
    X[column] = pd.qcut(X[column], len(labels), labels=labels)

### One-hot encoding ###
# drop=None keeps one indicator column per bin; sparse_output=False gives a dense array.
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(X[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded, columns=encoded_columns)

# Indicators first, response `Y` last; the index is reset so rows align by position.
Iris_OneHot = pd.concat([encoded_df, y["Y"].reset_index(drop=True)], axis=1)

# ### Binary Classification ###
# Iris_OneHot = Iris_OneHot.loc[(Iris_OneHot["Y"] == 0) | (Iris_OneHot["Y"] == 1)]

In [3]:
### Save ###
# Pickle the one-hot encoded Iris frame to the processed-data location.
with open(
    '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/Iris.pkl',
    'wb',
) as file:
    pickle.dump(Iris_OneHot, file)