In [2]:
### Import Packages ###
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

<!-- # [Auto MPG](https://archive.ics.uci.edu/dataset/9/auto+mpg) -->

<!-- Response: mpg (continuous), stored as column Y

Covariates: displacement, cylinders, horsepower, weight, acceleration, model_year, origin -->

In [None]:
### Set up ###
# NOTE: this list is not used as-is; the discretization cell reassigns
# `labels` to [1, 2, 3] before any binning takes place.
labels = ["low", "medium", "high"]

### Import data ###
# Whitespace-delimited raw file without a header row. `sep=r"\s+"` replaces the
# deprecated `delim_whitespace=True` and matches how the Boston Housing file is read.
Auto = pd.read_csv(
    "/Users/simondn/Documents/RashomonActiveLearning/Data/raw/Auto.data",
    sep=r"\s+",
    header=None,
)
Auto.columns = ["Y", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "name"]

# Drop the name column, then rows whose horsepower is the "?" placeholder,
# then any rows with missing values. Reassigning (instead of inplace=True)
# keeps the cleaning steps explicit.
Auto = Auto.drop(columns="name")
Auto = Auto[Auto["horsepower"] != "?"]
Auto = Auto.dropna()

In [None]:
### Discretize data ###
# Integer bin labels; their count sets the number of quantile bins.
labels = [1,2,3]

# horsepower can arrive as strings (rows holding "?" are filtered out at import),
# so coerce it to numeric before quantile binning.
Auto["horsepower"] = pd.to_numeric(Auto["horsepower"])

# Replace each numeric covariate with its quantile-bin label.
for column in ["displacement", "cylinders", "horsepower", "weight", "acceleration", "model_year"]:
    Auto[column] = pd.qcut(Auto[column], len(labels), labels=labels)

# origin is kept as a categorical of its raw values rather than binned.
Auto["origin"] = pd.Categorical(Auto["origin"])

### One-hot encoding ###
categorical_columns = ["displacement", "cylinders", "horsepower", "weight", "acceleration", "model_year", "origin"]
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(Auto[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded, columns=encoded_columns)

# pd.concat expects all objects in a single list as its first argument.
# Passing the frame and the list separately made the list bind to `axis`
# and raised a TypeError. Y's index is reset so it lines up with the
# freshly created 0..n-1 index of encoded_df.
Auto_OneHot = pd.concat([encoded_df, Auto["Y"].reset_index(drop=True)], axis=1)


In [None]:
# Serialize the binned Auto frame to the processed-data folder.
# NOTE(review): this stores `Auto` (binned, not one-hot encoded), whereas the
# Boston Housing cell pickles its `_OneHot` frame; confirm `Auto_OneHot` was
# not the intended object.
auto_pickle_path = '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/AutoBinned.pkl'
with open(auto_pickle_path, 'wb') as file:
    pickle.dump(Auto, file)

<!-- # [Boston Housing](/Users/simondn/Documents/RashomonActiveLearning/Data/raw/BostonHousing.data) -->

In [3]:
### Import ###
# Whitespace-delimited file with no header row. The separator regex is a raw
# string so that "\s" is not parsed as an (invalid) string escape sequence.
BostonHousing = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/BostonHousing.data", header=None, sep=r"\s+")
BostonHousing.columns = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
BostonHousing = BostonHousing.rename(columns={"MEDV": "Y"})

### Discretize data ###
# Integer bin labels; their count sets the number of quantile bins.
labels = [1,2,3]
BostonHousing["INDUS"] = pd.to_numeric(BostonHousing["INDUS"])
tercile_columns = ["CRIM", "INDUS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
for column in tercile_columns:
    BostonHousing[column] = pd.qcut(BostonHousing[column], len(labels), labels=labels)

# ZN and CHAS are kept as categoricals of their raw values rather than binned.
BostonHousing["ZN"] = pd.Categorical(BostonHousing["ZN"])
BostonHousing["CHAS"] = pd.Categorical(BostonHousing["CHAS"])

# Binary response: True when Y lands in the upper bin of a two-quantile split.
# The earlier two-step version binarised Y into labels {1, 2} and then compared
# those labels against their own 0.75 quantile. That quantile is 2 whenever more
# than a quarter of the rows fall in the upper bin, so the effective rule was
# already "label == 2"; it is written out explicitly here with the same result.
# NOTE(review): if a top-quartile target was intended, threshold the raw values
# at their 0.75 quantile before binning instead.
BostonHousing["Y"] = pd.qcut(BostonHousing["Y"], 2, labels=[1,2]) == 2

### Filter out ###
# KeepColumns = ["CRIM", "NOX", "RM", "PTRATIO", "LSTAT", "Y"] # Top 5
KeepColumns = ["CRIM", "NOX", "RM", "PTRATIO", "LSTAT", "TAX", "DIS", "Y"] # Top 7
# KeepColumns = ["CRIM", "NOX", "RM", "PTRATIO", "LSTAT", "TAX", "DIS", "AGE", "RAD", "Y"] # Top 9
BostonHousing = BostonHousing[KeepColumns]

### One-hot encoding ###
# Encode every kept column except the response.
categorical_columns = KeepColumns.copy()
categorical_columns.remove("Y")
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded = encoder.fit_transform(BostonHousing[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded, columns=encoded_columns)
# Y's index is reset so it aligns with the new 0..n-1 index of encoded_df.
BostonHousing_OneHot = pd.concat([encoded_df, BostonHousing["Y"].reset_index(drop=True)], axis=1)

In [7]:
# Preview the first rows of the one-hot encoded Boston Housing frame
# (indicator columns followed by the boolean response Y).
BostonHousing_OneHot.head()

Unnamed: 0,CRIM_1,CRIM_2,CRIM_3,NOX_1,NOX_2,NOX_3,RM_1,RM_2,RM_3,PTRATIO_1,...,LSTAT_1,LSTAT_2,LSTAT_3,TAX_1,TAX_2,TAX_3,DIS_1,DIS_2,DIS_3,Y
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,True
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,True


In [None]:
# from treeFarms.treefarms.model.treefarms import TREEFARMS
# TreeFarmsModel = TREEFARMS({"regularization": 0.01, "rashomon_bound_multiplier": 0.05})
# TreeFarmsModel.fit(BostonHousing_OneHot.loc[:, BostonHousing_OneHot.columns != "Y"], BostonHousing_OneHot["Y"])

In [4]:
# Serialize the one-hot encoded Boston Housing frame to the processed-data folder.
boston_pickle_path = '/Users/simondn/Documents/RashomonActiveLearning/Data/processed/BostonHousingBinned7.pkl'
with open(boston_pickle_path, 'wb') as file:
    pickle.dump(BostonHousing_OneHot, file)


<!-- # [COMPAS](https://github.com/ubc-systopia/treeFarms/tree/main/experiments/datasets/compas) -->

In [None]:
# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/BostonHousingBinned.pkl', 'rb') as file:
#     test1 = pickle.load(file)

In [None]:
# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK1.pkl', 'rb') as file:
#     test1 = pickle.load(file).dropna()

In [None]:
# COMPAS = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/treeFarms/experiments/datasets/compas/binned.csv")
# COMPAS.rename(columns={'recidivate-within-two-years:1': 'Y'}, inplace=True)

# ### Save ###
# # with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/COMPAS.pkl', 'wb') as file:
#     # pickle.dump(COMPAS, file)


sex:Female                  int64
age:<21                     int64
age:<23                     int64
age:<26                     int64
age:<46                     int64
juvenile-felonies:=0        int64
juvenile-misdemeanors:=0    int64
juvenile-crimes:=0          int64
priors:=0                   int64
priors:=1                   int64
priors:2-3                  int64
priors:>3                   int64
Y                           int64
dtype: object

<!-- # [MONK](https://github.com/ubc-systopia/treeFarms/tree/main/experiments/datasets)  -->

In [None]:
# import pickle
# import pandas as pd

In [None]:
# ### Read in columns ###
# MONK1 = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK1.csv",delimiter=",", header = 0)
# MONK2 = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK2.csv",delimiter=",", header = 0)
# MONK3 = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/MONK3.csv",delimiter=",", header = 0)

# ### Rename columns ###
# MONK1.rename(columns={'class_1': 'Y'}, inplace=True)
# MONK2.rename(columns={'class_1': 'Y'}, inplace=True)
# MONK3.rename(columns={'class_1': 'Y'}, inplace=True)

# # ### Change to categorical ###
# # MONK1 = MONK1.astype('bool')
# # MONK2 = MONK2.astype('bool')
# # MONK3 = MONK3.astype('bool')

# ### Move columns ###
# MONK1 = MONK1.reindex(columns=['Y', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'a3_1', 'a4_1', 'a4_2', 'a5_1', 'a5_2', 'a5_3', 'a6_1',])
# MONK2 = MONK2.reindex(columns=['Y', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'a3_1', 'a4_1', 'a4_2', 'a5_1', 'a5_2', 'a5_3', 'a6_1',])
# MONK3 = MONK3.reindex(columns=['Y', 'a1_1', 'a1_2', 'a2_1', 'a2_2', 'a3_1', 'a4_1', 'a4_2', 'a5_1', 'a5_2', 'a5_3', 'a6_1',])

In [None]:
# ### Save files ###
# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK1.pkl', 'wb') as file:
#         pickle.dump(MONK1, file)

# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK2.pkl', 'wb') as file:
#         pickle.dump(MONK2, file)

# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK3.pkl', 'wb') as file:
#         pickle.dump(MONK3, file)

In [None]:
# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK1.pkl', 'rb') as file:
#     test1 = pickle.load(file).dropna()

# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK2.pkl', 'rb') as file:
#     test2 = pickle.load(file).dropna()

# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/MONK3.pkl', 'rb') as file:
#     test3 = pickle.load(file).dropna()

<!-- # Iris -->

In [None]:
# ### Load Iris Data ###
# from sklearn.datasets import load_iris
# iris = load_iris()

# ### Covariates ###
# X = pd.DataFrame(iris['data'])
# X.columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]

# ### Response ###
# y = pd.DataFrame(iris["target"])
# y.columns = ["Y"]
# y['Y'] = y['Y'].astype("category")

# ### Discretize data ###
# labels = [1,2,3]
# X["SepalLength"] = pd.qcut(X["SepalLength"], len(labels), labels=labels)
# X["SepalWidth"] = pd.qcut(X["SepalWidth"], len(labels), labels=labels)
# X["PetalLength"] = pd.qcut(X["PetalLength"], len(labels), labels=labels)
# X["PetalWidth"] = pd.qcut(X["PetalWidth"], len(labels), labels=labels)

# ### One-hot encoding ###
# categorical_columns = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]
# encoder = OneHotEncoder(sparse_output=False, drop=None) 
# encoded = encoder.fit_transform(X[categorical_columns])
# encoded_columns = encoder.get_feature_names_out(categorical_columns)
# encoded_df = pd.DataFrame(encoded, columns=encoded_columns)
# Iris_OneHot = pd.concat([encoded_df, y["Y"].reset_index(drop=True)], axis=1)

In [None]:
# Serialize the one-hot encoded Iris frame to the processed-data folder.
# The frame is resolved BEFORE the file is opened: opening with 'wb' truncates
# any existing Iris.pkl, so failing afterwards (e.g. because the Iris
# preprocessing cell is commented out and Iris_OneHot was never built) would
# leave an empty, unreadable pickle behind.
try:
    iris_frame = Iris_OneHot
except NameError:
    print("Iris_OneHot is not defined (run the Iris preprocessing cell first); skipping Iris.pkl export.")
else:
    with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/Iris.pkl', 'wb') as file:
        pickle.dump(iris_frame, file)

<!-- # [Heart Disease](https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset?resource=download) -->

In [None]:
# HeartDisease = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/heart.csv")
# HeartDisease.rename(columns={'target': 'Y'}, inplace=True)

In [None]:
# ### Discretize data ###
# labels = [1,2,3]
# HeartDisease["age"] = pd.qcut(HeartDisease["age"], len(labels), labels=labels)
# HeartDisease["sex"] = pd.Categorical(HeartDisease["sex"])
# HeartDisease["cp"] = pd.Categorical(HeartDisease["cp"])
# HeartDisease["trestbps"] = pd.qcut(HeartDisease["trestbps"], len(labels), labels=labels)
# HeartDisease["chol"] = pd.qcut(HeartDisease["chol"], len(labels), labels=labels)
# HeartDisease["fbs"] = pd.Categorical(HeartDisease["fbs"])
# HeartDisease["restecg"] = pd.Categorical(HeartDisease["restecg"])
# HeartDisease["thalach"] = pd.qcut(HeartDisease["thalach"], len(labels), labels=labels)
# HeartDisease["exang"] = pd.Categorical(HeartDisease["exang"])
# HeartDisease["oldpeak"] = pd.qcut(HeartDisease["oldpeak"], len(labels), labels=labels)
# HeartDisease["slope"] = pd.Categorical(HeartDisease["slope"])
# HeartDisease["ca"] = pd.Categorical(HeartDisease["ca"])
# HeartDisease["thal"] = pd.Categorical(HeartDisease["thal"])
# HeartDisease["Y"] = pd.Categorical(HeartDisease["Y"])


In [None]:
# ### One-hot encoding ###
# categorical_columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]
# encoder = OneHotEncoder(sparse_output=False, drop=None) 
# encoded = encoder.fit_transform(HeartDisease[categorical_columns])
# encoded_columns = encoder.get_feature_names_out(categorical_columns)
# encoded_df = pd.DataFrame(encoded, columns=encoded_columns)
# HeartDisease_OneHot = pd.concat([encoded_df, HeartDisease["Y"].reset_index(drop=True)], axis=1)

In [None]:
# Fit a TREEFARMS model on the one-hot encoded Heart Disease data, using every
# column except "Y" as features and "Y" as the target.
# NOTE(review): in the visible notebook HeartDisease_OneHot is only built in
# commented-out cells, so on a fresh kernel this cell raises NameError unless
# those cells are re-enabled first.
from treeFarms.treefarms.model.treefarms import TREEFARMS
TreeFarmsModel = TREEFARMS({"regularization": 0.01, "rashomon_bound_multiplier": 0.05})
TreeFarmsModel.fit(HeartDisease_OneHot.loc[:, HeartDisease_OneHot.columns != "Y"], HeartDisease_OneHot["Y"])

: 

: 

<!-- # [Pima Indians Diabetes](https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database) -->

In [None]:
# ### Load data ###
# PimaIndians = pd.read_csv("/Users/simondn/Documents/RashomonActiveLearning/Data/raw/PimaIndiansDiabetes.csv")
# PimaIndians.rename(columns={'Outcome': 'Y'}, inplace=True)

# ### Discretize data ###
# labels = [1,2,3]
# PimaIndians["Pregnancies"] = pd.qcut(PimaIndians["Pregnancies"], len(labels), labels=labels)
# PimaIndians["Glucose"] = pd.qcut(PimaIndians["Glucose"], len(labels), labels=labels)
# PimaIndians["BloodPressure"] = pd.qcut(PimaIndians["BloodPressure"], len(labels), labels=labels)
# PimaIndians["SkinThickness"] = pd.qcut(PimaIndians["SkinThickness"], len(labels), labels=labels)
# PimaIndians["Insulin"] = (PimaIndians["Insulin"] == 0)
# PimaIndians["Insulin"] = PimaIndians["Insulin"].astype(int)
# PimaIndians["Insulin"] = pd.Categorical(PimaIndians["Insulin"])
# PimaIndians["DiabetesPedigreeFunction"] = pd.qcut(PimaIndians["DiabetesPedigreeFunction"], len(labels), labels=labels)
# PimaIndians["Age"] = pd.qcut(PimaIndians["Age"], len(labels), labels=labels)
# PimaIndians["Y"] = pd.Categorical(PimaIndians["Y"])

In [None]:
# ### One-hot encoding ###
# categorical_columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "DiabetesPedigreeFunction", "Age",]
# encoder = OneHotEncoder(sparse_output=False, drop=None) 
# encoded = encoder.fit_transform(PimaIndians[categorical_columns])
# encoded_columns = encoder.get_feature_names_out(categorical_columns)
# encoded_df = pd.DataFrame(encoded, columns=encoded_columns)
# PimaIndians_OneHot = pd.concat([encoded_df, PimaIndians["Y"].reset_index(drop=True)], axis=1)

In [None]:
# with open('/Users/simondn/Documents/RashomonActiveLearning/Data/processed/PimaIndians.pkl', 'wb') as file:
#         pickle.dump(PimaIndians_OneHot, file)

In [None]:
# from treeFarms.treefarms.model.treefarms import TREEFARMS
# TreeFarmsModel = TREEFARMS({"regularization": 0.01, "rashomon_bound_multiplier": 0.05})
# TreeFarmsModel.fit(PimaIndians_OneHot.loc[:, PimaIndians_OneHot.columns != "Y"], PimaIndians_OneHot["Y"])

null
Finding Optimal Objective...
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 14
{
  "false": {
    "complexity": 0.009999999776482582,
    "loss": 0.1419270932674408,
    "name": "Y",
    "prediction": 0
  },
  "feature": 5,
  "model_objective": 0.28171876072883606,
  "name": "Glucose_3",
  "reference": 1.0,
  "relation": "==",
  "true": {
    "complexity": 0.009999999776482582,
    "loss": 0.1197916716337204,
    "name": "Y",
    "prediction": 1
  },
  "type": "rational"
}


<treeFarms.treefarms.model.treefarms.TREEFARMS at 0x11c2fba60>

In [None]:
# 