In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.utils import shuffle
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.ensemble import HistGradientBoostingClassifier
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings("ignore")

# Plan:
# 1. Fetch the three datasets.
# 2. Shuffle X and y together using sklearn.utils.shuffle.
# 3. One-hot encode X (if needed) and squeeze y down to a single column.
# 4. Set aside data for "test" so the amount of "train" + "validate" data stays consistent across datasets.
# 5. Use ShuffleSplit(n_splits=3 -- these are the 3 trials -- trying all 3 test sizes) with a grid search
#    to find hyperparameters: x3 test sizes, x3 classifiers.
# 6. Set the selected hyperparameters.
# 7. Get the test error on the held-out test set.
# 8. Evaluate.

In [2]:
def fetch_dataset(name):
    """Download a UCI dataset by name and turn its target into a boolean label.

    Parameters
    ----------
    name : str
        One of ``"dry_bean_dataset"``, ``"adult"`` or ``"bank_marketing"``.

    Returns
    -------
    X
        ``data.features`` of the object returned by ``fetch_ucirepo``.
    y
        ``data.targets`` compared against the positive label(s) for the
        dataset and then squeezed (presumably a boolean Series when the
        targets frame has a single column -- confirm against ucimlrepo).

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    # name -> (UCI repository id, target values mapped to True)
    sources = {
        "dry_bean_dataset": (602, ["DERMASON"]),
        # The adult targets contain both "<=50K" and "<=50K." spellings.
        "adult": (2, ["<=50K", "<=50K."]),
        "bank_marketing": (222, ["yes"]),
    }
    # Validate before any download so a typo fails fast with a clear message
    # instead of an UnboundLocalError at the return statement.
    if name not in sources:
        raise ValueError(
            f"Unknown dataset {name!r}; expected one of {sorted(sources)}"
        )

    repo_id, positive_labels = sources[name]
    dataset = fetch_ucirepo(id=repo_id)

    X = dataset.data.features
    y = dataset.data.targets.isin(positive_labels).squeeze()
    return X, y

In [3]:
def encoding(X):
    """Return ``pd.get_dummies(X)`` using pandas' default arguments."""
    encoded = pd.get_dummies(X)
    return encoded

In [4]:
def split_data(X, y, n_train=5000):
    """Split ``X`` and ``y`` positionally into a leading part and the remainder.

    Parameters
    ----------
    X, y
        Objects supporting ``len`` and ``.iloc`` slicing (pandas DataFrame /
        Series). No shuffling is done here.
    n_train : int, default 5000
        Number of leading rows placed in the first ("train + validate") part.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The first ``n_train`` rows of ``X`` and ``y``, followed by the
        remaining rows of each. The test parts are empty when the inputs have
        at most ``n_train`` rows.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    X_sum, X_test = X.iloc[:n_train], X.iloc[n_train:]
    y_sum, y_test = y.iloc[:n_train], y.iloc[n_train:]
    return X_sum, y_sum, X_test, y_test

In [5]:
def data_setup(X, y, random_state=None):
    """Shuffle, re-index, encode and split a dataset.

    Parameters
    ----------
    X, y
        Features and target; both must support ``reset_index``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible shuffle.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The four pieces returned by ``split_data`` after ``X`` has been passed
        through ``encoding``.
    """
    X, y = shuffle(X, y, random_state=random_state)
    # Drop the shuffled index labels so both objects are numbered 0..n-1 again.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    X = encoding(X)
    return split_data(X, y)

In [24]:
def BST_score(X, y, test_size, train_size, random_state=None):
    """Grid-search a HistGradientBoostingClassifier and score it on held-out data.

    The data is prepared with ``data_setup``. Hyperparameters are selected by
    ``GridSearchCV`` over three ``ShuffleSplit`` iterations of the
    train + validate portion, and the selected model is then scored on the
    held-out test portion.

    Parameters
    ----------
    X, y
        Raw features and target, passed to ``data_setup``.
    test_size, train_size
        Forwarded to ``ShuffleSplit``; they control the validation / training
        proportions inside each of the three splits.
    random_state : int, RandomState instance or None, default None
        Seed for ``ShuffleSplit`` and the classifier. ``None`` keeps the
        original unseeded behaviour. The shuffle performed inside
        ``data_setup`` is not controlled by this argument.

    Returns
    -------
    train_score : float
        Mean training score of the best parameter combination.
    validation_score : float
        Mean validation ("test" in ``cv_results_``) score of that combination.
    test_score : float
        Score of the refitted best estimator on the held-out test portion.
    best_params : dict
        The selected ``learning_rate`` and ``max_iter``.
    """
    X_sum, y_sum, X_test, y_test = data_setup(X, y)
    parameters = {"learning_rate": [0.1, 0.2, 0.3], "max_iter": [10, 20, 50]}

    # Three random train/validation partitions of X_sum; each partition is
    # comparable to one train_test_split call but is usable as a GridSearchCV
    # `cv` argument.
    shuffle_split = ShuffleSplit(
        n_splits=3,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )

    clf = HistGradientBoostingClassifier(random_state=random_state)
    grid_search = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=shuffle_split,
        return_train_score=True,
    )
    grid_search.fit(X_sum, y_sum)

    best_index = grid_search.best_index_
    cv_results = grid_search.cv_results_
    train_score = cv_results["mean_train_score"][best_index]
    validation_score = cv_results["mean_test_score"][best_index]

    # GridSearchCV (refit=True by default) has already refitted the best
    # parameter combination on all of X_sum, so reuse that estimator instead
    # of training an identical model a second time.
    test_score = grid_search.best_estimator_.score(X_test, y_test)

    return train_score, validation_score, test_score, grid_search.best_params_

In [25]:
# Load the Dry Bean features and the boolean target built by fetch_dataset.
X, y = fetch_dataset(name="dry_bean_dataset")

In [26]:
# One BST_score run per train/validation proportion, gathered into a summary table.
col = [
    [[train, test], *BST_score(X, y, train_size=train, test_size=test)]
    for train, test in [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
]
df = pd.DataFrame(
    col,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.977,0.952417,0.955754,"{'learning_rate': 0.1, 'max_iter': 20}"
1,"[0.5, 0.5]",0.991333,0.9528,0.957845,"{'learning_rate': 0.2, 'max_iter': 20}"
2,"[0.8, 0.2]",0.98175,0.959,0.954361,"{'learning_rate': 0.2, 'max_iter': 10}"


In [9]:
# Copy the summary table to the system clipboard (excel=True is the pandas default).
df.to_clipboard(excel=True)

In [8]:
# Single run at train_size=0.2 / test_size=0.8; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.8, train_size=0.2))

({'learning_rate': 0.3, 'max_iter': 100}, 1.0, 0.9498333333333333, 0.9526187434676576)


In [9]:
# Single run at train_size=0.5 / test_size=0.5; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.5, train_size=0.5))

({'learning_rate': 0.2, 'max_iter': 200}, 1.0, 0.9549333333333334, 0.9549413540819881)


In [10]:
# Single run at train_size=0.8 / test_size=0.2; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.2, train_size=0.8))

({'learning_rate': 0.1, 'max_iter': 100}, 1.0, 0.9533333333333333, 0.9526187434676576)


In [27]:
# Load the Adult features and the boolean target built by fetch_dataset.
X, y = fetch_dataset(name="adult")

In [28]:
# One BST_score run per train/validation proportion, gathered into a summary table.
col = [
    [[train, test], *BST_score(X, y, train_size=train, test_size=test)]
    for train, test in [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
]
df = pd.DataFrame(
    col,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.91,0.848833,0.858515,"{'learning_rate': 0.1, 'max_iter': 20}"
1,"[0.5, 0.5]",0.8936,0.855067,0.857625,"{'learning_rate': 0.1, 'max_iter': 20}"
2,"[0.8, 0.2]",0.911167,0.862667,0.863145,"{'learning_rate': 0.2, 'max_iter': 20}"


In [12]:
# Copy the summary table to the system clipboard (excel=True is the pandas default).
df.to_clipboard(excel=True)

In [12]:
# Single run at train_size=0.2 / test_size=0.8; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.8, train_size=0.2))

({'learning_rate': 0.1, 'max_iter': 100}, 1.0, 0.8328333333333333, 0.8609552483919529)


In [13]:
# Single run at train_size=0.5 / test_size=0.5; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.5, train_size=0.5))

({'learning_rate': 0.1, 'max_iter': 100}, 0.9738666666666668, 0.8473333333333333, 0.860316591396378)


In [14]:
# Single run at train_size=0.8 / test_size=0.2; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.2, train_size=0.8))

({'learning_rate': 0.1, 'max_iter': 100}, 0.9454166666666666, 0.855, 0.8593586059030154)


In [13]:
# Load the Bank Marketing features and the boolean target built by fetch_dataset.
X, y = fetch_dataset(name="bank_marketing")

In [14]:
# One BST_score run per train/validation proportion, gathered into a summary table.
col = [
    [[train, test], *BST_score(X, y, train_size=train, test_size=test)]
    for train, test in [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
]
df = pd.DataFrame(
    col,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",1.0,0.90175,0.90147,"{'learning_rate': 0.1, 'max_iter': 100}"
1,"[0.5, 0.5]",0.999867,0.895467,0.9005,"{'learning_rate': 0.1, 'max_iter': 100}"
2,"[0.8, 0.2]",0.999833,0.899667,0.898834,"{'learning_rate': 0.2, 'max_iter': 100}"


In [15]:
# Copy the summary table to the system clipboard (excel=True is the pandas default).
df.to_clipboard(excel=True)

In [16]:
# Single run at train_size=0.2 / test_size=0.8; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.8, train_size=0.2))

({'learning_rate': 0.1, 'max_iter': 100}, 1.0, 0.8911666666666668, 0.9018179105219964)


In [17]:
# Single run at train_size=0.5 / test_size=0.5; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.5, train_size=0.5))

({'learning_rate': 0.2, 'max_iter': 300}, 1.0, 0.8852000000000001, 0.8979881127054786)


In [18]:
# Single run at train_size=0.8 / test_size=0.2; prints the tuple returned by BST_score.
# NOTE(review): the saved output lists the hyperparameter dict first, which does not
# match BST_score's return order -- looks stale, re-run to refresh.
print(BST_score(X, y, test_size=0.2, train_size=0.8))

({'learning_rate': 0.1, 'max_iter': 200}, 1.0, 0.8940000000000001, 0.8985600954962573)
