In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.utils import shuffle
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.linear_model import LogisticRegression
from ucimlrepo import fetch_ucirepo
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings
# raised while fitting LogisticRegression below — confirm that is intended.
warnings.filterwarnings("ignore")

# Plan:
# Get the datasets (x3)
# Shuffle X and y using sklearn.utils.shuffle
# Encode X (if needed), squeeze y
# Set aside data for "test" to keep the amount of "train" & "validate" data consistent across datasets
# ShuffleSplit(n_splits=3 (these are the 3 trials), test_size=try all 3 ratios) and grid search to find hyperparameters: x3 test sizes, x3 classifiers
# Set the parameters
# Get the test error with the test set
# Evaluate

In [2]:
def fetch_dataset(name):
    """Download a UCI dataset and turn its target into a boolean label.

    Parameters
    ----------
    name : str
        One of "dry_bean_dataset", "adult" or "bank_marketing".

    Returns
    -------
    X : pandas.DataFrame
        The features exactly as returned by ``fetch_ucirepo`` (not encoded).
    y : pandas.Series
        Boolean target: True where the original target equals one of the
        positive labels listed for the dataset below.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    """
    # name -> (UCI repository id, target labels that map to True).
    # "adult" lists both spellings because the label appears with and
    # without a trailing period.
    datasets = {
        "dry_bean_dataset": (602, ["DERMASON"]),
        "adult": (2, ["<=50K", "<=50K."]),
        "bank_marketing": (222, ["yes"]),
    }
    if name not in datasets:
        raise ValueError(
            f"Unknown dataset {name!r}; expected one of {sorted(datasets)}"
        )

    uci_id, positive_labels = datasets[name]
    repo = fetch_ucirepo(id=uci_id)

    # data (as pandas dataframes)
    X = repo.data.features
    # targets is a one-column DataFrame; squeeze() collapses it to a Series.
    y = repo.data.targets.isin(positive_labels).squeeze()
    return X, y

In [3]:
def encoding(X):
    """Return X with its categorical columns one-hot encoded.

    Thin wrapper around ``pandas.get_dummies`` called with default options.
    """
    encoded = pd.get_dummies(X)
    return encoded

In [4]:
def split_data(X, y, n_train=5000):
    """Split X and y positionally into a train/validation part and a test part.

    Parameters
    ----------
    X, y : pandas objects supporting ``.iloc`` slicing
        Features and target with the same number of rows.
    n_train : int, default 5000
        Number of leading rows kept for training/validation; every
        remaining row goes to the test part.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The first ``n_train`` rows of X and y, followed by the rest of X and y.

    Raises
    ------
    ValueError
        If X and y do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    X_sum, X_test = X.iloc[:n_train], X.iloc[n_train:]
    y_sum, y_test = y.iloc[:n_train], y.iloc[n_train:]
    return X_sum, y_sum, X_test, y_test

In [5]:
def data_setup(X, y, random_state=None):
    """Shuffle, re-index, encode and split a dataset.

    Parameters
    ----------
    X, y
        Features and target as returned by ``fetch_dataset``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to get the same
        row order (and therefore the same split) on every call; the default
        None keeps the shuffle unseeded.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The four pieces produced by ``split_data`` on the shuffled data.
    """
    X, y = shuffle(X, y, random_state=random_state)
    # Drop the shuffled index so the positional split and the returned
    # objects use a clean 0..n-1 index.
    X = encoding(X.reset_index(drop=True))
    y = y.reset_index(drop=True)
    return split_data(X, y)

In [6]:
def LR_score(X, y, train_size, test_size):
    """Tune and evaluate logistic regression for one train/validation ratio.

    The data is prepared with ``data_setup`` (called without a seed, so the
    shuffle, and with it every score, varies from call to call). A grid search
    over ``C`` and ``max_iter`` is run on the train/validation part using
    three random ShuffleSplit splits, and the best model is then scored on
    the held-out test part.

    Parameters
    ----------
    X, y
        Raw features and boolean target, e.g. from ``fetch_dataset``.
    train_size, test_size : float
        Forwarded to ``ShuffleSplit``: the fractions of the train/validation
        part used for fitting and for validation in each split.

    Returns
    -------
    train_score : float
        Mean training accuracy of the best parameter combination.
    validation_score : float
        Mean validation accuracy of the best parameter combination.
    test_score : float
        Accuracy of the best model on the held-out test part.
    best_params : dict
        The selected ``C`` and ``max_iter``.
    """
    X_sum, y_sum, X_test, y_test = data_setup(X, y)
    parameters = {'C': [0.01, 0.1, 1, 10], 'max_iter': [100, 200, 300]}

    # Three random train/validation splits: these are the three trials.
    shuffle_split = ShuffleSplit(n_splits=3, train_size=train_size, test_size=test_size)

    # Grid search validated on the ShuffleSplit splits (not k-fold CV).
    grid_search = GridSearchCV(
        LogisticRegression(),
        param_grid=parameters,
        cv=shuffle_split,
        return_train_score=True,
    )
    grid_search.fit(X_sum, y_sum)

    best = grid_search.best_index_
    train_score = grid_search.cv_results_["mean_train_score"][best]
    validation_score = grid_search.cv_results_["mean_test_score"][best]

    # With the default refit=True the search has already retrained the best
    # parameter combination on all of X_sum/y_sum, so score that estimator
    # instead of fitting a second, identical model.
    test_score = grid_search.score(X_test, y_test)

    return train_score, validation_score, test_score, grid_search.best_params_

In [7]:
def make_list(train_score, validation_score, test_score, best_params):
    """Pack one set of results into a list, in table-column order.

    Returns
    -------
    list
        ``[train_score, validation_score, test_score, best_params]``.
    """
    return [train_score, validation_score, test_score, best_params]

In [41]:
# Load the Dry Bean data; y is True for rows whose class is "DERMASON".
X, y = fetch_dataset("dry_bean_dataset")

In [44]:
# Check the class balance of the boolean target.
y.value_counts()

Class
False    10065
True      3546
Name: count, dtype: int64

In [42]:
# Shuffle, encode and split: the first 5000 shuffled rows are kept for
# training/validation, the remaining rows form the test set.
X_sum, y_sum, X_test, y_test = data_setup(X, y)

In [43]:
# Inspect the held-out target rows (positions 5000 onward after the shuffle).
y_test

5000      True
5001     False
5002     False
5003     False
5004     False
         ...  
13606     True
13607    False
13608    False
13609     True
13610    False
Name: Class, Length: 8611, dtype: bool

In [31]:
# Run the logistic-regression experiment once per train/validation ratio
# and collect the scores into a results table.
split_ratios = [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
records = []
for train_frac, val_frac in split_ratios:
    # LR_score returns (train, validation, test, best_params) in column order.
    scores = LR_score(X, y, train_size=train_frac, test_size=val_frac)
    records.append([[train_frac, val_frac], *scores])
df = pd.DataFrame(
    records,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.938333,0.93925,0.928812,"{'C': 1, 'max_iter': 200}"
1,"[0.5, 0.5]",0.9472,0.9432,0.951806,"{'C': 1, 'max_iter': 200}"
2,"[0.8, 0.2]",0.954583,0.947,0.952619,"{'C': 1, 'max_iter': 200}"


In [32]:
# Copy the results table to the system clipboard.
df.to_clipboard()

In [8]:
# One-off run at the 20/80 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.2, test_size=0.8))

({'C': 10, 'max_iter': 200}, 0.9366666666666666, 0.9400833333333334, 0.9522703518755081)


In [9]:
# One-off run at the 50/50 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.5, test_size=0.5))

({'C': 1, 'max_iter': 200}, 0.9433333333333334, 0.9436, 0.9296248983857857)


In [10]:
# One-off run at the 80/20 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.8, test_size=0.2))

({'C': 0.1, 'max_iter': 200}, 0.95775, 0.9553333333333334, 0.9282313320171873)


In [45]:
# Load the Adult data; y is True for incomes labelled "<=50K" or "<=50K.".
X, y = fetch_dataset("adult")

In [46]:
# Check the class balance of the boolean target.
y.value_counts()

income
True     37155
False    11687
Name: count, dtype: int64

In [34]:
# Run the logistic-regression experiment once per train/validation ratio
# and collect the scores into a results table.
split_ratios = [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
records = []
for train_frac, val_frac in split_ratios:
    # LR_score returns (train, validation, test, best_params) in column order.
    scores = LR_score(X, y, train_size=train_frac, test_size=val_frac)
    records.append([[train_frac, val_frac], *scores])
df = pd.DataFrame(
    records,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.813,0.812167,0.797295,"{'C': 10, 'max_iter': 200}"
1,"[0.5, 0.5]",0.808,0.809467,0.795995,"{'C': 1, 'max_iter': 200}"
2,"[0.8, 0.2]",0.81075,0.820333,0.799234,"{'C': 10, 'max_iter': 200}"


In [36]:
# Copy the results table to the system clipboard.
df.to_clipboard()

In [12]:
# One-off run at the 20/80 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.2, test_size=0.8))

({'C': 0.01, 'max_iter': 100}, 0.801, 0.8003333333333335, 0.7956069522375804)


In [13]:
# One-off run at the 50/50 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.5, test_size=0.5))

({'C': 0.01, 'max_iter': 100}, 0.8048000000000001, 0.7990666666666666, 0.7980247251494001)


In [14]:
# One-off run at the 80/20 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.8, test_size=0.2))

({'C': 0.1, 'max_iter': 100}, 0.8077500000000001, 0.8136666666666666, 0.7974316865106519)


In [47]:
# Load the Bank Marketing data; y is True where the target equals "yes".
X, y = fetch_dataset("bank_marketing")

In [48]:
# Check the class balance of the boolean target.
y.value_counts()

y
False    39922
True      5289
Name: count, dtype: int64

In [38]:
# Run the logistic-regression experiment once per train/validation ratio
# and collect the scores into a results table.
split_ratios = [[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]]
records = []
for train_frac, val_frac in split_ratios:
    # LR_score returns (train, validation, test, best_params) in column order.
    scores = LR_score(X, y, train_size=train_frac, test_size=val_frac)
    records.append([[train_frac, val_frac], *scores])
df = pd.DataFrame(
    records,
    columns=[
        "Train/Test Split",
        "Training Accuracy",
        "Validation Accuracy",
        "Testing Accuracy",
        "Hyperparameters",
    ],
)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.878333,0.885583,0.892517,"{'C': 10, 'max_iter': 200}"
1,"[0.5, 0.5]",0.9,0.892933,0.894705,"{'C': 1, 'max_iter': 300}"
2,"[0.8, 0.2]",0.886833,0.886667,0.891771,"{'C': 1, 'max_iter': 300}"


In [39]:
# Copy the results table to the system clipboard.
df.to_clipboard()

In [16]:
# One-off run at the 20/80 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.2, test_size=0.8))

({'C': 10, 'max_iter': 300}, 0.898, 0.8898333333333334, 0.8979632438884882)


In [17]:
# One-off run at the 50/50 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.5, test_size=0.5))

({'C': 0.1, 'max_iter': 300}, 0.9041333333333333, 0.8998666666666667, 0.8908507622292408)


In [18]:
# One-off run at the 80/20 train/validation ratio.
# NOTE(review): the saved output lists the hyperparameters first, which does
# not match LR_score's return order (train, validation, test, best_params);
# re-run this cell to refresh it.
print(LR_score(X, y, train_size=0.8, test_size=0.2))

({'C': 1, 'max_iter': 200}, 0.9009166666666667, 0.8930000000000001, 0.8930392181243938)
