In [1]:
from coreset.decision_tree import dt_coreset
from coreset.utils.formats import SparseData
from data.datasets import get_circles, get_air_quality,get_moons, quantize_data,get_gesture_phase
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.utils import resample

In [2]:
def scale_data(X, Y):
    # with_mean= False could help
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    Y = scaler.fit_transform(Y.reshape((-1, 1)))[:, 0]
    return X, Y

In [3]:
def evaluate_on_full_data(X_train, Y_train, X_test, Y_test, k):
    ''' Model training on full data to compare with coreset results,
        returns mean squared error on the testing set after training
        on the full training dataset '''
    model = RandomForestRegressor(max_leaf_nodes=k,n_jobs=-1,random_state=1)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    return mean_squared_error(Y_test, Y_pred)

In [4]:
def evaluate_on_coreset(coreset, X_train, Y_train, X_test, Y_test, k):
    ''' Model training on coreset, returns mean squared error on the
        testing set after training on a subset of the training dataset '''
    X_coreset, Y_coreset, weights = coreset.X, coreset.Y, coreset.weights
    model_coreset = RandomForestRegressor(max_leaf_nodes=k,n_jobs=-1,random_state=1)
    model_coreset.fit(X_coreset, Y_coreset, sample_weight=weights)
    Y_pred_coreset = model_coreset.predict(X_test)
    return mean_squared_error(Y_test, Y_pred_coreset)

In [5]:
def hyperparameter_tuning(X_train, Y_train, X_test, Y_test, param_grid):
    rf = RandomForestRegressor(n_jobs=-1, random_state=1)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, Y_train)

    best_k = grid_search.best_params_['max_leaf_nodes']
    print(f"Best k value: {best_k}")

    best_model = grid_search.best_estimator_
    Y_pred = best_model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    print(f"Mean Squared Error with best k: {mse}")

In [6]:
if __name__ == "__main__":
    # X,Y=get_air_quality(3000)
    # X,Y=get_circles(24000,10000)
    X,Y=get_gesture_phase(3000)
    X, Y = scale_data(X, Y)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
    data_train = SparseData(X_train, Y_train)
    
    # epsilons for air quality
    epsilons=[0.0045, 0.0079, 0.016, 0.0165, 0.0194,0.0254 , 0.0266, 0.0328 , 0.0368, 0.0369, 0.04]
    # epsilons for circle
    # epsilons=[0.01,0.2,0.4,0.6]
    k_values = [2000]
    k=2000
    coreset_verbose = True 
    use_exact_bicriteria_values=[True]
    error_full = evaluate_on_full_data(X_train, Y_train, X_test, Y_test, k)
    for use_exact_bicriteria in use_exact_bicriteria_values:
        print("\nConstructing coresets using exact bicriteria: {}\n".format(
            use_exact_bicriteria))
        for epsilon in epsilons:
            print("\nConstructing coreset for epsilon = {}\n".format(epsilon))
            for k in k_values:
                print("\nConstructing coreset for k = {}\n".format(k))
                # coreset construction
                coreset, coreset_smoothed = dt_coreset(
                    data_train, k, epsilon, verbose=coreset_verbose,
                    use_exact_bicriteria=use_exact_bicriteria)

                # error_coreset=0
                # error_coreset_smoothed=0
                # for i in range(10):
                #     error_coreset += evaluate_on_coreset(
                #         coreset, X_train, Y_train, X_test, Y_test, k)*0.1
                #     error_coreset_smoothed += evaluate_on_coreset(
                #         coreset_smoothed, X_train, Y_train, X_test, Y_test, k)*0.1
                error_coreset=evaluate_on_coreset(coreset, X_train, Y_train, X_test, Y_test, k)
                error_coreset_smoothed=evaluate_on_coreset(coreset_smoothed, X_train, Y_train, X_test, Y_test, k)

                print(("Using 100% of the training set ({} examples):\n" +
                       "\tTesting error (full data):\t\t{:.5f}").format(
                          len(X_train), error_full))
                print(("Using {:.2f}% of the training set " +
                      " (coreset of {} examples):").format(
                          coreset.size / float(len(X_train)) * 100,
                          coreset.size))
                
                # pick a uniform sample of the size of the coreset from full data and train on it. then calculate the error on in
                uniform_sample_indices=np.random.choice(len(X_train),coreset.size,replace=False)
                X_uniform_sample = X_train[uniform_sample_indices]
                Y_uniform_sample = Y_train[uniform_sample_indices]
                error_uniform_sample = evaluate_on_full_data(X_uniform_sample, Y_uniform_sample, X_test, Y_test, k)

                    
                print("\tTesting error (original coreset):\t{:.5f}".format(
                      error_coreset))
                print("\tTesting error (smoothed coreset):\t{:.5f}".format(
                      error_coreset_smoothed))
                print("\tTesting error (uniform sample):\t\t{:.5f}".format(error_uniform_sample))
                # save results to csv
                with open("./Results/top_GP_rf.csv", "a") as f:
                    f.write("{},{},{},{},{},{},{}\n".format(epsilon,k, coreset.size,error_full, error_coreset, error_coreset_smoothed,error_uniform_sample))
        
    
    
    


Constructing coresets using exact bicriteria: True


Constructing coreset for epsilon = 0.0045


Constructing coreset for k = 2000

Expected approximate coreset size: 49383
bicriteria: epsilon=0.0045 sigma=35144.94978803002
bicriteria segments:  2000
balanced partition: alpha = 1.00000, beta = 1.00000,gamma = 0.00450, sigma = 35144.94979:
balanced partition segments: 2080
Using 100% of the training set (142444 examples):
	Testing error (full data):		0.31302
Using 5.83% of the training set  (coreset of 8300 examples):
	Testing error (original coreset):	0.68053
	Testing error (smoothed coreset):	0.79320
	Testing error (uniform sample):		0.46777

Constructing coreset for epsilon = 0.0079


Constructing coreset for k = 2000

Expected approximate coreset size: 16024
bicriteria: epsilon=0.0079 sigma=35144.94978803002
bicriteria segments:  2000
balanced partition: alpha = 1.00000, beta = 1.00000,gamma = 0.00790, sigma = 35144.94979:
balanced partition segments: 1316
Using 100% of the trainin