In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression


from sklearn.tree import DecisionTreeRegressor

In [57]:

# NOTE: this cell calls tuneBlackBox, which (together with
# DropoutRegularization) is defined in cells further down the notebook.
# Those definition cells must be executed before this one.

# Generate synthetic regression data: 1200 samples, 30 features (18 informative).
X, Y = make_regression(n_samples=1200, n_features=30, n_informative=18, noise=10, random_state=42)

# Initialize the Decision Tree
blackbox = DecisionTreeRegressor(random_state=42)

# DropoutRegularization draws its dropout masks from NumPy's global RNG
# (np.random.rand), so seed it here; otherwise the cross-validation scores,
# and possibly the selected parameter, change from run to run.
np.random.seed(42)

# test on tuneBlackBox
best_param = tuneBlackBox(blackbox, X, Y, 'Dropout', 5, 'MSE', monteCarloReplicates=100)

[0.         0.01724138 0.03448276 0.05172414 0.06896552 0.0862069
 0.10344828 0.12068966 0.13793103 0.15517241 0.17241379 0.18965517
 0.20689655 0.22413793 0.24137931 0.25862069 0.27586207 0.29310345
 0.31034483 0.32758621 0.34482759 0.36206897 0.37931034 0.39655172
 0.4137931  0.43103448 0.44827586 0.46551724 0.48275862 0.5       ]
[74493.29389624 80049.86998155 82801.91894734 81184.48730159
 80903.23401794 78229.00148686 84941.27270699 77811.04793429
 76739.91115542 79655.14893686 77344.51727082 84711.82813555
 81053.39039944 88895.73809576 81891.26405744 80831.90707695
 79523.16370202 78276.13292682 75492.27087912 83717.60753761
 80056.09381117 84660.61003105 79434.03528683 90021.2886381
 77855.4684022  84366.53950088 87959.01568351 95649.38025641
 91844.87868122 92301.30679516]
Number of folds for cross-validation: 5
Best parameter for Dropout with MSE loss: 0.0
Average MSE error for the best Dropout parameter: 74493.29389624108


In [56]:
def tuneBlackBox(blackbox, X, Y, regularization, n_folds, loss, monteCarloReplicates = 10, columnBounds = 10):
    """Select a regularization parameter for ``blackbox`` by k-fold cross-validation.

    Every candidate parameter is evaluated on ``n_folds`` folds produced by
    ``KFold(n_splits=n_folds)``. For each fold the selected regularization
    routine is called once and returns an error; the per-fold errors are
    averaged, and the candidate with the smallest average error is returned.
    A short summary of the winner is printed.

    Parameters
    ----------
    blackbox : object
        Model handed unchanged to the regularization routine on every call.
    X, Y : array-like
        Features and targets. They are indexed with the integer index arrays
        produced by ``KFold.split`` (``X[train_index]``), so NumPy arrays work.
    regularization : str
        Which regularization to tune. Only ``'Dropout'`` is implemented; it
        searches 30 evenly spaced values in ``[0, 0.5]``.
    n_folds : int
        Number of cross-validation folds.
    loss : str
        Loss identifier forwarded to the regularization routine.
    monteCarloReplicates : int, default 10
        Forwarded to the regularization routine.
    columnBounds : int, default 10
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    The candidate parameter with the lowest average cross-validation error.

    Raises
    ------
    NotImplementedError
        If ``regularization`` is anything other than ``'Dropout'``.
    """
    # Resolve the regularization first so an unsupported choice fails
    # immediately with a clear message (previously: a NameError on the
    # undefined regularization function / parameter grid).
    if regularization == 'Dropout':
        regParam = np.linspace(0, .5, 30)
        regularization_function = DropoutRegularization
    elif regularization == 'NoiseAddition':
        # TODO: define the parameter grid and NoiseAdditionRegularization.
        raise NotImplementedError(
            "NoiseAddition regularization is not implemented yet: "
            "no parameter grid or NoiseAdditionRegularization function is defined."
        )
    else:
        # Any other value is routed to the (not yet written) robust
        # regularization branch.
        # TODO: define the parameter grid and RobustRegularization.
        raise NotImplementedError(
            f"Regularization {regularization!r} is not implemented yet; "
            "only 'Dropout' is currently supported."
        )

    # TODO: all regularizations may require normalized data; consider a shared
    # normalization step applied before the regularization routine is called.

    # Split X, Y into k folds.
    kf = KFold(n_splits = n_folds)

    # One average cross-validation error per candidate parameter.
    results = np.zeros(len(regParam))

    for index, param in enumerate(regParam):
        # Train / score once per fold for this candidate.
        fold_errors = [
            regularization_function(
                X[train_index], Y[train_index],
                X[test_index], Y[test_index],
                monteCarloReplicates, param, blackbox, loss,
            )
            for train_index, test_index in kf.split(X)
        ]
        results[index] = sum(fold_errors) / len(fold_errors)

    # Find the parameter that minimizes the loss and its error.
    min_error_index = np.argmin(results)
    best_param = regParam[min_error_index]
    best_error = results[min_error_index]

    # Print the best parameter and its error.
    print(f"Number of folds for cross-validation: {n_folds}")
    print(f"Best parameter for {regularization} with {loss} loss: {best_param}")
    print(f"Average {loss} error for the best {regularization} parameter: {best_error}")

    # TODO: also return the trained model.
    return best_param

        

                               
                           


In [50]:
def DropoutRegularization(X_train, Y_train, X_test, Y_test, monteCarloReplicates, dropoutParam, blackbox, loss):
    """Fit ``blackbox`` on dropout-augmented training data and score it on the test data.

    ``monteCarloReplicates`` noisy copies of ``X_train`` are created. In each
    copy every entry is independently set to 0 with probability
    ``dropoutParam`` (so 0.2 means 20% dropped / 80% retained), and, when
    ``dropoutParam`` is non-zero, the whole copy is divided by
    ``1 - dropoutParam``. All copies are stacked row-wise, with ``Y_train``
    repeated to match, and ``blackbox`` is fitted once on the stacked data.

    Parameters
    ----------
    X_train, Y_train : numpy.ndarray
        Training features and targets. ``X_train`` is copied, never modified.
    X_test, Y_test : array-like
        Held-out features and targets used to compute the returned error.
    monteCarloReplicates : int
        Number of dropout copies to generate; must be at least 1.
    dropoutParam : float
        Probability of zeroing each entry; must satisfy ``0 <= dropoutParam < 1``.
    blackbox : object
        Model exposing ``fit(X, y)`` (returning the fitted model) and
        ``predict(X)``. It is fitted in place by this call.
    loss : str
        ``'MSE'`` (``mean_squared_error``) or ``'MAD'`` (``mean_absolute_error``).

    Returns
    -------
    The error of the single fitted model on the test data. Note that this is
    NOT an average over the replicates: one model is trained on all of them.

    Raises
    ------
    ValueError
        If ``loss`` is not supported, ``dropoutParam`` is outside ``[0, 1)``,
        or ``monteCarloReplicates`` is smaller than 1.

    Notes
    -----
    Masks come from NumPy's global RNG (``np.random.rand``); call
    ``np.random.seed`` beforehand for reproducible results.
    """
    # Validate everything up front so bad arguments fail before the
    # (potentially expensive) fit instead of after it.
    if loss not in ('MSE', 'MAD'):
        raise ValueError(f"Unsupported loss {loss!r}; expected 'MSE' or 'MAD'.")
    if not 0 <= dropoutParam < 1:
        # dropoutParam == 1 would divide by zero in the rescaling below.
        raise ValueError(f"dropoutParam must satisfy 0 <= dropoutParam < 1, got {dropoutParam!r}.")
    if monteCarloReplicates < 1:
        raise ValueError(f"monteCarloReplicates must be at least 1, got {monteCarloReplicates!r}.")

    # TODO: data still needs to be normalized before dropout is applied.

    X_replicates = []
    for _ in range(monteCarloReplicates):
        # True where an entry is dropped (probability dropoutParam per entry).
        dropout_mask = np.random.rand(*X_train.shape) < dropoutParam
        # Work on a copy so the caller's X_train is left untouched.
        X_dropped = X_train.copy()
        X_dropped[dropout_mask] = 0
        if dropoutParam != 0:
            # Rescale by 1 / (retain probability).
            X_dropped = X_dropped / (1 - dropoutParam)
        X_replicates.append(X_dropped)

    # Stack the replicates into one large training set with matching targets.
    X_train_concatenated = np.concatenate(X_replicates, axis=0)
    Y_train_concatenated = np.concatenate([Y_train] * monteCarloReplicates, axis=0)

    # Fit once on the stacked data, then predict on the untouched test data.
    fitted_model = blackbox.fit(X_train_concatenated, Y_train_concatenated)
    y_pred = fitted_model.predict(X_test)

    # Score with the requested loss metric.
    if loss == 'MSE':
        return mean_squared_error(Y_test, y_pred)
    return mean_absolute_error(Y_test, y_pred)
    

In [None]:
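# TODO: implement NoiseAdditionRegularization (same signature as DropoutRegularization); tuneBlackBox has a 'NoiseAddition' branch waiting for it.
#def NoiseAdditionRegularization():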


In [None]:
                           
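# TODO: implement RobustRegularization (same signature as DropoutRegularization); tuneBlackBox's final else branch is reserved for it.
#def RobustRegularization():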
