## Imports 

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# General
import numpy as np
import pandas as pd 
from random import *

# SK general 
from sklearn import model_selection
from sklearn import datasets
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# AutoML Libraries tpot and auto-sklearn
from tpot import TPOTClassifier
# from autosklearn.classification import AutoSklearnClassifier

## Simulated Annealing

In [10]:
# Simulated Annealing function steps
# @parameters 
# sol - random solution (ML model)
# X_train, y_train - training data
# X_test, y_test - testing data
# @return solution and cost

# 1. Generate a random solution
# 2. Calculate its cost using a cost function (accuracy of the ML Model)
# 3. Generate a random neigboring solution
# 4. Calculate new solutions cost (accuracy of the ML model)
# 5. Compare solutions
#     - If c_new > c_old move to the new solution
#     - If c_new < c_old maybe move to the new solution
# 6. Repeat steps until an acceptable solution is found or max number of iterations is reached
def sa(sol, X_train, y_train, X_test, y_test):
    t = 1.0
    t_min = 0.00001
    alpha = 0.9
    old_cost = cost(sol, X_train, y_train, X_test, y_test)
    while t > t_min:
        i = 1
        while i <= 100:
            new_sol = neighbor(sol)
            new_cost = cost(new_sol, X_train, y_train, X_test, y_test)
            ap = acceptance_probability(old_cost, new_cost, t)
            rnd = random()
            # print(ap, rnd)
            if ap > rnd:
                sol = new_sol
                old_cost = new_cost
            i += 1
        t = t * alpha
    return sol, old_cost          

In [11]:
# Function to generate neighboring solution
# Solution is defined as a machine learning model along with a set of parameters
# i.e.,
# solution = {
#     model: 'LinearRegression',
#     parameters: {
#         fit_intercept: true,
#         normalize: true,
#         copy_X: false,
#         n_jobs: 4,
#     }
# }
def neighbor(sol):
    # Use the search_space to find a new neighbor of the current solution and return that
    
    # Chose a random parameter and modify it 
    parameter = choice(list(search_space[sol['name']].keys())) # random.choice()
    # print('Selected parameter ', parameter)
    
    parameter_is_tuple = type(search_space[sol['name']][parameter]) is tuple
    
    if parameter_is_tuple:
        parameter_space = search_space[sol['name']][parameter][0]
    else:
        parameter_space = search_space[sol['name']][parameter]
    # print('Parameter space', parameter_space)
    
    # Grab the random parameter from our current solution and change it
    if parameter_is_tuple:
        current_parameter_val = sol['parameters'][parameter][0]
    else:
        current_parameter_val = sol['parameters'][parameter]
    # print('Current parameter value', current_parameter_val)
    
    # Grab the current index of the selected parameter of our model
    current_index = parameter_space.index(current_parameter_val)
    
    # In one step modify the value of the selected parameter
    if current_index == 0:
        # index = 0 -> index++
        if parameter_is_tuple:
            modified_parameter = (parameter_space[1],)
        else:
            modified_parameter = parameter_space[1];
    elif current_index == len(parameter_space):
        # index = length -> index--
        if parameter_is_tuple:
            modified_parameter = (parameter_space[current_index - 1],)
        else:
            modified_parameter = parameter_space[current_index - 1]
    else: 
        # index = index + random(-1,1)
        if parameter_is_tuple:
             modified_parameter = (parameter_space[(current_index + choice([-1, 1])) % len(parameter_space)],)
        else:
            modified_parameter = parameter_space[(current_index + choice([-1, 1])) % len(parameter_space)]
    
    # Create a new solution copy the current one and replace the randomly chosen parameter
    new_sol = sol
    new_sol['parameters'][parameter] = modified_parameter
    
    # print('New parameter value', modified_parameter)
    
    # Return new neighboring solution
    return new_sol

In [12]:
# Function to return the cost of a current solution
# In our case the cost is the accuracy (or we can use other metrics) of the current ML model / parameter configuration
# TODO: RMSE or some other metric?
def cost(sol, X_train, y_train, X_test, y_test): 
    # Get model
    model = get_model(sol['name'], sol['parameters'])
    
    # Train model on data
    model.fit(X_train, y_train)
    
    # Predictions
    # predictions = model.predict(X_test)
    
    # Get accuracy or some other metric and return
    score = model.score(X_test, y_test)
    # print('Score: ', score)
    return score

In [13]:
# Function to get a model with a parameter configuration
def get_model(name, parameters):
    # LogisticRegression
    if name == 'LogisticRegression':
        lr = LogisticRegression(**parameters)
        return lr
    # Model2
    elif name == 'MLPClassifier':
        abc = MLPClassifier(**parameters)
        return abc
    # Model3
    elif name == 'SGDClassifier':
        sgd = SGDClassifier(**parameters)
        return sgd
    # Model4
    elif name == 'SVC':
        svc = SVC(**parameters)
        return svc
    # Model5
    elif name == 'RandomForestClassifier':
        rfc = RandomForestClassifier(**parameters)
        return rfc
    # Default
    else:
        print('No model provided')
        return

In [14]:
# Function which recommends if we should jump to a new solutions or not
# 1.0 - definitely switch
# 0.0 - definitely stay put
# 0.5 - 50/50 odds of switching
# Usually calculated by e^((c_new - c_old)/t)

def acceptance_probability(old_cost, new_cost, t): 
    if new_cost > old_cost:
        return 1.0
    else:
        return np.exp((new_cost - old_cost)/t)

## ML Models

In [15]:
# Search Space that will be used to define our neighborhood of ML models and hyperparameters
# Basically our dictionary defining the model, its most important parameters, and their value ranges
search_space = {
    'LogisticRegression': {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(-4, 4, 20).tolist(),
        'solver': ['liblinear', 'saga'],
    },
    'MLPClassifier': {
        'hidden_layer_sizes': (np.arange(1, 100, 1).tolist(), ),
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'alpha': np.logspace(-4, 1, 10).tolist(),
        'learning_rate': ['constant', 'invscaling', 'adaptive'],       
    },
    'SGDClassifier': {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'alpha': np.logspace(-4, 1, 10).tolist(),
        'eta0': np.logspace(-4, 1, 10).tolist(),
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
    },
    'SVC': {
        'C': np.logspace(-4, 1, 10).tolist(),
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    },
    'RandomForestClassifier': {
        'n_estimators': np.arange(1, 100, 1).tolist(),
        'criterion': ['gini', 'entropy'],
        'max_depth': np.arange(2, 50, 1).tolist(),
        'min_samples_split': np.arange(0.1, 1.1, 0.1).tolist()
    }
}

## Datasets

In [8]:
# Iris
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [14]:
# Handwritten Digits
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [4]:
# Wine Quality Data 3
wwine = pd.read_csv("./data/winequality-white.csv", sep=';')
labels = ["poor", "average", "excellent"]
wwine['class'] = pd.cut(wwine['quality'], bins = 3, labels = labels)
wwine['class'] = pd.cut(wwine['quality'], bins = 3, labels = False)

X = wwine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']]
y = wwine['class']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.33,random_state=123)

# standadise
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [12]:
# Dataset 4

## Evaluation

In [17]:
# Some random solution that we will pass to the sa() to start with
models = []

# LogisticRegression
models.append({
    'name': 'LogisticRegression',
    'parameters': {
        'penalty': 'l2',
        'C': 0.23357214690901212,
        'solver': 'liblinear',
        'n_jobs': -1
    }
})

# MLPClassifier
models.append({
    'name': 'MLPClassifier',
    'parameters': {
        'hidden_layer_sizes': (1,),
        'activation': 'relu',
        'solver': 'adam',
        'alpha': 0.0001,
        'learning_rate': 'constant',       
    }
})

# # SGDClassifier
models.append({
    'name': 'SGDClassifier',
    'parameters': {
        'penalty': 'l1',
        'loss': 'hinge',
        'alpha': 0.0001,
        'eta0': 0.0001,
        'learning_rate': 'constant'
    }
})

# # SVC
models.append({
    'name': 'SVC',
    'parameters': {
        'C': 0.0001,
        'kernel': 'linear',
        'gamma': 'scale',
    }
})

# RandomForest
models.append({
    'name': 'RandomForestClassifier',
    'parameters': {
        'n_estimators': 1,
        'criterion': 'gini',
        'max_depth': 2,
        'min_samples_split': 0.1,
    }
})

In [18]:
# Save all solutions
solutions = []

# Save best solution
best_solution = {
    'solution': '',
    'score': 0
}

# Iterate over models and get the best solution / score
for i in models:
    solution, score = sa(i, X_train, y_train, X_test, y_test)
    print('Done with ', solution['name'])
    solutions.append({
       'solution': solution,
       'score': score
    })
    
    if score > best_solution['score']:
        best_solution = {
            'solution': solution,
            'score': score
        }
    
# Print results
print('Best solution: ', best_solution)
print('All solutions', solutions)

Done with  LogisticRegression
Done with  MLPClassifier
Done with  SGDClassifier
Done with  SVC
Done with  RandomForestClassifier
Best solution:  {'solution': {'name': 'MLPClassifier', 'parameters': {'hidden_layer_sizes': (68,), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.016681005372000592, 'learning_rate': 'constant'}}, 'score': 0.7520098948670377}
All solutions [{'solution': {'name': 'LogisticRegression', 'parameters': {'penalty': 'l1', 'C': 10000.0, 'solver': 'liblinear', 'n_jobs': -1}}, 'score': 0.7235621521335807}, {'solution': {'name': 'MLPClassifier', 'parameters': {'hidden_layer_sizes': (68,), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.016681005372000592, 'learning_rate': 'constant'}}, 'score': 0.7520098948670377}, {'solution': {'name': 'SGDClassifier', 'parameters': {'penalty': 'l1', 'loss': 'squared_loss', 'alpha': 0.004641588833612782, 'eta0': 0.0001, 'learning_rate': 'invscaling'}}, 'score': 0.7309833024118738}, {'solution': {'name': 'SVC', 'parameters'

## TPOT

In [6]:
# Get TPOT classification optimizer
tpot_automl = TPOTClassifier(generations = 5, population_size = 20, cv = 5, random_state = 42, n_jobs = -1)

# Fit on dataset
tpot_automl.fit(X_train, y_train)

# Get score
print('Score: ', tpot_automl.score(X_test, y_test))
tpot_automl.export('tpot_whitewine_pipeline.py')
# tpot_automl.export('tpot_iris_pipeline.py')
# tpot_automl.export('tpot_digits_pipeline.py')

Score:  0.769325912183055


## Auto-sklearn

In [12]:
# Get auto-sklearn classifier
sklearn_automl = AutoSklearnClassifier()

# Fit on dataset
sklearn_automl.fit(X_train, y_train)

# y_hat = sklearn_automl.predict(X_test)

# Get score
print('Score: ', sklearn_automl.score(X_test, y_test))
print(sklearn_automl.sprint_statistics())
print(sklearn_automl.show_models())

Score:  1.0
auto-sklearn results:
  Dataset name: ed217d05461c6dff1ebbcd34c23a0766
  Metric: accuracy
  Best validation score: 0.971429
  Number of target algorithm runs: 3168
  Number of successful target algorithm runs: 2928
  Number of crashed target algorithm runs: 240
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0

[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'categorical_encoding:__choice__': 'no_encoding', 'classifier:__choice__': 'k_nearest_neighbors', 'imputation:strategy': 'mean', 'preprocessor:__choice__': 'liblinear_svc_preprocessor', 'rescaling:__choice__': 'minmax', 'classifier:k_nearest_neighbors:n_neighbors': 2, 'classifier:k_nearest_neighbors:p': 2, 'classifier:k_nearest_neighbors:weights': 'distance', 'preprocessor:liblinear_svc_preprocessor:C': 418.0268755058258, 'preprocessor:liblinear_svc_preprocessor:dual': 'False', 'preprocessor:liblinear_svc_preprocessor:fit

## Results

In [None]:
# Iris data output
# Done with  LogisticRegression
# Done with  MLPClassifier
# Done with  SGDClassifier
# Done with  SVC
# Done with  RandomForestClassifier
# Best solution
iris_best = {
    'solution': {
        'name': 'LogisticRegression', 
        'parameters': {
            'penalty': 'l2', 
            'C': 0.0018329807108324356, 
            'solver': 'saga', 
            'n_jobs': -1
        }
    }, 
    'score': 1.0
}
#All solutions
iris_all = [{
    'solution': {
        'name': 'LogisticRegression', 
        'parameters': {
            'penalty': 'l2', 
            'C': 0.0018329807108324356, 
            'solver': 'saga', 
            'n_jobs': -1
        }
    }, 
    'score': 1.0
}, 
{
    'solution': {
        'name': 'MLPClassifier', 
        'parameters': {
            'hidden_layer_sizes': (48,), 
            'activation': 'tanh', 
            'solver': 'sgd', 
            'alpha': 0.05994842503189409, 
            'learning_rate': 'constant'
        }
    }, 
    'score': 1.0
}, 
{
    'solution': {
        'name': 'SGDClassifier', 
        'parameters': {
            'penalty': 'l1', 
            'loss': 'modified_huber', 
            'alpha': 0.001291549665014884, 
            'eta0': 0.0001, 
            'learning_rate': 'adaptive'
        }
    }, 
    'score': 1.0
}, 
{
    'solution': {
        'name': 'SVC', 
     'parameters': {
         'C': 0.05994842503189409, 
         'kernel': 'poly', 
         'gamma': 'scale'
     }
    }, 
    'score': 1.0
}, 
{
    'solution': {
        'name': 'RandomForestClassifier', 
        'parameters': {
             'n_estimators': 82, 
             'criterion': 'gini', 
             'max_depth': 9, 
             'min_samples_split': 0.1
        }
    }, 
    'score': 1.0
}]

In [None]:
# Digits dataset output
digits_best =  {
    'solution': {
        'name': 'SVC', 
        'parameters': {
            'C': 0.00035938136638046257, 
            'kernel': 'poly', 
            'gamma': 'scale'
        }
    }, 
    'score': 0.9907407407407407
}

digits_all = [{
    'solution': {
        'name': 'LogisticRegression', 
        'parameters': {
            'penalty': 'l2', 
            'C': 0.00026366508987303583, 
            'solver': 'saga', 
            'n_jobs': -1
        }
    }, 
    'score': 0.9722222222222222
}, {
    'solution': {
        'name': 'MLPClassifier', 
        'parameters': {
            'hidden_layer_sizes': (15,), 
            'activation': 'tanh', 
            'solver': 'sgd', 
            'alpha': 0.00035938136638046257, 
            'learning_rate': 'adaptive'
        }
    }, 
    'score': 0.987037037037037
}, {
    'solution': {
        'name': 'SGDClassifier', 
        'parameters': {
            'penalty': 'elasticnet', 
            'loss': 'hinge', 
            'alpha': 0.00035938136638046257, 
            'eta0': 0.001291549665014884, 
            'learning_rate': 'constant'
        }
    }, 
    'score': 0.9740740740740741
}, {
    'solution': {
        'name': 'SVC', 
        'parameters': {
            'C': 0.00035938136638046257, 
            'kernel': 'poly', 
            'gamma': 'scale'
        }
    }, 
    'score': 0.9907407407407407
}, {
    'solution': {
        'name': 'RandomForestClassifier', 
        'parameters': {
            'n_estimators': 16, 
            'criterion': 'gini', 
            'max_depth': 15, 
            'min_samples_split': 0.2
        }
    }, 
    'score': 0.9055555555555556
}]

In [None]:
# White Wine Dataset
Whitewine_best_solution = { 'solution': 
                           { 'name': 'MLPClassifier', 'parameters': 
                            { 'hidden_layer_sizes': (68), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.016681005372000592, 'learning_rate': 'constant' }
                           }, 
                           'score': 0.7520098948670377 }

Whitewine_all_solutions = [
    { 'solution': { 'name': 'LogisticRegression', 'parameters': { 'penalty': 'l1', 'C': 10000.0, 'solver': 'liblinear', 'n_jobs': -1 } }, 'score': 0.7235621521335807 }, 
    { 'solution': { 'name': 'MLPClassifier', 'parameters': { 'hidden_layer_sizes': (68), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 0.016681005372000592, 'learning_rate': 'constant' } }, 'score': 0.7520098948670377 }, 
    { 'solution': { 'name': 'SGDClassifier', 'parameters': { 'penalty': 'l1', 'loss': 'squared_loss', 'alpha': 0.004641588833612782, 'eta0': 0.0001, 'learning_rate': 'invscaling' } }, 'score': 0.7309833024118738 }, 
    { 'solution': { 'name': 'SVC', 'parameters': { 'C': 0.001291549665014884, 'kernel': 'poly', 'gamma': 'auto' } }, 'score': 0.7421150278293135 }, 
    { 'solution': { 'name': 'RandomForestClassifier', 'parameters': { 'n_estimators': 28, 'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 0.1 } }, 'score': 0.7396413110698825 }
]

# TPot Score
Score = "0.769325912183055"