## Imports 

In [23]:
import numpy as np
from sklearn import model_selection
from sklearn import datasets
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from random import *
import warnings
warnings.filterwarnings("ignore")

# AutoML Libraries tpot and auto-sklearn
from tpot import TPOTClassifier
from autosklearn.classification import AutoSklearnClassifier



## Simulated Annealing

In [None]:
# Simulated Annealing function steps
# @parameters 
# sol - random solution (ML model)
# X_train, y_train - training data
# X_test, y_test - testing data
# @return solution and cost

# 1. Generate a random solution
# 2. Calculate its cost using a cost function (accuracy of the ML Model)
# 3. Generate a random neighboring solution
# 4. Calculate new solutions cost (accuracy of the ML model)
# 5. Compare solutions
#     - If c_new > c_old move to the new solution
#     - If c_new < c_old maybe move to the new solution
# 6. Repeat steps until an acceptable solution is found or max number of iterations is reached
def sa(sol, X_train, y_train, X_test, y_test, *, t=1.0, t_min=0.00001,
       alpha=0.9, steps_per_temp=100):
    """Run simulated annealing starting from ``sol`` and return the final state.

    At every temperature level ``steps_per_temp`` neighbors are proposed via
    ``neighbor()``, scored via ``cost()`` and accepted when
    ``acceptance_probability()`` exceeds a fresh ``random()`` draw. After each
    level the temperature is multiplied by ``alpha`` until it drops to
    ``t_min`` or below.

    Parameters:
        sol: starting solution (dict with 'name' and 'parameters').
        X_train, y_train: training data forwarded to ``cost()``.
        X_test, y_test: evaluation data forwarded to ``cost()``.
        t: initial temperature (keyword-only).
        t_min: temperature at which the search stops; must be > 0.
        alpha: geometric cooling factor; must satisfy 0 < alpha < 1.
        steps_per_temp: number of neighbor proposals per temperature level.

    Returns:
        (solution, cost) of the last accepted state. This is the current
        state of the walk, not necessarily the best state visited.

    Raises:
        ValueError: if ``alpha`` or ``t_min`` would keep the cooling loop from
            terminating.
    """
    # Guard the schedule: alpha >= 1 never cools, and a non-positive t_min is
    # never (or only after float underflow) reached.
    if not 0 < alpha < 1:
        raise ValueError("alpha must be in the open interval (0, 1)")
    if t_min <= 0:
        raise ValueError("t_min must be positive")

    old_cost = cost(sol, X_train, y_train, X_test, y_test)
    while t > t_min:
        for _ in range(steps_per_temp):
            new_sol = neighbor(sol)
            new_cost = cost(new_sol, X_train, y_train, X_test, y_test)
            ap = acceptance_probability(old_cost, new_cost, t)
            # Accept better solutions always (ap == 1.0) and worse ones with
            # a probability that shrinks as the temperature falls.
            if ap > random():
                sol = new_sol
                old_cost = new_cost
        t *= alpha
    return sol, old_cost

In [None]:
# Function to generate neighboring solution
# Solution is defined as a machine learning model along with a set of parameters
# i.e.,
# solution = {
#     'name': 'LogisticRegression',
#     'parameters': {
#         'penalty': 'l2',
#         'C': 0.23357214690901212,
#         'solver': 'liblinear',
#         'n_jobs': -1,
#     }
# }
def neighbor(sol, space=None):
    """Return a new solution that differs from ``sol`` in one hyperparameter.

    One parameter name is drawn at random from the model's entry in the
    search space, and its value is moved one position along that parameter's
    list of candidate values (towards the inside of the list when the current
    value sits on either end). Parameters that are in ``sol`` but not in the
    search space are carried over unchanged.

    Parameters:
        sol: dict with keys 'name' (model name) and 'parameters' (dict of the
            current parameter values).
        space: optional search space mapping model name -> parameter name ->
            list of candidate values. Defaults to the module-level
            ``search_space``.

    Returns:
        A new solution dict. ``sol`` and ``sol['parameters']`` are not
        modified, so a caller that rejects the move keeps its old state.

    Raises:
        KeyError: if the model or the chosen parameter is missing.
        ValueError: if the current value is not in the candidate list.
    """
    if space is None:
        space = search_space

    # Choose a random parameter of this model and look up its candidates
    model_space = space[sol['name']]
    parameter = choice(list(model_space))
    parameter_space = model_space[parameter]

    # Position of the current value inside the candidate list
    current_index = parameter_space.index(sol['parameters'][parameter])
    last_index = len(parameter_space) - 1

    if last_index == 0:
        # Only one candidate: there is nowhere to move
        new_index = current_index
    elif current_index == 0:
        # Lower boundary -> step up
        new_index = 1
    elif current_index == last_index:
        # Upper boundary -> step down (no wrap-around to the first value)
        new_index = last_index - 1
    else:
        # Interior -> step one position left or right
        new_index = current_index + choice([-1, 1])

    # Build fresh dicts instead of aliasing ``sol`` so the input stays intact
    return {
        **sol,
        'parameters': {
            **sol['parameters'],
            parameter: parameter_space[new_index],
        },
    }

In [None]:
# Function to return the cost of a current solution
# In our case the cost is the accuracy (or we can use other metrics) of the current ML model / parameter configuration
# TODO: RMSE or some other metric?
def cost(sol, X_train, y_train, X_test, y_test):
    """Fit the model described by ``sol`` and return its score on the test data.

    The estimator is built with ``get_model(sol['name'], sol['parameters'])``,
    trained on ``X_train``/``y_train`` and evaluated with the estimator's own
    ``score(X_test, y_test)`` method (mean accuracy for scikit-learn
    classifiers such as LogisticRegression). Higher values are better.
    """
    estimator = get_model(sol['name'], sol['parameters'])
    estimator.fit(X_train, y_train)
    return estimator.score(X_test, y_test)

In [None]:
# Function to get a model with a parameter configuration
def get_model(name, parameters):
    """Instantiate the estimator called ``name`` with ``parameters``.

    Parameters:
        name: model name as used in the solution dicts, e.g.
            'LogisticRegression'.
        parameters: dict of keyword arguments passed to the estimator's
            constructor.

    Returns:
        A new, unfitted estimator instance.

    Raises:
        ValueError: if ``name`` is not a supported model. Failing here gives
            a clear message instead of returning None and crashing later on
            ``model.fit``.
    """
    if name == 'LogisticRegression':
        return LogisticRegression(**parameters)

    # TODO: add a branch per additional model (Model2 .. Model5) once defined
    raise ValueError(f"Unknown model name: {name!r}")

In [None]:
# Function which recommends whether we should jump to a new solution or not
# 1.0 - definitely switch
# 0.0 - definitely stay put
# 0.5 - 50/50 odds of switching
# Usually calculated by e^((c_new - c_old)/t)

def acceptance_probability(old_cost, new_cost, t):
    """Return the probability of moving from ``old_cost`` to ``new_cost``.

    A strictly better (higher) new cost is always accepted with probability
    1.0. Otherwise the probability is e^((new_cost - old_cost) / t), which
    shrinks as the new solution gets worse or the temperature ``t`` drops.
    """
    is_improvement = new_cost > old_cost
    return 1.0 if is_improvement else np.exp((new_cost - old_cost) / t)

## ML Models

In [None]:
# TODO: Define 5 ML models

# Search space: model name -> hyperparameter name -> ordered list of candidate
# values. neighbor() walks along these lists to produce neighboring solutions.
search_space = {}

# LogisticRegression: penalty type, 20 log-spaced values of C between 1e-4
# and 1e4, and the solver to use.
search_space['LogisticRegression'] = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20).tolist(),
    'solver': ['liblinear', 'saga'],
}

# Placeholder entries for the models that still have to be defined; each one
# gets its own dict so they can be filled in independently later.
search_space.update({
    placeholder_name: {
        'property1': 'value_range',
        'property2': 'value_range',
        'property3': 'value_range',
        'property4': 'value_range',
    }
    for placeholder_name in ('Model2', 'Model3', 'Model4', 'Model5')
})

## Datasets

In [21]:
# TODO: Define datasets

# Iris: load the features and labels, then hold out 30% of the rows for
# testing (fixed random_state so the split is reproducible)
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Evaluation

In [7]:
# Some random solution that we will pass to the sa() to start with

# LogisticRegression
# Model2
# Model3
# Model4
# Model5

# Starting solutions, one per model; each is a dict holding the model name
# and its initial parameter values. Model2..Model5 are empty placeholders.
models = [
    {
        'name': 'LogisticRegression',
        'parameters': {
            'penalty': 'l2',
            'C': 0.23357214690901212,
            'solver': 'liblinear',
            'n_jobs': -1,
        },
    },
    {'name': 'Model2', 'parameters': {}},
    {'name': 'Model3', 'parameters': {}},
    {'name': 'Model4', 'parameters': {}},
    {'name': 'Model5', 'parameters': {}},
]


In [13]:
# Every (solution, score) pair returned by sa(), in model order
solutions = []

# Best pair seen so far; only a score strictly above 0 replaces this default
best_solution = {'solution': '', 'score': 0}

# Anneal each starting model, record the outcome and track the top scorer
for model in models:
    solution, score = sa(model, X_train, y_train, X_test, y_test)
    solutions.append({'solution': solution, 'score': score})

    if score > best_solution['score']:
        best_solution = {'solution': solution, 'score': score}

# Report the results
print('Best solution: ', best_solution)
print('All solutions', solutions)

Best solution:  {'solution': 'x', 'score': 0.95}
All solutions [{'solution': 'x', 'score': 0.5}, {'solution': 'x1', 'score': 0.55}, {'solution': 'x2', 'score': 0.15}, {'solution': 'x3', 'score': 0.95}, {'solution': 'x4', 'score': 0.75}, {'solution': 'x5', 'score': 0.25}]


## TPOT

In [19]:
# Configure the TPOT classification optimizer: 5 generations with a
# population of 20, cv=5, a fixed seed and n_jobs=-1
tpot_automl = TPOTClassifier(
    generations=5,
    population_size=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the training split
tpot_automl.fit(X_train, y_train)

# Report the score on the held-out split
print('Score: ', tpot_automl.score(X_test, y_test))



Score:  1.0


## Auto-sklearn

In [25]:
# Build an auto-sklearn classifier with its default settings
sklearn_automl = AutoSklearnClassifier()

# Run the search on the training split
sklearn_automl.fit(X_train, y_train)

# Report the score on the held-out split
print('Score: ', sklearn_automl.score(X_test, y_test))

Score:  0.9777777777777777
