Template opt #3

Merged: 3 commits (Apr 26, 2018)
154 changes: 103 additions & 51 deletions tpot/base.py
@@ -70,7 +70,7 @@
from .config.classifier_sparse import classifier_config_sparse

from .metrics import SCORERS
-from .gp_types import Output_Array, Transformed_Array, Selected_Array
+from .gp_types import Output_Array
from .gp_deap import eaMuPlusLambda, mutNodeReplacement, _wrapped_cross_val_score, cxOnePoint

# hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS
@@ -200,7 +200,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
String 'TPOT sparse':
TPOT uses a configuration dictionary with a one-hot-encoder and the
operators normally included in TPOT that also support sparse matrices.
-        template: (default: None)
+        template: string (default: None)
A template for pipeline structure
warm_start: bool, optional (default: False)
Flag indicating whether the TPOT instance will reuse the population from
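For orientation, here is a minimal usage sketch of the new parameter. The estimator name comes from TPOT's public API; the other argument values are hypothetical:

    # The template string names one pipeline step per position, separated by '-'.
    # A step is either a broad operator type ('Selector', 'Transformer',
    # 'Classifier', 'Regressor') or a specific operator class name.
    from tpot import TPOTClassifier

    tpot = TPOTClassifier(
        generations=5,
        population_size=20,
        template='Selector-Transformer-Classifier',
        verbosity=2,
    )
    # tpot.fit(X_train, y_train)  # only pipelines matching the template are evolved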
@@ -286,18 +286,29 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
        self.config_dict_params = config_dict
        self._setup_config(self.config_dict_params)

+        self.template = template
+        if template:
+            self.fixed_length = len(template.split('-'))
+            # for now, the template only supports linear pipelines;
+            # support for more complex structures will be added later
+            self._min = self.fixed_length
+            self._max = self.fixed_length + 1
+        else:
+            self.fixed_length = 0
+            self._min = 1
+            self._max = 3
+
        self.operators = []
        self.arguments = []

        for key in sorted(self.config_dict.keys()):
-            op_class, arg_types = TPOTOperatorClassFactory(
+            op_class, _ = TPOTOperatorClassFactory(
                key,
                self.config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType
            )
            if op_class:
                self.operators.append(op_class)
-                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a
        # run-time limit; TPOT will automatically interrupt itself when the timer
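A standalone sketch of the length bookkeeping above, with a hypothetical template. Setting _max to _min + 1 pins the sampled tree height to exactly the template length, assuming _generate draws the height with an exclusive upper bound (an assumption about code not shown in this diff):

    template = 'Selector-Transformer-Classifier'  # hypothetical user input

    if template:
        fixed_length = len(template.split('-'))      # 3 steps
        min_, max_ = fixed_length, fixed_length + 1  # height is always exactly 3
    else:
        fixed_length, min_, max_ = 0, 1, 3           # historical default range

    print(fixed_length, min_, max_)  # prints: 3 3 4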
@@ -450,39 +461,83 @@ def _setup_pset(self):
        self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array)
        self._pset.renameArguments(ARG0='input_matrix')
        self._add_operators()
-        self._add_terminals()

        if self.verbosity > 2:
            print('{} operators have been imported by TPOT.'.format(len(self.operators)))


    def _add_operators(self):
-        for operator in self.operators:
-            if operator.root:
-                # We need to add rooted primitives twice so that they can
-                # return both an Output_Array (and thus be the root of the tree),
-                # and return a np.ndarray so they can exist elsewhere in the tree.
-                p_types = (operator.parameter_types()[0], Output_Array)
-                self._pset.addPrimitive(operator, *p_types)
-
-            self._pset.addPrimitive(operator, *operator.parameter_types())
-
-            # Import required modules into local namespace so that pipelines
-            # may be evaluated directly
-            for key in sorted(operator.import_hash.keys()):
-                module_list = ', '.join(sorted(operator.import_hash[key]))
-
-                if key.startswith('tpot.'):
-                    exec('from {} import {}'.format(key[4:], module_list))
-                else:
-                    exec('from {} import {}'.format(key, module_list))
-
-                for var in operator.import_hash[key]:
-                    self.operators_context[var] = eval(var)
-
-        self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray)
+        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
+        ret_types = []
+        if self.template:
+            steps = self.template.split('-')
+            for idx, step in enumerate(steps):
+                if idx < self.fixed_length - 1:
+                    # create an empty class as the return type for strongly-typed GP
+                    step_ret_type_name = 'Ret_{}'.format(idx)
+                    step_ret_type = type(step_ret_type_name, (object,), {})
+                    ret_types.append(step_ret_type)
+                else:
+                    step_ret_type = Output_Array
+                # input type of each step
+                if idx:
+                    step_in_type = ret_types[idx - 1]
+                else:
+                    step_in_type = np.ndarray
+                if main_type.count(step):  # the step is a main operator type
+                    for operator in self.operators:
+                        arg_types = operator.parameter_types()[0][1:]
+                        if operator.type() == step:
+                            p_types = ([step_in_type] + arg_types, step_ret_type)
+                            self._pset.addPrimitive(operator, *p_types)
+                            self._import_hash(operator)
+                            self._add_terminals(arg_types)
+                else:  # the step is a specific operator
+                    for operator in self.operators:
+                        arg_types = operator.parameter_types()[0][1:]
+                        if operator.__name__ == step:
+                            p_types = ([step_in_type] + arg_types, step_ret_type)
+                            self._pset.addPrimitive(operator, *p_types)
+                            self._import_hash(operator)
+                            self._add_terminals(arg_types)
+        else:  # no template; pipelines are generated freely
+            for operator in self.operators:
+                arg_types = operator.parameter_types()[0][1:]
+                if operator.root:
+                    # We need to add rooted primitives twice so that they can
+                    # return both an Output_Array (and thus be the root of the tree),
+                    # and return a np.ndarray so they can exist elsewhere in the tree.
+                    p_types = (operator.parameter_types()[0], Output_Array)
+                    self._pset.addPrimitive(operator, *p_types)
+
+                self._pset.addPrimitive(operator, *operator.parameter_types())
+
+                # Import required modules into local namespace so that pipelines
+                # may be evaluated directly
+                self._import_hash(operator)
+                self._add_terminals(arg_types)
+
+        self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray)
+        self.ret_types = [np.ndarray, Output_Array] + ret_types
+
+
+    def _import_hash(self, operator):
+        # Import required modules into local namespace so that pipelines
+        # may be evaluated directly
+        for key in sorted(operator.import_hash.keys()):
+            module_list = ', '.join(sorted(operator.import_hash[key]))
+
+            if key.startswith('tpot.'):
+                exec('from {} import {}'.format(key[4:], module_list))
+            else:
+                exec('from {} import {}'.format(key, module_list))
+
+            for var in operator.import_hash[key]:
+                self.operators_context[var] = eval(var)

-    def _add_terminals(self):
-        for _type in self.arguments:
+    def _add_terminals(self, arg_types):
+        for _type in arg_types:
            type_values = list(_type.values)

            for val in type_values:
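To see what the template branch of _add_operators builds, here is a self-contained sketch of the same strongly-typed chaining using DEAP directly. The operator names and bodies are placeholders; only the type wiring mirrors the code above:

    import numpy as np
    from deap import gp

    class Output_Array(object):
        """Final output data type of pipelines (mirrors tpot/gp_types.py)."""

    # one empty return type per non-final template step, like 'Ret_{}'.format(idx)
    Ret_0 = type('Ret_0', (object,), {})
    Ret_1 = type('Ret_1', (object,), {})

    pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array)
    pset.renameArguments(ARG0='input_matrix')

    # step 0 consumes the raw input matrix and returns Ret_0 ...
    pset.addPrimitive(lambda X: X, [np.ndarray], Ret_0, name='SelectPercentile')
    # ... step 1 consumes Ret_0 and returns Ret_1 ...
    pset.addPrimitive(lambda X: X, [Ret_0], Ret_1, name='PCA')
    # ... and the last step consumes Ret_1 and returns Output_Array, so every
    # valid tree is exactly Selector -> Transformer -> Classifier.
    pset.addPrimitive(lambda X: X, [Ret_1], Output_Array, name='LogisticRegression')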
@@ -496,12 +551,16 @@ def _setup_toolbox(self):
        creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti, statistics=dict)

        self._toolbox = base.Toolbox()
-        self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=1, max_=3)
+        self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=self._min, max_=self._max)
        self._toolbox.register('individual', tools.initIterate, creator.Individual, self._toolbox.expr)
        self._toolbox.register('population', tools.initRepeat, list, self._toolbox.individual)
        self._toolbox.register('compile', self._compile_to_sklearn)
        self._toolbox.register('select', tools.selNSGA2)
        self._toolbox.register('mate', self._mate_operator)
-        self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4)
+        if self.fixed_length:
+            self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max)
+        else:
+            self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1)
        self._toolbox.register('mutate', self._random_mutation_operator)
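Continuing the DEAP sketch above: with both bounds pinned to the template length, tree generation can only produce the full three-step, type-correct pipeline. DEAP's stock genGrow treats both bounds as inclusive, so min_ equals max_ here, whereas the registration above assumes _gen_grow_safe's exclusive upper bound:

    expr = gp.genGrow(pset, min_=3, max_=3)  # pset from the earlier sketch
    print(gp.PrimitiveTree(expr))
    # prints: LogisticRegression(PCA(SelectPercentile(input_matrix)))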

@@ -1352,15 +1411,17 @@ def _random_mutation_operator(self, individual, allow_shrink=True):
Returns the individual with one of the mutations applied to it

"""
-        mutation_techniques = [
-            partial(gp.mutInsert, pset=self._pset),
-            partial(mutNodeReplacement, pset=self._pset)
-        ]
-
-        # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives.
-        number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual])
-        if number_of_primitives > 1 and allow_shrink:
-            mutation_techniques.append(partial(gp.mutShrink))
+        if self.fixed_length:
+            mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)]
+        else:
+            mutation_techniques = [
+                partial(gp.mutInsert, pset=self._pset),
+                partial(mutNodeReplacement, pset=self._pset)
+            ]
+            # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives.
+            number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual])
+            if number_of_primitives > 1 and allow_shrink:
+                mutation_techniques.append(partial(gp.mutShrink))

mutator = np.random.choice(mutation_techniques)
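The template branch keeps only point mutation because mutNodeReplacement swaps a node for one with identical input and return types, leaving the tree's shape intact, while gp.mutInsert and gp.mutShrink add or remove nodes and would break a fixed-length pipeline. A small demonstration, using DEAP's stock mutNodeReplacement as a stand-in for TPOT's variant and the pset from the earlier sketch:

    import random
    from deap import gp

    random.seed(0)
    ind = gp.PrimitiveTree(gp.genGrow(pset, min_=3, max_=3))
    mutant, = gp.mutNodeReplacement(gp.PrimitiveTree(ind), pset=pset)
    assert len(mutant) == len(ind)  # node count unchanged; the template still holds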

@@ -1370,23 +1431,14 @@
ind = self._toolbox.clone(individual)
offspring, = mutator(ind)
if str(offspring) not in self.evaluated_individuals_:
# Update statistics
# crossover_count is kept the same as for the predecessor
# mutation count is increased by 1
# predecessor is set to the string representation of the individual before mutation
# generation is set to 'INVALID' such that we can recognize that it should be updated accordingly
offspring.statistics['crossover_count'] = individual.statistics['crossover_count']
offspring.statistics['mutation_count'] = individual.statistics['mutation_count'] + 1
offspring.statistics['predecessor'] = (str(individual),)
offspring.statistics['generation'] = 'INVALID'
break
else:
unsuccesful_mutations += 1

# Sometimes you have pipelines for which every shrunk version has already been explored too.
# To still mutate the individual, one of the two other mutators should be applied instead.
            if ((unsuccesful_mutations == 50) and
-                   (type(mutator) is partial and mutator.func is gp.mutShrink)):
+                    (type(mutator) is partial and mutator.func is gp.mutShrink)):
offspring, = self._random_mutation_operator(individual, allow_shrink=False)

return offspring,
@@ -1414,7 +1466,7 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None):

def condition(height, depth, type_):
"""Stop when the depth is equal to height or when a node should be a terminal."""
-            return type_ not in [np.ndarray, Output_Array] or depth == height
+            return type_ not in self.ret_types or depth == height

return self._generate(pset, min_, max_, condition, type_)
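The widened membership test matters because the intermediate Ret_* types have primitives but no terminals: if condition told _generate to stop at one of them, the tree could never be completed. A quick check against the types from the earlier sketch:

    ret_types = [np.ndarray, Output_Array, Ret_0, Ret_1]  # as assembled in _add_operators

    def condition(height, depth, type_):
        """Stop when the depth is equal to height or when a node should be a terminal."""
        return type_ not in ret_types or depth == height

    assert condition(3, 1, str)        # non-growable type: stop and emit a terminal
    assert not condition(3, 1, Ret_0)  # intermediate template type: keep growing
    assert condition(3, 3, Ret_0)      # reached the target height: stop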

8 changes: 0 additions & 8 deletions tpot/gp_types.py
@@ -26,11 +26,3 @@
class Output_Array(object):
"""Final output data type of pipelines."""
pass

-class Transformed_Array(object):
-    """Transformed data returned by Transformer."""
-    pass
-
-class Selected_Array(object):
-    """Transformed data returned by Selector."""
-    pass
4 changes: 2 additions & 2 deletions tpot/operator_utils.py
@@ -169,9 +169,9 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=
elif issubclass(op_obj, RegressorMixin):
class_profile['root'] = True
optype = "Regressor"
-        elif issubclass(op_obj, TransformerMixin):
+        if issubclass(op_obj, TransformerMixin):
            optype = "Transformer"
-        elif issubclass(op_obj, SelectorMixin):
+        if issubclass(op_obj, SelectorMixin):
            optype = "Selector"

@classmethod
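The elif-to-if change matters because scikit-learn selectors are also transformers (SelectorMixin itself derives from TransformerMixin), so under elif they were labelled "Transformer" and a 'Selector' template step could never match them; independent ifs let the later, more specific check override the label. A quick check (SelectorMixin is importable from sklearn.feature_selection in recent releases; 2018-era versions exposed it as sklearn.feature_selection.base.SelectorMixin):

    from sklearn.base import TransformerMixin
    from sklearn.feature_selection import SelectKBest, SelectorMixin

    optype = None
    if issubclass(SelectKBest, TransformerMixin):
        optype = 'Transformer'
    if issubclass(SelectKBest, SelectorMixin):
        optype = 'Selector'  # the more specific check wins
    print(optype)  # prints: Selector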