Template opt #3

Merged: 3 commits (Apr 26, 2018)
154 changes: 103 additions & 51 deletions tpot/base.py
@@ -70,7 +70,7 @@
from .config.classifier_sparse import classifier_config_sparse

from .metrics import SCORERS
-from .gp_types import Output_Array, Transformed_Array, Selected_Array
+from .gp_types import Output_Array
from .gp_deap import eaMuPlusLambda, mutNodeReplacement, _wrapped_cross_val_score, cxOnePoint

# hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS
@@ -200,7 +200,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
String 'TPOT sparse':
TPOT uses a configuration dictionary with a one-hot-encoder and the
operators normally included in TPOT that also support sparse matrices.
-        template: (default: None)
+        template: string (default: None)
A template for pipeline structure
warm_start: bool, optional (default: False)
Flag indicating whether the TPOT instance will reuse the population from
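For orientation, here is a minimal usage sketch of the new parameter. The estimator name comes from TPOT's public API; the other argument values are hypothetical:

    # The template string names one pipeline step per position, separated by '-'.
    # A step is either a broad operator type ('Selector', 'Transformer',
    # 'Classifier', 'Regressor') or a specific operator class name.
    from tpot import TPOTClassifier

    tpot = TPOTClassifier(
        generations=5,
        population_size=20,
        template='Selector-Transformer-Classifier',
        verbosity=2,
    )
    # tpot.fit(X_train, y_train)  # only pipelines matching the template are evolved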
@@ -286,18 +286,29 @@ def __init__(self, generations=100, population_size=100, offspring_size=None,
        self.config_dict_params = config_dict
        self._setup_config(self.config_dict_params)

+        self.template = template
+        if template:
+            self.fixed_length = len(template.split('-'))
+            # for now, the template only supports linear pipelines;
+            # support for more complex structures will be added later
+            self._min = self.fixed_length
+            self._max = self.fixed_length + 1
+        else:
+            self.fixed_length = 0
+            self._min = 1
+            self._max = 3
+
        self.operators = []
        self.arguments = []

        for key in sorted(self.config_dict.keys()):
-            op_class, arg_types = TPOTOperatorClassFactory(
+            op_class, _ = TPOTOperatorClassFactory(
                key,
                self.config_dict[key],
                BaseClass=Operator,
                ArgBaseClass=ARGType
            )
            if op_class:
                self.operators.append(op_class)
-                self.arguments += arg_types

        # Schedule TPOT to run for many generations if the user specifies a
        # run-time limit; TPOT will automatically interrupt itself when the timer
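A standalone sketch of the length bookkeeping above, with a hypothetical template. Setting _max to _min + 1 pins the sampled tree height to exactly the template length, assuming _generate draws the height with an exclusive upper bound (an assumption about code not shown in this diff):

    template = 'Selector-Transformer-Classifier'  # hypothetical user input

    if template:
        fixed_length = len(template.split('-'))      # 3 steps
        min_, max_ = fixed_length, fixed_length + 1  # height is always exactly 3
    else:
        fixed_length, min_, max_ = 0, 1, 3           # historical default range

    print(fixed_length, min_, max_)  # prints: 3 3 4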
@@ -450,39 +461,83 @@ def _setup_pset(self):
        self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array)
        self._pset.renameArguments(ARG0='input_matrix')
        self._add_operators()
-        self._add_terminals()

        if self.verbosity > 2:
            print('{} operators have been imported by TPOT.'.format(len(self.operators)))


    def _add_operators(self):
-        for operator in self.operators:
-            if operator.root:
-                # We need to add rooted primitives twice so that they can
-                # return both an Output_Array (and thus be the root of the tree),
-                # and return a np.ndarray so they can exist elsewhere in the tree.
-                p_types = (operator.parameter_types()[0], Output_Array)
-                self._pset.addPrimitive(operator, *p_types)
-
-            self._pset.addPrimitive(operator, *operator.parameter_types())
-
-            # Import required modules into local namespace so that pipelines
-            # may be evaluated directly
-            for key in sorted(operator.import_hash.keys()):
-                module_list = ', '.join(sorted(operator.import_hash[key]))
-
-                if key.startswith('tpot.'):
-                    exec('from {} import {}'.format(key[4:], module_list))
-                else:
-                    exec('from {} import {}'.format(key, module_list))
-
-                for var in operator.import_hash[key]:
-                    self.operators_context[var] = eval(var)
-
-        self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray)
+        main_type = ["Classifier", "Regressor", "Selector", "Transformer"]
+        ret_types = []
+        if self.template:
+            steps = self.template.split('-')
+            for idx, step in enumerate(steps):
+                if idx < self.fixed_length - 1:
+                    # create an empty class as the return type for strongly-typed GP
+                    step_ret_type_name = 'Ret_{}'.format(idx)
+                    step_ret_type = type(step_ret_type_name, (object,), {})
+                    ret_types.append(step_ret_type)
+                else:
+                    step_ret_type = Output_Array
+                # input type of each step
+                if idx:
+                    step_in_type = ret_types[idx - 1]
+                else:
+                    step_in_type = np.ndarray
+                if main_type.count(step):  # the step is a main operator type
+                    for operator in self.operators:
+                        arg_types = operator.parameter_types()[0][1:]
+                        if operator.type() == step:
+                            p_types = ([step_in_type] + arg_types, step_ret_type)
+                            self._pset.addPrimitive(operator, *p_types)
+                            self._import_hash(operator)
+                            self._add_terminals(arg_types)
+                else:  # the step is a specific operator
+                    for operator in self.operators:
+                        arg_types = operator.parameter_types()[0][1:]
+                        if operator.__name__ == step:
+                            p_types = ([step_in_type] + arg_types, step_ret_type)
+                            self._pset.addPrimitive(operator, *p_types)
+                            self._import_hash(operator)
+                            self._add_terminals(arg_types)
+        else:  # no template; pipelines are generated freely
+            for operator in self.operators:
+                arg_types = operator.parameter_types()[0][1:]
+                if operator.root:
+                    # We need to add rooted primitives twice so that they can
+                    # return both an Output_Array (and thus be the root of the tree),
+                    # and return a np.ndarray so they can exist elsewhere in the tree.
+                    p_types = (operator.parameter_types()[0], Output_Array)
+                    self._pset.addPrimitive(operator, *p_types)
+
+                self._pset.addPrimitive(operator, *operator.parameter_types())
+
+                # Import required modules into local namespace so that pipelines
+                # may be evaluated directly
+                self._import_hash(operator)
+                self._add_terminals(arg_types)
+
+        self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray)
+        self.ret_types = [np.ndarray, Output_Array] + ret_types
+
+
+    def _import_hash(self, operator):
+        # Import required modules into local namespace so that pipelines
+        # may be evaluated directly
+        for key in sorted(operator.import_hash.keys()):
+            module_list = ', '.join(sorted(operator.import_hash[key]))
+
+            if key.startswith('tpot.'):
+                exec('from {} import {}'.format(key[4:], module_list))
+            else:
+                exec('from {} import {}'.format(key, module_list))
+
+            for var in operator.import_hash[key]:
+                self.operators_context[var] = eval(var)

-    def _add_terminals(self):
-        for _type in self.arguments:
+    def _add_terminals(self, arg_types):
+        for _type in arg_types:
            type_values = list(_type.values)

            for val in type_values:
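To see what the template branch of _add_operators builds, here is a self-contained sketch of the same strongly-typed chaining using DEAP directly. The operator names and bodies are placeholders; only the type wiring mirrors the code above:

    import numpy as np
    from deap import gp

    class Output_Array(object):
        """Final output data type of pipelines (mirrors tpot/gp_types.py)."""

    # one empty return type per non-final template step, like 'Ret_{}'.format(idx)
    Ret_0 = type('Ret_0', (object,), {})
    Ret_1 = type('Ret_1', (object,), {})

    pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array)
    pset.renameArguments(ARG0='input_matrix')

    # step 0 consumes the raw input matrix and returns Ret_0 ...
    pset.addPrimitive(lambda X: X, [np.ndarray], Ret_0, name='SelectPercentile')
    # ... step 1 consumes Ret_0 and returns Ret_1 ...
    pset.addPrimitive(lambda X: X, [Ret_0], Ret_1, name='PCA')
    # ... and the last step consumes Ret_1 and returns Output_Array, so every
    # valid tree is exactly Selector -> Transformer -> Classifier.
    pset.addPrimitive(lambda X: X, [Ret_1], Output_Array, name='LogisticRegression')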
@@ -496,12 +551,16 @@ def _setup_toolbox(self):
        creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti, statistics=dict)

        self._toolbox = base.Toolbox()
-        self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=1, max_=3)
+        self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=self._min, max_=self._max)
        self._toolbox.register('individual', tools.initIterate, creator.Individual, self._toolbox.expr)
        self._toolbox.register('population', tools.initRepeat, list, self._toolbox.individual)
        self._toolbox.register('compile', self._compile_to_sklearn)
        self._toolbox.register('select', tools.selNSGA2)
        self._toolbox.register('mate', self._mate_operator)
-        self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4)
+        if self.fixed_length:
+            self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max)
+        else:
+            self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1)
        self._toolbox.register('mutate', self._random_mutation_operator)
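Continuing the DEAP sketch above: with both bounds pinned to the template length, tree generation can only produce the full three-step, type-correct pipeline. DEAP's stock genGrow treats both bounds as inclusive, so min_ equals max_ here, whereas the registration above assumes _gen_grow_safe's exclusive upper bound:

    expr = gp.genGrow(pset, min_=3, max_=3)  # pset from the earlier sketch
    print(gp.PrimitiveTree(expr))
    # prints: LogisticRegression(PCA(SelectPercentile(input_matrix)))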

@@ -1352,15 +1411,17 @@ def _random_mutation_operator(self, individual, allow_shrink=True):
Returns the individual with one of the mutations applied to it

"""
-        mutation_techniques = [
-            partial(gp.mutInsert, pset=self._pset),
-            partial(mutNodeReplacement, pset=self._pset)
-        ]
-
-        # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives.
-        number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual])
-        if number_of_primitives > 1 and allow_shrink:
-            mutation_techniques.append(partial(gp.mutShrink))
+        if self.fixed_length:
+            mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)]
+        else:
+            mutation_techniques = [
+                partial(gp.mutInsert, pset=self._pset),
+                partial(mutNodeReplacement, pset=self._pset)
+            ]
+            # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives.
+            number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual])
+            if number_of_primitives > 1 and allow_shrink:
+                mutation_techniques.append(partial(gp.mutShrink))

mutator = np.random.choice(mutation_techniques)
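The template branch keeps only point mutation because mutNodeReplacement swaps a node for one with identical input and return types, leaving the tree's shape intact, while gp.mutInsert and gp.mutShrink add or remove nodes and would break a fixed-length pipeline. A small demonstration, using DEAP's stock mutNodeReplacement as a stand-in for TPOT's variant and the pset from the earlier sketch:

    import random
    from deap import gp

    random.seed(0)
    ind = gp.PrimitiveTree(gp.genGrow(pset, min_=3, max_=3))
    mutant, = gp.mutNodeReplacement(gp.PrimitiveTree(ind), pset=pset)
    assert len(mutant) == len(ind)  # node count unchanged; the template still holds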

@@ -1370,23 +1431,14 @@
ind = self._toolbox.clone(individual)
offspring, = mutator(ind)
if str(offspring) not in self.evaluated_individuals_:
# Update statistics
# crossover_count is kept the same as for the predecessor
# mutation count is increased by 1
# predecessor is set to the string representation of the individual before mutation
# generation is set to 'INVALID' such that we can recognize that it should be updated accordingly
offspring.statistics['crossover_count'] = individual.statistics['crossover_count']
offspring.statistics['mutation_count'] = individual.statistics['mutation_count'] + 1
offspring.statistics['predecessor'] = (str(individual),)
offspring.statistics['generation'] = 'INVALID'
break
else:
unsuccesful_mutations += 1

# Sometimes you have pipelines for which every shrunk version has already been explored too.
# To still mutate the individual, one of the two other mutators should be applied instead.
            if ((unsuccesful_mutations == 50) and
-                   (type(mutator) is partial and mutator.func is gp.mutShrink)):
+                    (type(mutator) is partial and mutator.func is gp.mutShrink)):
offspring, = self._random_mutation_operator(individual, allow_shrink=False)

return offspring,
@@ -1414,7 +1466,7 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None):

def condition(height, depth, type_):
"""Stop when the depth is equal to height or when a node should be a terminal."""
-            return type_ not in [np.ndarray, Output_Array] or depth == height
+            return type_ not in self.ret_types or depth == height

return self._generate(pset, min_, max_, condition, type_)
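The widened membership test matters because the intermediate Ret_* types have primitives but no terminals: if condition told _generate to stop at one of them, the tree could never be completed. A quick check against the types from the earlier sketch:

    ret_types = [np.ndarray, Output_Array, Ret_0, Ret_1]  # as assembled in _add_operators

    def condition(height, depth, type_):
        """Stop when the depth is equal to height or when a node should be a terminal."""
        return type_ not in ret_types or depth == height

    assert condition(3, 1, str)        # non-growable type: stop and emit a terminal
    assert not condition(3, 1, Ret_0)  # intermediate template type: keep growing
    assert condition(3, 3, Ret_0)      # reached the target height: stop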

8 changes: 0 additions & 8 deletions tpot/gp_types.py
@@ -26,11 +26,3 @@
class Output_Array(object):
"""Final output data type of pipelines."""
pass

-class Transformed_Array(object):
-    """Transformed data returned by Transformer."""
-    pass
-
-class Selected_Array(object):
-    """Transformed data returned by Selector."""
-    pass
4 changes: 2 additions & 2 deletions tpot/operator_utils.py
@@ -169,9 +169,9 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass=
elif issubclass(op_obj, RegressorMixin):
class_profile['root'] = True
optype = "Regressor"
-        elif issubclass(op_obj, TransformerMixin):
+        if issubclass(op_obj, TransformerMixin):
            optype = "Transformer"
-        elif issubclass(op_obj, SelectorMixin):
+        if issubclass(op_obj, SelectorMixin):
            optype = "Selector"

@classmethod
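The elif-to-if change matters because scikit-learn selectors are also transformers (SelectorMixin itself derives from TransformerMixin), so under elif they were labelled "Transformer" and a 'Selector' template step could never match them; independent ifs let the later, more specific check override the label. A quick check (SelectorMixin is importable from sklearn.feature_selection in recent releases; 2018-era versions exposed it as sklearn.feature_selection.base.SelectorMixin):

    from sklearn.base import TransformerMixin
    from sklearn.feature_selection import SelectKBest, SelectorMixin

    optype = None
    if issubclass(SelectKBest, TransformerMixin):
        optype = 'Transformer'
    if issubclass(SelectKBest, SelectorMixin):
        optype = 'Selector'  # the more specific check wins
    print(optype)  # prints: Selector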