# Regression w/ DEAP
Vítor Amorim Fróis - 12543440

Classification example using STGP (Strongly Typed Genetic Programming). The evolved programs work on floating-point values AND Booleans values. The programs must return a Boolean value which must be true if e-mail is spam, and false otherwise. It uses a base of emails (saved in spambase.csv, see Reference), from which it randomly chooses 400 emails to evaluate each individual.

In [106]:
import random
import operator
import csv
import itertools


import numpy as np

from functools import partial

from sklearn.datasets import load_diabetes
from sklearn.utils import resample
from sklearn.metrics import r2_score

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

### Get dataset

In [127]:
dataset = load_diabetes()
dataset_len = len(dataset.data)

### Primitives set
Similar to the proposed classification exercise

In [176]:
# defined a new primitive set for strongly typed GP
pset = gp.PrimitiveSetTyped("MAIN", itertools.repeat(float, dataset.data.shape[1]), float, "IN")

# floating point operators
# Define a protected division function
def protectedDiv(left, right):
    if abs(right) < 1e-10:
        return 1
    try: 
        return left / right
    except ZeroDivisionError: 
        return 1

pset.addPrimitive(operator.add, [float,float], float)
pset.addPrimitive(operator.sub, [float,float], float)
pset.addPrimitive(operator.mul, [float,float], float)
pset.addPrimitive(protectedDiv, [float,float], float)

def if_then_else(input, output1, output2):
    if input: return output1
    else: return output2

pset.addPrimitive(operator.lt, [float, float], bool)
pset.addPrimitive(operator.eq, [float, float], bool)
pset.addPrimitive(if_then_else, [bool, float, float], float)

pset.addTerminal(False, bool)
pset.addTerminal(True, bool)

# terminals
pset.addTerminal(1, float)
pset.addTerminal(-1, float)
pset.addTerminal(-100, float)
pset.addTerminal(100, float)

### Fitness Function

In [177]:
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)



### Individual

In [178]:
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=2, max_=6)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [179]:
tree = gp.PrimitiveTree(toolbox.individual())
print(tree)

mul(mul(IN5, IN3), sub(IN7, IN5))


### Print a Tree

In [180]:
expr = toolbox.individual()
print(expr)
nodes, edges, labels = gp.graph(expr)

protectedDiv(protectedDiv(IN6, IN8), mul(IN3, IN9))


### Evaluation Function
sample the dataset and get the R2 Score

In [181]:
def evalRegression(individual):
    # Transform the tree expression in a callable function
    func = toolbox.compile(expr=individual)
    # Randomly sample 400 mails in the spam database
    bootstrap_index = resample(list(range(dataset_len)), n_samples=dataset_len)

    X = dataset.data[bootstrap_index]
    y_true = dataset.target[bootstrap_index]
    
    # Evaluate the sum of correctly identified mail as spam
    y_pred = [func(*X_sample) for X_sample in X]
    score = r2_score(y_pred, y_true)
    return score,

toolbox.register("evaluate", evalRegression)

In [185]:
func = toolbox.compile(expr=expr)
evalRegression(expr)

(-0.0018045714316283767,)

### Genetic Operators

In [183]:
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=1)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

### Main program

In [184]:
def main():
    random.seed(10)
    pop = toolbox.population(n=100)
    hof = tools.HallOfFame(5)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", numpy.mean)
    stats.register("std", numpy.std)
    stats.register("min", numpy.min)
    stats.register("max", numpy.max)

    algorithms.eaSimple(pop, toolbox, 0.2, 0.5, 100, stats, halloffame=hof)

    return pop, stats, hof

if __name__ == "__main__":
    ret = main()

gen	nevals	avg         	std        	min         	max      
0  	100   	-4.67787e+13	4.54075e+14	-4.56411e+15	0.0737379
1  	66    	-7.06852e+11	7.00576e+12	-7.04128e+13	0.0737379
2  	63    	-1.23052e+09	1.2243e+10 	-1.23047e+11	0.0737379
3  	61    	-134144     	748359     	-5.41392e+06	0.105407 
4  	53    	-1.78116e+09	1.38641e+10	-1.3194e+11 	0.105407 
5  	57    	-2.74899e+09	1.88345e+10	-1.37288e+11	0.105407 
6  	54    	-1.331e+08  	9.12462e+08	-6.93007e+09	0.10908  
7  	61    	-1.51303e+09	1.0392e+10 	-8.71457e+10	0.113049 
8  	64    	-2.89509e+06	2.65235e+07	-2.66461e+08	0.122355 
9  	56    	-8.4889e+07 	5.99809e+08	-4.98152e+09	0.148868 
10 	61    	-1.27777e+06	4.58479e+06	-3.19436e+07	0.113806 
11 	67    	-5.61885e+07	5.50251e+08	-5.53103e+09	0.132282 
12 	57    	-1.2476e+09 	1.24082e+10	-1.24707e+11	0.132282 
13 	53    	-7.01946e+09	6.98293e+10	-7.01812e+11	0.132282 
14 	69    	-2.29615e+06	1.7822e+07 	-1.77958e+08	0.132282 
15 	59    	-1.1778e+08 	8.55377e+08	-7.54229e+09	0.13228

In [100]:
tree = gp.PrimitiveTree(ret[2])
tree

[[<deap.gp.Primitive at 0x7f85609b9a30>,
  <deap.gp.Primitive at 0x7f85609ba8e0>,
  <deap.gp.Primitive at 0x7f85609c81d0>,
  <deap.gp.Primitive at 0x7f85609c81d0>,
  <deap.gp.Primitive at 0x7f85609b9a30>,
  <deap.gp.Primitive at 0x7f85609b9a30>,
  <deap.gp.Primitive at 0x7f85609baa70>,
  <deap.gp.Primitive at 0x7f85609ba8e0>,
  <deap.gp.Terminal at 0x7f8560c04940>,
  <deap.gp.Terminal at 0x7f8560c069c0>,
  <deap.gp.Primitive at 0x7f85609baa70>,
  <deap.gp.Terminal at 0x7f8560c249c0>,
  <deap.gp.Terminal at 0x7f8560c249c0>,
  <deap.gp.Terminal at 0x7f8560c07440>,
  <deap.gp.Primitive at 0x7f85609ba8e0>,
  <deap.gp.Primitive at 0x7f85609ba8e0>,
  <deap.gp.Primitive at 0x7f85609b9a30>,
  <deap.gp.Terminal at 0x7f8560c07440>,
  <deap.gp.Primitive at 0x7f85609ba8e0>,
  <deap.gp.Terminal at 0x7f8560c07440>,
  <deap.gp.Terminal at 0x7f8560c26940>,
  <deap.gp.Primitive at 0x7f85609b9a30>,
  <deap.gp.Terminal at 0x7f8560c05cc0>,
  <deap.gp.Terminal at 0x7f8560c04940>,
  <deap.gp.Terminal at 0x7

In [186]:
tree = gp.PrimitiveTree(ret[0][3])
print(tree)

evalRegression(tree)

mul(sub(protectedDiv(protectedDiv(IN7, IN2), add(IN7, protectedDiv(mul(IN2, add(add(IN8, IN3), IN1)), 100))), -100), mul(IN2, 100))


(0.15469884428440261,)

We achieved a positive R2 score, but still not a good :(