In [3]:
%pylab inline
# Pandas is a nice utility that enables easy data manipulation, especially from a CSV
import pandas as pd
# Numpy lets us work with arrays
import numpy as np

# Sklearn provides various modules with a common API
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate

!pip install deap
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

import random
import operator
import itertools

# import pickle
import pickle
# turn off warnings
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +




In [4]:
# Load the pre-processed Kaggle CSV files, indexing each frame by passenger id
train_data = pd.read_csv('processed_train.csv', index_col="PassengerId")
test_data = pd.read_csv('processed_test.csv', index_col="PassengerId")

# Stack both splits row-wise (column order is left unsorted) so statistics can
# be computed over every passenger
df = pd.concat([train_data, test_data], sort=False)

In [5]:
# Preview the first five rows of the training data; as the last expression of
# the cell, the frame is rendered by the notebook
train_data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,LastName,IsWomanOrBoy,WomanOrBoyCount,FamilySurvivedCount,WomanOrBoySurvived,Alone,Single,SmallF,MedF,LargeF,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,3,1,22.0,1,0,7.25,2,10,100,0,0,0.0,0.0,1,0,1,0,0,7
2,1.0,1,0,38.0,1,0,71.2833,0,11,182,1,1,1.0,1.0,0,0,1,0,0,2
3,1.0,3,0,26.0,0,0,7.925,2,9,329,1,1,1.0,1.0,0,1,0,0,0,7
4,1.0,1,0,35.0,1,0,53.1,2,11,267,1,1,1.0,1.0,0,0,1,0,0,2
5,0.0,3,1,35.0,0,0,8.05,2,10,15,0,1,1.0,1.0,0,1,0,0,0,7


## Generate valuable constants from df

In [6]:
# Mean age over the combined train + test frame
mean_age = df["Age"].mean()

# The numbers 0, 1 and 2
zero, one, two = 0, 1, 2



In [7]:
RANDOM_SEED = 10

# Every column except the target is a feature; 'Survived' is the label.
is_feature = train_data.columns != 'Survived'

# Hold out 33% of the labelled rows, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.loc[:, is_feature],
    train_data['Survived'],
    test_size=0.33,
    random_state=RANDOM_SEED,
)

In [8]:
# Create a multi-objective (MO) fitness with two objectives, one per error type
# (FP and FN). Both weights are negative, so DEAP treats each objective as
# something to minimise.
creator.create("FitnessMin", base.Fitness, weights=(-1.0,-1.0))
# Individuals are GP primitive trees that carry the fitness class created above.
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

In [9]:
# helper functions for primitives
def divide_by_zero(a, b):
    """Protected division: return ``a / b``, or ``0`` when ``b`` equals zero."""
    return 0 if b == 0 else a / b
def if_then_else(input, output1, output2):
    """Return ``output1`` when ``input`` is truthy, otherwise ``output2``."""
    return output1 if input else output2

def float_to_bool(f):
    """Return whether ``f`` is strictly greater than zero."""
    is_positive = f > 0
    return is_positive

In [10]:
# Build the strongly-typed primitive set: eight float inputs, one boolean output
random.seed(25)
pset = gp.PrimitiveSetTyped("main", itertools.repeat(float, 8), bool)

# (callable, argument types, return type). The registration order is kept
# exactly as listed because it influences random tree generation.
primitive_specs = [
    # float arithmetic
    (operator.add, [float, float], float),
    (operator.sub, [float, float], float),
    (operator.mul, [float, float], float),
    (divide_by_zero, [float, float], float),
    # boolean logic and float -> bool conversion
    (operator.and_, [bool, bool], bool),
    (operator.or_, [bool, bool], bool),
    (operator.not_, [bool], bool),
    (operator.xor, [bool, bool], bool),
    (float_to_bool, [float], bool),
    # comparisons and conditional selection
    (operator.lt, [float, float], bool),
    (operator.eq, [float, float], bool),
    (if_then_else, [bool, float, float], float),
]
for primitive, arg_types, return_type in primitive_specs:
    pset.addPrimitive(primitive, arg_types, return_type)

# constant terminals of boolean type
for constant in (0, 1):
    pset.addTerminal(constant, bool)

# Give the positional inputs readable feature names
pset.renameArguments(
    ARG0='Pclass',
    ARG1='Sex',
    ARG2='Age',
    ARG3='SibSp',
    ARG4='Parch',
    ARG5="Fare",
    ARG6="Embarked",
    ARG7="Deck",
)

In [11]:
# create toolbox
toolbox = base.Toolbox()
# "expr": random expression generator over pset (half-and-half initialisation,
# with min_/max_ bounds of 1 and 10)
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=10)
# "individual": wrap an expression produced by toolbox.expr in creator.Individual
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
# "population": a plain list of individuals; the size is given when it is called
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# "compile": turn an expression tree into a callable using pset
toolbox.register("compile", gp.compile, pset=pset)

In [12]:
def eval_function(individual, points, pset, targets=None):
    """Score a GP individual by its false-positive and false-negative rates.

    Args:
        individual: GP expression tree to compile and evaluate.
        points: Sequence of rows; each row is unpacked as the positional
            arguments of the compiled expression.
        pset: Primitive set used to compile ``individual``.
        targets: True labels aligned row-for-row with ``points``. When omitted,
            the notebook-level ``y_train`` is used, so existing registrations
            that only pass ``points`` and ``pset`` behave as before.

    Returns:
        Tuple ``(false_positive_rate, false_negative_rate)``.
    """
    if targets is None:
        # Backward-compatible default: score against the training labels.
        targets = y_train
    func = gp.compile(expr=individual, pset=pset)
    predictions = [func(*row) for row in points]
    # ravel() flattens the 2x2 confusion matrix in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(targets, predictions).ravel()
    return fp / (tn + fp), fn / (tp + fn)

In [13]:
# The compiled expressions take exactly the inputs declared on the primitive
# set, so evaluate only on those columns, in argument order. X_train holds
# every non-target column; passing all of them would call each individual with
# more positional arguments than it accepts.
feature_columns = list(pset.arguments)
toolbox.register("evaluate", eval_function, points=X_train.loc[:, feature_columns].values, pset=pset)
# NSGA-II selection for the two-objective fitness
toolbox.register("select", tools.selNSGA2)
toolbox.register("mate", gp.cxOnePoint)
# Mutation replaces a subtree with a freshly generated full tree (min_=2, max_=5)
toolbox.register("expr_mut", gp.genFull, min_=2, max_=5)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

# Bloat control: reject offspring whose tree height exceeds 17
toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))