Data uploading

In [None]:
# Upload the two CSVs through google.colab: the first call is for train.csv,
# the second for test.csv.
from google.colab import files
# The returned objects are later indexed by file name ('train.csv' / 'test.csv')
# to obtain the raw bytes that are parsed into DataFrames.
train_uploaded = files.upload()
test_uploaded = files.upload()

Saving train.csv to train (1).csv


Saving test.csv to test (2).csv


Library loading

In [None]:
!pip install deap



In [None]:
import pandas as pd
import numpy as np
import random as rnd
import io

from deap import base
from deap import creator
from deap import tools

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import make_pipeline

In [None]:
# Parse the uploaded byte payloads into DataFrames.
train_df = pd.read_csv(io.BytesIO(train_uploaded['train.csv']))
test_df = pd.read_csv(io.BytesIO(test_uploaded['test.csv']))
# dfs holds references (not copies): loops over dfs modify train_df/test_df in place.
dfs = [train_df, test_df]
# Feature-only frame of all passengers. `columns=` replaces the positional axis
# argument (`drop('Survived', 1)`), which pandas deprecated and pandas 2 rejects.
full_origin = pd.concat([train_df.drop(columns='Survived'), test_df])

### Data preprocessing

In [None]:
def clean_ticket(ticket):
  """Return the first non-numeric token of a ticket string.

  Dots and slashes are removed before the string is split on whitespace, so
  'A/5 21171' yields 'A5' and 'STON/O2. 3101282' yields 'STONO2'.

  Returns 'XXX' when every token is purely numeric (or there are no tokens).
  """
  cleaned = ticket.replace('.', '').replace('/', '')
  # str.split() without arguments already discards surrounding whitespace,
  # so no per-token strip is needed.
  for token in cleaned.split():
    if not token.isdigit():
      return token
  return 'XXX'

Grouping according to Age 

In [None]:
def simplify_age(df):
  """Add an 'AgeBins' categorical column to df (in place) and return df.

  Missing ages are first replaced with -0.5 so they fall into the 'Unknown'
  bin. pd.cut uses right-closed intervals by default, i.e. (-1, 0] is
  'Unknown', (0, 5] is 'Baby', ... and (60, 120] is 'Senior'.
  """
  age_edges = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
  age_labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']

  df['Age'] = df['Age'].fillna(-0.5)
  df['AgeBins'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)
  return df

Grouping according to Fare 

In [None]:
def simplify_fares(df):
  """Add a 'FareBins' categorical column to df (in place) and return df.

  Missing fares are first replaced with -0.5 so they fall into the 'Unknown'
  bin. Intervals are right-closed: (-1, 0] 'Unknown', (0, 8] '1_quartile',
  (8, 15] '2_quartile', (15, 31] '3_quartile', (31, 1000] '4_quartile'.
  """
  fare_edges = (-1, 0, 8, 15, 31, 1000)
  fare_labels = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']

  df['Fare'] = df['Fare'].fillna(-0.5)
  df['FareBins'] = pd.cut(df['Fare'], bins=fare_edges, labels=fare_labels)
  return df

Map features to Numeric values

In [None]:
def encode_features(df_train, df_test, features):
    """Label-encode the given columns of both frames, modifying them in place.

    For every name in `features`, one LabelEncoder is fitted on the train and
    test values stacked together, so both frames share the same integer code
    for each distinct value.

    Returns the (df_train, df_test) pair.
    """
    # Snapshot of the original values, taken before any column is overwritten.
    stacked = pd.concat([df_train[features], df_test[features]])
    for name in features:
        encoder = preprocessing.LabelEncoder().fit(stacked[name])
        for frame in (df_train, df_test):
            frame[name] = encoder.transform(frame[name])
    return df_train, df_test

Creating Title feature and mapping some synonyms

In [None]:
# Title is the text between ', ' and the following '.' in Name; a few
# synonymous spellings are folded onto 'Miss' / 'Mrs'.
TITLE_SYNONYMS = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
for dataset in dfs:
  after_surname = dataset['Name'].str.split(', ').str.get(1)
  raw_title = after_surname.str.split('.').str.get(0)
  dataset['Title'] = raw_title.replace(TITLE_SYNONYMS)

Converting Sex categorical feature to int value

In [None]:
# Integer codes for the Sex column: female -> 1, male -> 0.
SEX_CODES = {'female': 1, 'male': 0}
for dataset in dfs:
  sex_codes = dataset['Sex'].map(SEX_CODES)
  dataset['Sex'] = sex_codes.astype(int)

Checking Age missing values

In [None]:
# Remember which ages were actually recorded before they are imputed further down.
for dataset in dfs:
    dataset['Age_known'] = dataset['Age'].notnull()

Filling in missing values: Embarked

In [None]:
# Set Embarked to 'C' for the rows at positions 61 and 829 of train_df.
# A single positional assignment on the frame itself replaces the chained
# train_df['Embarked'].iloc[...] = ... form, which raises SettingWithCopyWarning
# and can end up writing to a temporary copy instead of train_df.
train_df.iloc[[61, 829], train_df.columns.get_loc('Embarked')] = 'C'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Filling in missing values: Fare

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
# Median fare over all 3rd-class passengers (median() skips NaN by default) is
# written into the row at position 152 of test_df. Assigning through the frame's
# own .iloc avoids the chained-assignment SettingWithCopyWarning.
third_class_median_fare = all_df.loc[all_df['Pclass'] == 3, 'Fare'].median()
test_df.iloc[152, test_df.columns.get_loc('Fare')] = third_class_median_fare

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Filling in missing values: Age

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
titleList = all_df['Title'].unique().tolist()
guess_ages_sex_title = np.zeros((2, len(titleList)))
guess_ages_sex_pclass = np.zeros((2, 3))

# The guesses depend only on all_df (which the fill loop below never modifies),
# so they are computed once instead of once per dataset.
for i in range(0, 2):
    for j in range(0, 3):
        in_sex_and_class = (all_df['Sex'] == i) & (all_df['Pclass'] == j + 1)
        age_guess = all_df.loc[in_sex_and_class, 'Age'].median()
        # Round the median to the nearest 0.5.
        guess_ages_sex_pclass[i, j] = int(age_guess / 0.5 + 0.5) * 0.5

    # After the loop above, age_guess holds the Pclass-3 median for sex i; it is
    # the fallback for (title, sex) combinations that have no recorded age.
    for k, title in enumerate(titleList):
        in_title_and_sex = (all_df['Title'] == title) & (all_df['Sex'] == i)
        age_guess2 = all_df.loc[in_title_and_sex, 'Age'].median()
        if pd.isna(age_guess2):
            age_guess2 = age_guess
        guess_ages_sex_title[i, k] = int(age_guess2 / 0.5 + 0.5) * 0.5

for dataset in dfs:
    for i in range(0, 2):
        # First choice: the median age of passengers with the same title and sex.
        for k, title in enumerate(titleList):
            missing_for_title = dataset['Age'].isnull() & (dataset['Title'] == title) & (dataset['Sex'] == i)
            dataset.loc[missing_for_title, 'Age'] = guess_ages_sex_title[i, k]
        # Second choice, for anything still missing: the median by sex and class.
        for j in range(0, 3):
            missing_for_class = dataset['Age'].isnull() & (dataset['Sex'] == i) & (dataset['Pclass'] == j + 1)
            dataset.loc[missing_for_class, 'Age'] = guess_ages_sex_pclass[i, j]

    # Note: the integer cast truncates fractional ages.
    dataset['Age'] = dataset['Age'].astype(int)

Creating Deck and FamilyName features

In [None]:
# Deck is the first character of Cabin (NaN where Cabin is missing);
# FamilyName is the part of Name before the first ', '.
for dataset in dfs:
    cabin_text = dataset['Cabin'].str
    dataset['Deck'] = cabin_text.get(0)
    name_parts = dataset['Name'].str.split(', ')
    dataset['FamilyName'] = name_parts.str.get(0)

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
# Summing the boolean mask counts the missing values without building a filtered frame.
print('Initial missing Deck values: ', all_df['Deck'].isnull().sum())

Initial missing Deck values:  1014


Guessing Deck missing values from the Ticket value

In [None]:
TicketList = all_df['Ticket'].unique().tolist()
# For every ticket, the first known Deck among all passengers (train rows first,
# then test rows) holding that ticket. A single groupby replaces the per-ticket
# loop that filtered the whole frame once per ticket and per dataset.
deck_by_ticket = (
    all_df.dropna(subset=['Deck'])
          .groupby('Ticket')['Deck']
          .first()
          .str[0]
)
for dataset in dfs:
    # Only missing Deck values are filled; tickets without any known Deck stay NaN.
    dataset['Deck'] = dataset['Deck'].fillna(dataset['Ticket'].map(deck_by_ticket))
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
print('Missing Deck values after apply shared ticket heuristic: ', all_df['Deck'].isnull().sum())

Missing Deck values after apply shared ticket heuristic:  998


Trying to guess Deck missing values using a prediction Model

In [None]:
all_df = pd.concat([train_df, test_df])
# .copy() makes df an independent frame, so overwriting 'Embarked' below does not
# raise SettingWithCopyWarning.
df = all_df[['Pclass', 'Fare', 'Embarked', 'Deck']].copy()
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
# Rows with a known Deck are the training data for the Deck classifier.
df1 = df[df['Deck'].notnull()]
XDeck_train = df1[['Pclass', 'Fare', 'Embarked']]
YDeck_train = df1.Deck
random_forest = RandomForestClassifier(n_estimators=100)

scores = cross_val_score(random_forest, XDeck_train, YDeck_train, cv=5, n_jobs=-1)
print('CV score: ', scores.mean())

random_forest.fit(XDeck_train, YDeck_train)
XDeck_test = df[['Pclass', 'Fare', 'Embarked']]
YDeck_pred = random_forest.predict(XDeck_test)

# Keep known decks and use the prediction only where Deck is missing. Everything
# is done positionally because all_df carries duplicate index labels (the train
# and test indexes are stacked), which makes label-aligned assignment fragile.
deck_missing = all_df['Deck'].isnull().to_numpy()
deck_filled = all_df['Deck'].to_numpy(copy=True)
deck_filled[deck_missing] = YDeck_pred[deck_missing]

# Split back by the actual number of training rows instead of a hard-coded 891.
n_train = len(train_df)
train_df['Deck'] = deck_filled[:n_train]
test_df['Deck'] = deck_filled[n_train:]

# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
print('Missing Deck values: ', all_df['Deck'].isnull().sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


CV score:  0.6946236559139785
Missing Deck values:  0


Creating some new features

In [None]:
# `columns=` replaces the positional axis argument, which pandas 2 no longer accepts.
all_df = pd.concat([train_df.drop(columns='Survived'), test_df])
# Number of passengers (train and test together) holding each ticket.
ticket_counts = all_df['Ticket'].value_counts()

# Columns are created in a fixed order: later cells build the feature list from
# the column order of the frames.
for dataset in dfs:
    simplify_fares(dataset)
    simplify_age(dataset)

    aged_18_to_40 = (dataset['Age'] >= 18) & (dataset['Age'] <= 40)
    dataset['Child'] = dataset['Age'] <= 10
    dataset['MedianAge'] = aged_18_to_40
    dataset['Young_m'] = aged_18_to_40 & (dataset['Sex'] == 0)
    dataset['Young_f'] = aged_18_to_40 & (dataset['Sex'] == 1)
    dataset['Family'] = dataset['SibSp'] + dataset['Parch']
    dataset['Alone'] = dataset['Family'] == 0
    dataset['Cabin_known'] = dataset['Cabin'].notnull().astype('int')
    dataset['Ttype'] = dataset['Ticket'].str[0]
    dataset['Ttype2'] = dataset['Ticket'].map(clean_ticket)
    dataset['Bad_ticket'] = dataset['Ttype'].isin(['3','4','5','6','7','8','A','L','W'])
    dataset['NameLength'] = dataset['Name'].str.len()
    # Ticket_group counts holders of the ticket within this dataset only ...
    dataset['Ticket_group'] = dataset.groupby('Ticket')['Name'].transform('count')
    dataset['Fare_eff'] = dataset['Fare'] / dataset['Ticket_group']
    # ... whereas Shared_ticket is 1 when the ticket occurs more than once across
    # train and test together. One vectorised lookup replaces the per-row
    # groupby/get_group loop.
    dataset['Shared_ticket'] = (dataset['Ticket'].map(ticket_counts) > 1).astype('int64')

    dataset['Young'] = (dataset['Age'] <= 20) | (dataset['Title'].isin(['Master','Miss','Mlle','Mme']))

    dataset['FareBand'] = 0
    dataset.loc[ dataset['Fare'] <= 7.91, 'FareBand'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'FareBand'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'FareBand']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'FareBand'] = 3
    # Fare is truncated to int only after Fare_eff and FareBand have used the exact value.
    dataset['Fare'] = dataset['Fare'].astype(int)

    dataset['AgeBand'] = 0
    dataset.loc[ dataset['Age'] <= 16, 'AgeBand'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'AgeBand'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'AgeBand'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'AgeBand'] = 3
    dataset.loc[ dataset['Age'] > 64, 'AgeBand'] = 4
    
# Raw title -> coarse title group.
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

# Integer codes for the categorical columns; none of them depends on the dataset.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Officer': 5, 'Royalty': 6}
TTYPE_CODES = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
               'A': 10, 'C': 11, 'F': 12, 'L': 13, 'P': 14, 'S': 15, 'W': 16}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
DECK_CODES = {'U': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}

for dataset in dfs:
    # Titles missing from either mapping end up as code 0.
    dataset['Title'] = (
        dataset['Title']
        .map(Title_Dictionary)
        .map(title_mapping)
        .fillna(0)
        .astype(int)
    )
    dataset['Ttype'] = dataset['Ttype'].map(TTYPE_CODES).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map(EMBARKED_CODES).astype(int)
    dataset['Deck'] = dataset['Deck'].map(DECK_CODES).astype(int)
    # Boolean feature columns become 0/1 integers.
    for col in dataset.select_dtypes(include='bool').columns:
        dataset[col] = dataset[col].astype(int)

In [None]:
# Replace the remaining non-numeric columns with integer codes; each encoder is
# fitted on train and test together, so the codes agree between the two frames.
train_df, test_df = encode_features(train_df, test_df, ['FamilyName', 'AgeBins', 'FareBins', 'Ttype2'])

In [None]:
# Candidate features: every int64 / float64 / uint8 column of test_df, in column
# order, excluding the identifier and the target.
numeric_cols = test_df.select_dtypes(include=['int64', 'float64', 'uint8']).columns.tolist()
selCols = [col for col in numeric_cols if col not in ('PassengerId', 'Survived')]

# Keep only the selected features plus the target (train) or the id (test).
# The stray mid-cell `train_df.head()` was dropped: it displayed nothing because
# it was not the last expression of the cell.
train_df = train_df.loc[:, selCols + ['Survived']]
test_df = test_df.loc[:, selCols + ['PassengerId']]

print('Number of selected cols ', len(selCols), ' :',selCols)
print()
all_df = pd.concat([train_df,test_df])
print(all_df.describe())

Number of selected cols  30  : ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'Age_known', 'Deck', 'FamilyName', 'FareBins', 'AgeBins', 'Child', 'MedianAge', 'Young_m', 'Young_f', 'Family', 'Alone', 'Cabin_known', 'Ttype', 'Ttype2', 'Bad_ticket', 'NameLength', 'Ticket_group', 'Fare_eff', 'Shared_ticket', 'Young', 'FareBand', 'AgeBand']

            Pclass          Sex  ...    Survived  PassengerId
count  1309.000000  1309.000000  ...  891.000000   418.000000
mean      2.294882     0.355997  ...    0.383838  1100.500000
std       0.837836     0.478997  ...    0.486592   120.810458
min       1.000000     0.000000  ...    0.000000   892.000000
25%       2.000000     0.000000  ...    0.000000   996.250000
50%       3.000000     0.000000  ...    0.000000  1100.500000
75%       3.000000     1.000000  ...    1.000000  1204.750000
max       3.000000     1.000000  ...    1.000000  1309.000000

[8 rows x 32 columns]


### Model Training

1. RandomForest

In [None]:
# Hand-picked feature subset for this model.
colsRF = [
    'Pclass', 'Sex', 'Embarked', 'Title', 'Age_known', 'Deck', 'FareBins',
    'AgeBins', 'Alone', 'Ttype', 'Ttype2', 'NameLength', 'Young',
]
tcols = np.append(['Survived'], colsRF)
df = train_df[tcols].dropna()
X_train = df[colsRF]
Y_train = df['Survived'].to_numpy()

# 5-fold cross-validated accuracy, then a fit on the full training set.
model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, X_train, Y_train, cv=5, n_jobs=-1)
cv_rf_score = scores.mean()
print('RF CV score: ', cv_rf_score)

model.fit(X_train, Y_train)
training_accuracy = model.score(X_train, Y_train)
print('Training score: ', training_accuracy)

RF CV score:  0.8024668884564686
Training score:  0.9842873176206509


2. K-Nearest Neighbour (KNN)

In [None]:
# Same feature subset as the RandomForest cell (the colsRF name is reused).
colsRF = [
    'Pclass', 'Sex', 'Embarked', 'Title', 'Age_known', 'Deck', 'FareBins',
    'AgeBins', 'Alone', 'Ttype', 'Ttype2', 'NameLength', 'Young',
]
tcols = np.append(['Survived'], colsRF)
df = train_df[tcols].dropna()
X_train = df[colsRF]
Y_train = df['Survived'].to_numpy()

# 5-fold cross-validated accuracy, then a fit on the full training set.
model = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(model, X_train, Y_train, cv=5, n_jobs=-1)
cv_knn_score = scores.mean()
print('KNN CV score: ', cv_knn_score)

model.fit(X_train, Y_train)
training_accuracy = model.score(X_train, Y_train)
print('Training score: ', training_accuracy)

KNN CV score:  0.7239219132508945
Training score:  0.797979797979798


3. Support Vector Machine

In [None]:
# Same feature subset as the RandomForest cell (the colsRF name is reused).
colsRF =  ['Pclass', 'Sex', 'Embarked', 'Title', 'Age_known', 'Deck', 'FareBins', 'AgeBins', 'Alone', 'Ttype', 'Ttype2', 'NameLength', 'Young']
tcols = np.append(['Survived'], colsRF)
df = train_df.loc[:, tcols].dropna()
X_unscaled = df.loc[:, colsRF]
Y_train = np.ravel(df.loc[:, ['Survived']])

# Cross-validate a scaler+SVC pipeline on the unscaled features so the scaler is
# re-fitted on the training part of every fold. Scaling all rows before
# cross-validation lets each validation fold influence the scaling statistics.
cv_pipeline = make_pipeline(preprocessing.StandardScaler(), SVC(kernel='rbf'))
scores = cross_val_score(cv_pipeline, X_unscaled, Y_train, cv=5, n_jobs=-1)
cv_svm_score = scores.mean()

print('SVM CV score: ', scores.mean())

# The final model is fitted on the whole training set; `scaler`, `X_train` and
# `model` keep the same meaning as before (fitted scaler, scaled matrix, plain SVC).
scaler = preprocessing.StandardScaler().fit(X_unscaled)
X_train = scaler.transform(X_unscaled)
model = SVC(kernel='rbf')
model.fit( X_train , Y_train )
print('Training score: ', model.score(X_train, Y_train))

SVM CV score:  0.8148138848785387
Training score:  0.8507295173961841


### Implementing SVM with Genetic Algorithm

In [None]:
# Inputs for the genetic search: the candidate feature names and a private copy
# of the training frame.
cols = selCols
training = train_df.copy()

Random generator of SVM parameter C

In [None]:
def getC():
  """Draw a random value for the SVM parameter C.

  The value is a uniform fraction in [0, 1) plus a random integer from
  {0, 1, 2}, plus a tiny offset that keeps it strictly positive.
  """
  fraction = rnd.random()
  whole_part = rnd.randint(0, 2)
  return fraction + whole_part + 0.000000001

Random generator of SVM parameter Gamma

In [None]:
def getGamma():
  """Draw a random value for the SVM parameter gamma.

  A uniform fraction in [0, 1) is divided by 10**k with k drawn from
  {0, 1, 2, 3}; a tiny offset keeps the result strictly positive.
  """
  fraction = rnd.random()
  exponent = rnd.randint(0, 3)
  divisor = 10**exponent
  return 0.000000001 + (fraction / divisor)

Random generator of SVM Kernel

In [None]:
def getKernel():
    """Pick one of the kernel identifiers 'rbf', 'linear' or 'svcLinear' at random."""
    kernels = ['rbf', 'linear', 'svcLinear']
    # randrange(n) draws from 0..n-1, the same range as randint(0, n - 1).
    return kernels[rnd.randrange(len(kernels))]

In [None]:
# Single-objective fitness with weight +1.0, and a list-based individual type
# that carries such a fitness.
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Gene generators: a 0/1 flag, plus the three SVM hyper-parameter generators.
toolbox.register('attribute_bool', rnd.randint, 0, 1)
for gene_alias, gene_factory in (
    ('attribute_C', getC),
    ('attribute_Gamma', getGamma),
    ('attribute_Kernel', getKernel),
):
    toolbox.register(gene_alias, gene_factory)



In [None]:
# Genome layout: [C, gamma, kernel] followed by one 0/1 flag per candidate column.
function_sequence = [toolbox.attribute_C, toolbox.attribute_Gamma, toolbox.attribute_Kernel]
function_sequence.extend(toolbox.attribute_bool for _ in cols)

# initCycle with n=1 calls every generator in function_sequence once per individual.
toolbox.register('individual', tools.initCycle, creator.Individual, function_sequence, 1)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
print('Individual size: ', len(function_sequence))

Individual size:  33


In [None]:
def getModel(individual):
  """Build the (unfitted) classifier encoded by an individual.

  individual[0] is C, individual[1] is gamma and individual[2] is the kernel
  identifier: 'svcLinear' gives a LinearSVC, 'rbf' an SVC that uses gamma, and
  any other value is passed to SVC as the kernel name, without gamma.
  """
  c_value, gamma_value, kernel = individual[0], individual[1], individual[2]

  if kernel == 'svcLinear':
    return LinearSVC(C=c_value)
  if kernel == 'rbf':
    return SVC(kernel=kernel, C=c_value, gamma=gamma_value)
  return SVC(kernel=kernel, C=c_value)

In [None]:
def getXy(individual):
  """Build the training matrix for the features an individual selects.

  Genes from position 3 onward are per-column flags aligned with `cols`; a
  column is dropped when its flag is below 1.

  Returns a list [X, y, scols, scaler]: the standardised feature matrix, the
  'Survived' labels, the selected column names and the fitted StandardScaler.
  """
  flags = individual[3:]
  dropped = {cols[pos] for pos, flag in enumerate(flags) if flag < 1}
  scols = [name for name in cols if name not in dropped]

  wanted = np.append(['Survived'], scols)
  df = training.loc[:, wanted].dropna()
  scaler = preprocessing.StandardScaler().fit(df.loc[:, scols])
  X = scaler.transform(df.loc[:, scols])
  y = np.ravel(df.loc[:, ['Survived']])
  return [X, y, scols, scaler]

In [None]:
def evalOneMax(individual):
    """Fitness of an individual: mean 5-fold cross-validation score of its model.

    Returns a one-element tuple, the form in which fitness values are assigned.
    """
    X, y = getXy(individual)[:2]
    fold_scores = cross_val_score(getModel(individual), X, y, cv=5, n_jobs=-1)
    return (fold_scores.mean(),)

In [None]:
def myMutate(individual, indpb=0.05):
    """Mutate an individual in place, gene by gene.

    Each gene is independently replaced, with probability `indpb`, by a fresh
    value from the matching toolbox generator: position 0 is C, 1 is gamma,
    2 is the kernel and every later position is a 0/1 feature flag.

    Returns a one-element tuple containing the (mutated) individual, following
    the DEAP convention for mutation operators. Callers that ignore the return
    value keep working as before.
    """
    # C
    if rnd.random() < indpb:
        individual[0] = toolbox.attribute_C()
    # Gamma
    if rnd.random() < indpb:
        individual[1] = toolbox.attribute_Gamma()
    # Kernel
    if rnd.random() < indpb:
        individual[2] = toolbox.attribute_Kernel()
    # Feature flags
    for position in range(3, len(individual)):
        if rnd.random() < indpb:
            individual[position] = toolbox.attribute_bool()

    return (individual,)

In [None]:
# Genetic operators: tournament selection (size 3), two-point crossover, the
# custom per-gene mutation (15% per gene) and the cross-validation fitness.
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', myMutate, indpb=0.15)
toolbox.register('evaluate', evalOneMax)

# Seed Python's random module, which drives the gene generators and the
# crossover/mutation decisions in the evolution loop.
rnd.seed(66)

In [None]:
CXPB = 0.5      # probability that a pair of offspring is crossed over
MUTPB = 0.2     # probability that an offspring is mutated
NGEN = 40       # number of generations
POPSIZE = 100   # individuals per generation
pop = toolbox.population(n=POPSIZE)

In [None]:
print("Start of evolution SVM")

# Archive of the single best individual seen in any generation. The population
# is replaced wholesale each generation, so without it the best solution can be
# lost to crossover or mutation before the loop ends.
hall_of_fame = tools.HallOfFame(1)

# Evaluate the initial population.
fitnesses = list(map(toolbox.evaluate, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit
hall_of_fame.update(pop)

print("  Evaluated %i individuals" % len(pop))

for g in range(NGEN):
    print("-- Generation %i --" % g)

    # Select the next generation and clone it, so the operators below never
    # modify individuals that are still referenced by `pop`.
    offspring = toolbox.select(pop, len(pop))
    offspring = list(map(toolbox.clone, offspring))

    # Crossover on consecutive pairs. A fitness is invalidated only when the
    # genes actually changed, which avoids re-evaluating identical individuals.
    for child1, child2 in zip(offspring[::2], offspring[1::2]):

        if rnd.random() < CXPB:
            genes1_before = list(child1)
            genes2_before = list(child2)
            toolbox.mate(child1, child2)
            if genes1_before != child1: del child1.fitness.values
            if genes2_before != child2: del child2.fitness.values

    # Mutation, with the same "only if changed" invalidation.
    for mutant in offspring:

        if rnd.random() < MUTPB:
            genes_before = list(mutant)
            toolbox.mutate(mutant)
            if genes_before != mutant: del mutant.fitness.values

    # Re-evaluate only the individuals whose fitness was invalidated.
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    print("  Evaluated %i individuals" % len(invalid_ind))

    pop[:] = offspring
    hall_of_fame.update(pop)

    fits = [ind.fitness.values[0] for ind in pop]

    length = len(pop)
    mean = sum(fits) / length
    sum2 = sum(x*x for x in fits)
    std = abs(sum2 / length - mean**2)**0.5
    # Best of the current generation; asking selBest for one individual avoids
    # materialising the whole ranked population.
    best_ind = tools.selBest(pop, 1)[0]
    print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))
    print("  Min %s" % min(fits))
    print("  Max %s" % max(fits))
    print("  Avg %s" % mean)
    print("  Std %s" % std)

print("-- End of (successful) evolution --")

# Report the best individual found in any generation, not just the last one.
best_ind = hall_of_fame[0]
print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

Start of evolution SVM
  Evaluated 100 individuals
-- Generation 0 --
  Evaluated 54 individuals
Best individual is [1.029113730790076, 0.03703743006393346, 'rbf', 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1], (0.8260121775155358,)
  Min 0.6722867365513778
  Max 0.8260121775155358
  Avg 0.7998783503860402
  Std 0.023135989681699165
-- Generation 1 --
  Evaluated 54 individuals
Best individual is [1.657360108916588, 0.02437966485020074, 'rbf', 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1], (0.8271357730211537,)
  Min 0.7520243550310715
  Max 0.8271357730211537
  Avg 0.8115238842508316
  Std 0.013389528272698811
-- Generation 2 --
  Evaluated 50 individuals
Best individual is [0.5880409572959077, 0.0009302427369007562, 'linear', 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1], (0.8305065595380077,)
  Min 0.7340468269411838
  Max 0.8305065595380077
  Avg 0.814

In [None]:
# Rebuild the classifier and the training data encoded by the best individual.
model = getModel(best_ind)
Xy = getXy(best_ind)
# getXy returns [X, y, selected column names, fitted scaler].
colsSVM, scaler = Xy[2:4]
print(f'Selected Features: {colsSVM}')

Selected Features: ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'Young_m', 'Alone', 'Ttype2', 'Ticket_group', 'Young', 'AgeBand']


In [None]:
X_train, Y_train = Xy[0], Xy[1]

# Keep the per-fold scores in `scores`, as the other model cells do, and take the
# mean once. Previously the array was collapsed to a scalar and .mean() was then
# called on that scalar again.
scores = cross_val_score(model, X_train, Y_train, cv=5)
cv_SVMGA_score = scores.mean()
print(f'SVMGA CV score: {cv_SVMGA_score}')
model.fit( X_train , Y_train )
print(f'Training score: {model.score(X_train, Y_train)}')

SVMGA CV score: 0.8394827694432239
Training score: 0.8383838383838383
