# Genetic optimisation of SVC classifier for the SDSS data

## Load data

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
import warnings
import helpers
from helpers import DataSet
import matplotlib as mpl
import os
import random

# Common imports
import pandas as pd
import numpy as np
import seaborn as sns

# Imports for ML
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
sns.set_style('whitegrid')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "results"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

# Helper functioins and structures
# Ignore useless warnings (see SciPy issue #5998)
warnings.filterwarnings(action="ignore", message="^internal gelsd")

DATA_PATH = "Skyserver_SQL2_27_2018 6_51_39 PM.csv"
RESULTS_FOLDER = "results"

# We load the data. Those that have nothing to do with the features of the objects are ignored.
sdss_data = pd.read_csv(DATA_PATH)

# We have a general look at the features
sdss_data.head(3)

print(sdss_data.columns.values)
sdss_data.drop(['objid', 'run', 'rerun', 'camcol', 'field',
                'specobjid', 'fiberid', 'mjd', 'plate'], axis=1, inplace=True)
sdss_data.head(1)

sdss_df_fe = sdss_data

# Principal Component Analysis
pca = PCA(n_components=3)
ugriz = pca.fit_transform(sdss_df_fe[['u', 'g', 'r', 'i', 'z']])

# update dataframe 
sdss_df_fe = pd.concat((sdss_df_fe, pd.DataFrame(ugriz)), axis=1)
sdss_df_fe.rename({0: 'PCA_1', 1: 'PCA_2', 2: 'PCA_3'}, axis=1, inplace = True)
sdss_df_fe.drop(['u', 'g', 'r', 'i', 'z'], axis=1, inplace=True)
sdss_df_fe.head()

X = sdss_data.drop(['class'], axis=1)
y = sdss_data['class']

class_names = ["GALAXY", "QSO", "STAR"]

std_scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_test = std_scaler.fit_transform(X_test.astype(np.float64))
X_train = std_scaler.fit_transform(X_train.astype(np.float64))

['objid' 'ra' 'dec' 'u' 'g' 'r' 'i' 'z' 'run' 'rerun' 'camcol' 'field'
 'specobjid' 'class' 'redshift' 'plate' 'mjd' 'fiberid']


## Optimize

In [2]:
from evolutionary_search import EvolutionaryAlgorithmSearchCV

paramgrid = {
    "loss": ["deviance", "exponential"],
    "learning_rate": np.logspace(-2, 1, num=20, base=10),
    "n_estimators": np.logspace(1, 4, num=20, base=10, dtype=int),
    "criterion": ["friedman_mse", "mse", "mae"],
    "min_samples_split": np.arange(2, 10),
    "min_samples_leaf": np.arange(1, 10),
    "max_depth": np.arange(3, 9),
    "random_state": [42],
}

random.seed(1)

cv = EvolutionaryAlgorithmSearchCV(estimator=GradientBoostingClassifier(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=5),
                                   verbose=3,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=15,
                                   n_jobs=16,
                                   error_score=0.0)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = cv.fit(X_train, y_train)

Types [1, 2, 1, 1, 1, 1, 1, 1] and maxint [1, 19, 19, 2, 7, 8, 5, 0] detected
--- Evolve in 1036800 possible combinations ---
[CV] loss=deviance, learning_rate=6.951927961775605, n_estimators=20, criterion=mse, min_samples_split=3, min_samples_leaf=8, max_depth=6, random_state=42 
[CV] loss=exponential, learning_rate=0.08858667904100823, n_estimators=29, criterion=mse, min_samples_split=2, min_samples_leaf=7, max_depth=6, random_state=42 
[CV] loss=deviance, learning_rate=1.623776739188721, n_estimators=2335, criterion=mae, min_samples_split=5, min_samples_leaf=6, max_depth=4, random_state=42 
[CV] loss=exponential, learning_rate=0.18329807108324356, n_estimators=127, criterion=mae, min_samples_split=3, min_samples_leaf=6, max_depth=3, random_state=42 
[CV] loss=deviance, learning_rate=4.832930238571752, n_estimators=10, criterion=mse, min_samples_split=5, min_samples_leaf=7, max_depth=8, random_state=42 
[CV] loss=exponential, learning_rate=6.951927961775605, n_estimators=14, criterio

[CV] loss=exponential, learning_rate=3.359818286283781, n_estimators=42, criterion=mae, min_samples_split=5, min_samples_leaf=7, max_depth=3, random_state=42 
[CV] loss=exponential, learning_rate=3.359818286283781, n_estimators=545, criterion=mae, min_samples_split=7, min_samples_leaf=8, max_depth=5, random_state=42 
[CV]  loss=exponential, learning_rate=0.5455594781168517, n_estimators=2335, criterion=mae, min_samples_split=2, min_samples_leaf=8, max_depth=3, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=0.5455594781168517, n_estimators=2335, criterion=mae, min_samples_split=2, min_samples_leaf=8, max_depth=3, random_state=42 
[CV] loss=exponential, learning_rate=0.18329807108324356, n_estimators=127, criterion=mae, min_samples_split=3, min_samples_leaf=6, max_depth=3, random_state=42 
[CV] loss=exponential, learning_rate=6.951927961775605, n_estimators=4832, criterion=friedman_mse, min_samples_split=8, min_samples_leaf=8, max_depth=5, random_state=42 

[CV] loss=deviance, learning_rate=0.18329807108324356, n_estimators=3359, criterion=friedman_mse, min_samples_split=6, min_samples_leaf=5, max_depth=6, random_state=42 
[CV] loss=exponential, learning_rate=0.26366508987303583, n_estimators=10, criterion=mse, min_samples_split=3, min_samples_leaf=3, max_depth=8, random_state=42 
[CV]  loss=exponential, learning_rate=4.832930238571752, n_estimators=545, criterion=friedman_mse, min_samples_split=9, min_samples_leaf=9, max_depth=3, random_state=42, score=0.0, total=   0.0s
[CV]  loss=exponential, learning_rate=6.951927961775605, n_estimators=4832, criterion=friedman_mse, min_samples_split=8, min_samples_leaf=8, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV]  loss=exponential, learning_rate=0.26366508987303583, n_estimators=10, criterion=mse, min_samples_split=3, min_samples_leaf=3, max_depth=8, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=4.832930238571752, n_estimators=545, criterion=friedman

[CV]  loss=exponential, learning_rate=0.029763514416313176, n_estimators=10000, criterion=friedman_mse, min_samples_split=7, min_samples_leaf=5, max_depth=3, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=2.3357214690901213, n_estimators=29, criterion=friedman_mse, min_samples_split=6, min_samples_leaf=7, max_depth=5, random_state=42 
[CV]  loss=exponential, learning_rate=0.08858667904100823, n_estimators=545, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=7, max_depth=7, random_state=42, score=0.0, total=   0.0s
[CV]  loss=exponential, learning_rate=2.3357214690901213, n_estimators=29, criterion=friedman_mse, min_samples_split=6, min_samples_leaf=7, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV]  loss=exponential, learning_rate=0.01438449888287663, n_estimators=263, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=1, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=0.088

[CV]  loss=exponential, learning_rate=0.37926901907322497, n_estimators=10000, criterion=mae, min_samples_split=6, min_samples_leaf=6, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=0.37926901907322497, n_estimators=10000, criterion=mae, min_samples_split=6, min_samples_leaf=6, max_depth=5, random_state=42 
[CV]  loss=exponential, learning_rate=0.37926901907322497, n_estimators=10000, criterion=mae, min_samples_split=6, min_samples_leaf=6, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=0.37926901907322497, n_estimators=10000, criterion=mae, min_samples_split=6, min_samples_leaf=6, max_depth=5, random_state=42 
[CV]  loss=exponential, learning_rate=0.37926901907322497, n_estimators=10000, criterion=mae, min_samples_split=6, min_samples_leaf=6, max_depth=5, random_state=42, score=0.0, total=   0.0s
[CV]  loss=deviance, learning_rate=4.832930238571752, n_estimators=10, criterion=mse, min_samples_spli

[CV] loss=deviance, learning_rate=0.01438449888287663, n_estimators=20, criterion=friedman_mse, min_samples_split=4, min_samples_leaf=3, max_depth=7, random_state=42 
[CV] loss=deviance, learning_rate=0.7847599703514611, n_estimators=42, criterion=friedman_mse, min_samples_split=4, min_samples_leaf=8, max_depth=8, random_state=42 
[CV]  loss=deviance, learning_rate=0.0206913808111479, n_estimators=20, criterion=friedman_mse, min_samples_split=9, min_samples_leaf=1, max_depth=5, random_state=42, score=0.9906666666666667, total=   0.9s
[CV] loss=deviance, learning_rate=0.0206913808111479, n_estimators=20, criterion=friedman_mse, min_samples_split=9, min_samples_leaf=1, max_depth=5, random_state=42 
[CV]  loss=deviance, learning_rate=6.951927961775605, n_estimators=20, criterion=mse, min_samples_split=3, min_samples_leaf=8, max_depth=6, random_state=42, score=0.596, total=   1.2s
[CV] loss=deviance, learning_rate=6.951927961775605, n_estimators=20, criterion=mse, min_samples_split=3, min_

[CV]  loss=exponential, learning_rate=2.3357214690901213, n_estimators=379, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=6, max_depth=3, random_state=42, score=0.0, total=   0.0s
[CV] loss=exponential, learning_rate=2.3357214690901213, n_estimators=379, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=6, max_depth=3, random_state=42 
[CV]  loss=exponential, learning_rate=2.3357214690901213, n_estimators=379, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=6, max_depth=3, random_state=42, score=0.0, total=   0.0s
[CV] loss=deviance, learning_rate=0.26366508987303583, n_estimators=10000, criterion=mse, min_samples_split=9, min_samples_leaf=7, max_depth=5, random_state=42 
[CV]  loss=deviance, learning_rate=0.18329807108324356, n_estimators=29, criterion=mse, min_samples_split=7, min_samples_leaf=9, max_depth=6, random_state=42, score=0.996, total=   1.6s
[CV] loss=deviance, learning_rate=0.18329807108324356, n_estimators=29, criterion=mse, min_

[CV] loss=deviance, learning_rate=0.7847599703514611, n_estimators=263, criterion=friedman_mse, min_samples_split=7, min_samples_leaf=3, max_depth=5, random_state=42 
[CV]  loss=deviance, learning_rate=0.7847599703514611, n_estimators=263, criterion=friedman_mse, min_samples_split=7, min_samples_leaf=3, max_depth=5, random_state=42, score=0.06270847231487658, total=   8.9s
[CV] loss=deviance, learning_rate=0.7847599703514611, n_estimators=20, criterion=mse, min_samples_split=3, min_samples_leaf=2, max_depth=3, random_state=42 
[CV]  loss=deviance, learning_rate=0.7847599703514611, n_estimators=20, criterion=mse, min_samples_split=3, min_samples_leaf=2, max_depth=3, random_state=42, score=0.9813457694870087, total=   0.8s
[CV] loss=deviance, learning_rate=0.7847599703514611, n_estimators=20, criterion=mse, min_samples_split=3, min_samples_leaf=2, max_depth=3, random_state=42 
[CV]  loss=deviance, learning_rate=0.7847599703514611, n_estimators=20, criterion=mse, min_samples_split=3, min_

[CV] loss=deviance, learning_rate=0.01, n_estimators=4832, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=8, max_depth=4, random_state=42 
[CV]  loss=deviance, learning_rate=0.18329807108324356, n_estimators=3359, criterion=friedman_mse, min_samples_split=6, min_samples_leaf=5, max_depth=6, random_state=42, score=0.9879919946631087, total=  34.7s
[CV] loss=deviance, learning_rate=0.18329807108324356, n_estimators=3359, criterion=friedman_mse, min_samples_split=6, min_samples_leaf=5, max_depth=6, random_state=42 
[CV]  loss=deviance, learning_rate=3.359818286283781, n_estimators=1623, criterion=friedman_mse, min_samples_split=2, min_samples_leaf=7, max_depth=8, random_state=42, score=0.6402398401065956, total= 2.0min
[CV] loss=deviance, learning_rate=3.359818286283781, n_estimators=1623, criterion=friedman_mse, min_samples_split=2, min_samples_leaf=7, max_depth=8, random_state=42 
[CV]  loss=deviance, learning_rate=0.26366508987303583, n_estimators=10, criterion=mae, min_

[CV] loss=deviance, learning_rate=0.12742749857031335, n_estimators=61, criterion=mae, min_samples_split=3, min_samples_leaf=7, max_depth=6, random_state=42 
[CV]  loss=deviance, learning_rate=0.01, n_estimators=4832, criterion=friedman_mse, min_samples_split=5, min_samples_leaf=8, max_depth=4, random_state=42, score=0.9926617745163442, total= 1.7min
[CV]  loss=deviance, learning_rate=3.359818286283781, n_estimators=1623, criterion=friedman_mse, min_samples_split=2, min_samples_leaf=7, max_depth=8, random_state=42, score=0.5156771180787192, total= 1.7min
[CV]  loss=deviance, learning_rate=3.359818286283781, n_estimators=10000, criterion=mse, min_samples_split=4, min_samples_leaf=4, max_depth=4, random_state=42, score=0.9820119920053297, total= 3.4min
[CV] loss=deviance, learning_rate=3.359818286283781, n_estimators=10000, criterion=mse, min_samples_split=4, min_samples_leaf=4, max_depth=4, random_state=42 
[CV]  loss=deviance, learning_rate=10.0, n_estimators=6951, criterion=mse, min_s

[CV]  loss=deviance, learning_rate=0.26366508987303583, n_estimators=263, criterion=mae, min_samples_split=4, min_samples_leaf=7, max_depth=7, random_state=42, score=0.983344437041972, total=17.6min
[CV] loss=deviance, learning_rate=0.26366508987303583, n_estimators=263, criterion=mae, min_samples_split=4, min_samples_leaf=7, max_depth=7, random_state=42 
[CV]  loss=deviance, learning_rate=0.26366508987303583, n_estimators=127, criterion=mae, min_samples_split=9, min_samples_leaf=3, max_depth=7, random_state=42, score=0.9859906604402935, total= 8.6min
[CV] loss=deviance, learning_rate=0.26366508987303583, n_estimators=127, criterion=mae, min_samples_split=9, min_samples_leaf=3, max_depth=7, random_state=42 
[CV]  loss=deviance, learning_rate=0.26366508987303583, n_estimators=127, criterion=mae, min_samples_split=9, min_samples_leaf=3, max_depth=7, random_state=42, score=0.989993328885924, total= 8.7min
[CV]  loss=deviance, learning_rate=0.0206913808111479, n_estimators=379, criterion=m

Process ForkPoolWorker-15:
Process ForkPoolWorker-14:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-6:
Process ForkPoolWorker-13:
Process ForkPoolWorker-12:
Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-9:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run

KeyboardInterrupt
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    re

KeyboardInterrupt: 

Process ForkPoolWorker-16:
Traceback (most recent call last):
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/site-packages/evolutionary_search/cv.py", line 104, in _evalFunction
    error_score=error_score)[0]
  File "/home/ljosfer/anaconda3/envs/deep/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 528, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ljosfer/anaconda3/envs/deep/lib/py