# Optimization of Voting classifier for the SDSS data

## Load data

In [5]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
import warnings
import helpers
from helpers import DataSet
import matplotlib as mpl
import os
import random

# Common imports
import pandas as pd
import numpy as np
import seaborn as sns

# Imports for ML
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier, RidgeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler, StandardScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
sns.set_style('whitegrid')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "results"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

# Helper functioins and structures
# Ignore useless warnings (see SciPy issue #5998)
warnings.filterwarnings(action="ignore", message="^internal gelsd")

DATA_PATH = "Skyserver_SQL2_27_2018 6_51_39 PM.csv"
RESULTS_FOLDER = "results"

# We load the data. Those that have nothing to do with the features of the objects are ignored.
sdss_data = pd.read_csv(DATA_PATH)

# We have a general look at the features
sdss_data.head(3)

print(sdss_data.columns.values)
sdss_data.drop(['objid', 'run', 'rerun', 'camcol', 'field',
                'specobjid', 'fiberid', 'mjd', 'plate'], axis=1, inplace=True)
sdss_data.head(1)

sdss_df_fe = sdss_data

# Principal Component Analysis
pca = PCA(n_components=3)
ugriz = pca.fit_transform(sdss_df_fe[['u', 'g', 'r', 'i', 'z']])

# update dataframe
sdss_df_fe = pd.concat((sdss_df_fe, pd.DataFrame(ugriz)), axis=1)
sdss_df_fe.rename({0: 'PCA_1', 1: 'PCA_2', 2: 'PCA_3'}, axis=1, inplace=True)
sdss_df_fe.drop(['u', 'g', 'r', 'i', 'z'], axis=1, inplace=True)
sdss_df_fe.head()

X = sdss_data.drop(['class'], axis=1)
y = sdss_data['class']

class_names = ["GALAXY", "QSO", "STAR"]

std_scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_test = std_scaler.fit_transform(X_test.astype(np.float64))
X_train = std_scaler.fit_transform(X_train.astype(np.float64))

['objid' 'ra' 'dec' 'u' 'g' 'r' 'i' 'z' 'run' 'rerun' 'camcol' 'field'
 'specobjid' 'class' 'redshift' 'plate' 'mjd' 'fiberid']


## Optimize

In [6]:
def prepare_clfs():
    knn_clf = KNeighborsClassifier(n_neighbors=3,
                                  weights='distance', 
                                  algorithm='auto',
                                  leaf_size=23, 
                                  p=2,
                                  metric='manhattan', 
                                  n_jobs=4)

    lsvm_clf = LinearSVC(penalty='l2',
                       loss='squared_hinge',
                       dual=False,
                       C=303919.53823132074,
                       multi_class='ovr',
                       fit_intercept=True,
                       intercept_scaling=1.0,
                       class_weight='balanced',
                       verbose=False,
                       max_iter=10000,
                       random_state=42)
    
    nusvm_clf = NuSVC(nu=0.01743328822199991,
                   kernel="rbf",
                   gamma=0.01743328822199991, 
                   degree=7,
                   coef0=4.5203536563602496e-05, 
                   shrinking=True,
                   probability=True,
                   tol=0.001,
                   cache_size=20, 
                   class_weight="balanced",
                   verbose=False,
                   max_iter=100000,
                   random_state=42)

    rbf_svm_clf = SVC(kernel="rbf",
                      C=18329.8071083243,
                      gamma=0.004281332398719387,
                      degree=8,
                      coef0=0.0007847599703514606,
                      shrinking=True,
                      probability=True,
                      tol=0.001,
                      cache_size=100,
                      class_weight=None,
                      verbose=False,
                      max_iter=-1,
                      random_state=42)

    poly_svm_clf = SVC(kernel='poly',
                 C=788.0462815669937,
                 gamma=0.0004893900918477499, 
                 degree=4,
                 coef0=22.122162910704503, 
                 shrinking=False,
                 probability=True, 
                 tol=0.001, 
                 cache_size=50,
                 class_weight=None, 
                 verbose=False, 
                 max_iter=100000, 
                 random_state=42)

    tree_clf = DecisionTreeClassifier(criterion="entropy",
                                      splitter="best",
                                      max_depth=3,
                                      min_weight_fraction_leaf=0.0014384498882876629,
                                      max_leaf_nodes=9,
                                      presort=True,
                                      min_samples_split=6,
                                      min_samples_leaf=4,
                                      max_features=None,
                                      random_state=42)

    rnd_clf = RandomForestClassifier(n_estimators=100,
                                     criterion="entropy",
                                     min_samples_split=4,
                                     min_samples_leaf=2,
                                     max_features=None,
                                     bootstrap=True,
                                     oob_score=True,
                                     random_state=42)

    log_clf = LogisticRegression(penalty='l1',
                                 dual=False,
                                 C=31.622776601683793,
                                 fit_intercept=True,
                                 intercept_scaling=0.5878016072274912,
                                 class_weight='balanced',
                                 solver='saga',
                                 multi_class='multinomial',
                                 warm_start=False,
                                 n_jobs=4,
                                 max_iter=1000,
                                 random_state=42)

    gb_clf = GradientBoostingClassifier(loss="deviance",
                                        learning_rate=0.18329807108324356,
                                        n_estimators=83,
                                        criterion="friedman_mse",
                                        min_samples_split=3,
                                        min_samples_leaf=7,
                                        max_depth=7,
                                        random_state=42)

    xgb_clf = XGBClassifier(eta=0.01778279410038923,
                            gamma=0.01,
                            max_depth=4,
                            min_child_weight=0.03831186849557289,
                            max_delta_step=0.35938136638046275,
                            subsample=1.0,
                            reg_lambda=5.994842503189409,
                            alpha=1.7782794100389228e-08,
                            colsample_bytree=0.9,
                            objective="multi:softprob",
                            seed=42,
                            n_estimators=4216,
                            nthread=4)

    bag_df_clf = BaggingClassifier(n_estimators=261,
                                   max_samples=46,
                                   max_features=6,
                                   bootstrap=False,
                                   bootstrap_features=False,
                                   oob_score=False,
                                   n_jobs=4,
                                   random_state=42)

    ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                 n_estimators=825,
                                 learning_rate=0.19573417814876617,
                                 algorithm='SAMME',
                                 random_state=42)

    sgd_clf = SGDClassifier(loss='perceptron',
                            penalty='l1',
                            alpha=0.00510896977450693,
                            l1_ratio=0.014677992676220698,
                            fit_intercept=True,
                            shuffle=True,
                            epsilon=0.4216965034285822,
                            n_jobs=4,
                            random_state=42,
                            learning_rate='optimal',
                            eta0=0.2,
                            power_t=0.014677992676220698,
                            early_stopping=True,
                            n_iter_no_change=5,
                            class_weight=None,
                            average=False)

    mlp_clf = MLPClassifier(hidden_layer_sizes=(20, 20),
                           activation='relu',
                           solver='sgd', 
                           alpha=1e-05, 
                           batch_size=3, 
                           learning_rate='adaptive',
                           learning_rate_init=0.1333521432163324, 
                           power_t=0.004216965034285823, 
                           max_iter=1000,
                           random_state=42,
                           momentum=0.1,
                           nesterovs_momentum=True, 
                           early_stopping=True,
                           beta_1=0.7, 
                           beta_2=0.4210900698456838, 
                           epsilon=1e-08,
                           n_iter_no_change=5)

    xtree_clf = ExtraTreesClassifier(n_estimators=2154,
                                     criterion="gini",
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     max_features=None,
                                     bootstrap=False,
                                     oob_score=False,
                                     random_state=42)

    pa_clf = PassiveAggressiveClassifier(loss='hinge',
                                         shuffle=False,
                                         n_jobs=4,
                                         random_state=42,
                                         early_stopping=True,
                                         warm_start=False,
                                         class_weight='balanced',
                                         average=False)

    r_clf = RidgeClassifier(alpha=0.11006941712522103,
                            fit_intercept=False,
                            normalize=False,
                            copy_X=True,
                            max_iter=3000,
                            class_weight=None,
                            solver='cholesky',
                            random_state=42)

    nb_clf = GaussianNB(var_smoothing=3.727593720314938e-11)

    lda_clf = LinearDiscriminantAnalysis(
        solver="svd", shrinkage=None, n_components=None, store_covariance=False)

    qda_clf = QuadraticDiscriminantAnalysis(
        reg_param=2.1544346900318867e-07, store_covariance=False)

    clf_names = ["KNN",
                 "LinearSVM",
                 "NuSVC",
                 "RbfKernelSVM",
                 "PolyKernelSVM",
                 "DecisionTreeClassifier",
                 "RandomForestClassifier",
                 "LogisticRegression",
                 "GradientBoostingClassifier",
                 "XGBClassifier",
                 "BaggingClassifier",
                 "AdaBoostClassifier",
                 "SGDClassifier",
                 "MLPClassifier",
                 "ExtraTreesClassifier",
                 "PassiveAggressiveClassifier",
                 "RidgeClassifier",
                 "GaussianNB",
                 "LinearDiscriminantAnalysis",
                 "QuadraticDiscriminantAnalysis"
                 ]

    clfs = [knn_clf,
            lsvm_clf,
            nusvm_clf,
            rbf_svm_clf,
            poly_svm_clf,
            tree_clf,
            rnd_clf,
            log_clf,
            gb_clf,
            xgb_clf,
            bag_df_clf,
            ada_clf,
            sgd_clf,
            mlp_clf,
            xtree_clf,
            pa_clf,
            r_clf,
            nb_clf,
            lda_clf,
            qda_clf
            ]

    return list(zip(clf_names, clfs))

In [7]:
def get_random_clf_sets(clfs, num=5):
    clf_sets = []

    for i in range(0, num):
        clf_num = random.randint(3, len(clfs))
        clf_list = random.sample(clfs, clf_num)
        clf_sets.append(clf_list)
    
    return clf_sets

all_clfs = prepare_clfs()
clf_choices = get_random_clf_sets(all_clfs, 300)

In [8]:
def print_choices(clf_choices):
    i = 0
    for clf_tuple in clf_choices:
        print("=======Choice " + str(i) + "=======")
        clfnames = []
        for clf in clf_tuple:
            clfnames.append(clf[0])
        i = i + 1
        print(clfnames)
        
def print_choices_single(choice, i=None):
    if (i is None): 
        print("=======Choice=======")
    else:
        print("=======Choice " + str(i) + "=======")
    clfnames = []
    for clf in choice:
        clfnames.append(clf[0])
    print(clfnames)
    return str(clfnames)

    
print_choices(clf_choices)

['LinearDiscriminantAnalysis', 'AdaBoostClassifier', 'RbfKernelSVM', 'RidgeClassifier', 'SGDClassifier', 'RandomForestClassifier', 'BaggingClassifier', 'PassiveAggressiveClassifier', 'XGBClassifier', 'MLPClassifier', 'KNN', 'LogisticRegression', 'GradientBoostingClassifier', 'LinearSVM', 'NuSVC', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'PolyKernelSVM']
['RbfKernelSVM', 'NuSVC', 'PolyKernelSVM', 'KNN', 'GaussianNB', 'DecisionTreeClassifier', 'LinearDiscriminantAnalysis', 'PassiveAggressiveClassifier', 'RidgeClassifier', 'AdaBoostClassifier', 'XGBClassifier', 'LinearSVM', 'RandomForestClassifier', 'LogisticRegression', 'GradientBoostingClassifier', 'ExtraTreesClassifier', 'MLPClassifier', 'QuadraticDiscriminantAnalysis', 'BaggingClassifier']
['PassiveAggressiveClassifier', 'LinearSVM', 'GradientBoostingClassifier', 'MLPClassifier', 'RbfKernelSVM', 'LogisticRegression', 'RidgeClassifier', 'GaussianNB', 'SGDClassifier']
['LinearDiscriminantAnalysis', 'RbfKernelSVM', 'DecisionTree

In [9]:
def voting_grid():
    result_sets = []
    i = 0
    for clf_choice in clf_choices:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            voting_clf = VotingClassifier(estimators=clf_choice, n_jobs=8)
            print_choices_single(clf_choice, i)
            helpers.train_classif_single(voting_clf, clf_choice[0], class_names, X_train, y_train, X_test, y_test, result_sets)
            i = i + 1

In [None]:
voting_grid()

['LinearDiscriminantAnalysis', 'AdaBoostClassifier', 'RbfKernelSVM', 'RidgeClassifier', 'SGDClassifier', 'RandomForestClassifier', 'BaggingClassifier', 'PassiveAggressiveClassifier', 'XGBClassifier', 'MLPClassifier', 'KNN', 'LogisticRegression', 'GradientBoostingClassifier', 'LinearSVM', 'NuSVC', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'PolyKernelSVM']
[0.98734177 0.98467688 0.99333333 0.98999333 0.99066044]
Accuracy: 0.989 (+/- 0.006)
('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)) :
 [[1228    4   14]
 [  12  230    0]
 [   0    0 1012]]
              precision    recall  f1-score   support

      GALAXY       0.99      0.99      0.99      1246
         QSO       0.98      0.95      0.97       242
        STAR       0.99      1.00      0.99      1012

   micro avg       0.99      0.99      0.99      2500
   macro avg       0.99      0.98      0.98      

In [None]:
best_indiv = VotingClassifier() # todo

clf_names = ["ExtraTreesClassifier"]
class_names = ["GALAXY", "QSO", "STAR"]

clfs = [best_indiv]
data_sets = []
std_scaled_set = DataSet("Standard Scaled", X_train, y_train, X_test, y_test)
data_sets.append(std_scaled_set)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    helpers.learning_loop_for_sets(clfs, clf_names, class_names, data_sets)