# Classification with standard parameters

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "results"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

# Helper functioins and structures
import helpers
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## Voting Classifiers

### Load data

In [2]:
DATA_PATH = "Skyserver_SQL2_27_2018 6_51_39 PM.csv"
RESULTS_FOLDER = "results"

# We load the data. Those that have nothing to do with the features of the objects are ignored.
sdss_data=pd.read_csv(DATA_PATH)

#We have a general look at the features
sdss_data.head(3)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513


### Drop not important features

In [3]:
print(sdss_data.columns.values)
sdss_data.drop(['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid','fiberid','mjd','plate'], axis=1, inplace=True)
sdss_data.head(1)

['objid' 'ra' 'dec' 'u' 'g' 'r' 'i' 'z' 'run' 'rerun' 'camcol' 'field'
 'specobjid' 'class' 'redshift' 'plate' 'mjd' 'fiberid']


Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,STAR,-9e-06


### Split into train/test sets

In [4]:
from sklearn.model_selection import train_test_split

X=sdss_data.drop(['class'],axis=1)
y=sdss_data['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Ensable learining with RandForest, LogReg, SVC

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Xgboost parameters
xgb_params = {'learning_rate': 0.05, 
              'max_depth': 4,
              'subsample': 0.9,        
              'colsample_bytree': 0.9,
              'objective': 'binary:logistic',
              'silent': 1, 
              'n_estimators':100, 
              'gamma':1,         
              'min_child_weight':4}   
xgb_clf = XGBClassifier(**xgb_params, seed = 10)
log_clf = LogisticRegression(solver="liblinear", random_state=42, multi_class='auto')
rnd_clf = RandomForestClassifier(n_estimators=3, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)
knn_clf = KNeighborsClassifier()

voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
        ('knn', knn_clf)],
    voting='hard')

### Accuracy Scores for voting ensamble classifier (Hard voting)

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# With scaled sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
X_test_scaled = scaler.fit_transform(X_test.astype(np.float64))

for clf in (xgb_clf, log_clf, rnd_clf, svm_clf, knn_clf, voting_clf):
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

XGBClassifier 0.986
LogisticRegression 0.9432
RandomForestClassifier 0.9772
SVC 0.9464
SGDClassifier 0.9528
KNeighborsClassifier 0.8968
VotingClassifier 0.9636


In [7]:
# Without scalint the sets
for clf in (xgb_clf, log_clf, rnd_clf, svm_clf, knn_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

XGBClassifier 0.9932
LogisticRegression 0.9396
RandomForestClassifier 0.986
SVC 0.8208
SGDClassifier 0.89
KNeighborsClassifier 0.7716
VotingClassifier 0.9632


In [8]:
classifs = ['xgb_clf','log_clf','rnd_clf','svm_clf','sgd_clf','knn_clf','voting_clf']
scores = []

scores.append(cross_val_score(xgb_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(log_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(rnd_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(svm_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(knn_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(voting_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))

In [9]:
def print_cv_scores(scores):
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

for item in zip(classifs, scores):
    print(item[0])
    print(item[1])
    print_cv_scores(item[1])
    print("\n")

xgb_clf
[0.98800799 0.98467688 0.99066667 0.98865911 0.993996  ]
Accuracy: 0.989 (+/- 0.006)


log_clf
[0.94470353 0.94603598 0.96066667 0.95263509 0.94796531]
Accuracy: 0.950 (+/- 0.012)


rnd_clf
[0.98001332 0.98134577 0.98066667 0.97865243 0.98532355]
Accuracy: 0.981 (+/- 0.004)


svm_clf
[0.9586942  0.95003331 0.966      0.9646431  0.96197465]
Accuracy: 0.960 (+/- 0.011)


sgd_clf
[0.96469021 0.95936043 0.97333333 0.97398266 0.96664443]
Accuracy: 0.968 (+/- 0.011)


knn_clf
[0.90206529 0.88607595 0.908      0.8945964  0.89793195]
Accuracy: 0.898 (+/- 0.015)


voting_clf
[0.9733511  0.97001999 0.98133333 0.97865243 0.97731821]
Accuracy: 0.976 (+/- 0.008)


