# Classification

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('whitegrid')
from sklearn.model_selection import train_test_split
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "results"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

# Helper functioins and structures
import helpers
# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## Voting Classifiers

### Load data

In [3]:
DATA_PATH = "Skyserver_SQL2_27_2018 6_51_39 PM.csv"
RESULTS_FOLDER = "results"

# We load the data. Those that have nothing to do with the features of the objects are ignored.
sdss_data=pd.read_csv(DATA_PATH)

#We have a general look at the features
sdss_data.head(3)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513


### Drop not important features

In [4]:
print(sdss_data.columns.values)
sdss_data.drop(['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid','fiberid','mjd','plate'], axis=1, inplace=True)
sdss_data.head(1)

['objid' 'ra' 'dec' 'u' 'g' 'r' 'i' 'z' 'run' 'rerun' 'camcol' 'field'
 'specobjid' 'class' 'redshift' 'plate' 'mjd' 'fiberid']


Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,STAR,-9e-06


### Split into train/test sets

In [13]:
from sklearn.model_selection import train_test_split

X=sdss_data.drop(['class'],axis=1)
y=sdss_data['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Ensable learining with RandForest, LogReg, SVC

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

log_clf = LogisticRegression(solver="liblinear", random_state=42, multi_class='auto')
rnd_clf = RandomForestClassifier(n_estimators=3, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)
sgd_clf = SGDClassifier(max_iter=10000, tol=-np.infty, random_state=42)
knn_clf = KNeighborsClassifier()

voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
        ('sgd', sgd_clf), 
        ('knn', knn_clf)],
    voting='hard')

### Accuracy Scores for voting ensamble classifier (Hard voting)

In [51]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, sgd_clf, knn_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9396
RandomForestClassifier 0.986
SVC 0.8208
SGDClassifier 0.89
KNeighborsClassifier 0.7716
VotingClassifier 0.94


In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

classifs = ['log_clf','rnd_clf','svm_clf','sgd_clf','knn_clf','voting_clf']
scores = []

scores.append(cross_val_score(log_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(rnd_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(svm_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(knn_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))
scores.append(cross_val_score(voting_clf, X_train_scaled, y_train, cv=5, scoring="accuracy"))

def print_cv_scores(scores):
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

for item in zip(classifs, scores):
    print(item[0])
    print(item[1])
    print_cv_scores(item[1])
    print("\n")

log_clf
[0.94470353 0.94603598 0.96066667 0.95263509 0.94796531]
Accuracy: 0.95 (+/- 0.01)


rnd_clf
[0.98667555 0.98334444 0.98866667 0.98532355 0.99066044]
Accuracy: 0.99 (+/- 0.01)


svm_clf
[0.9586942  0.95003331 0.966      0.9646431  0.96197465]
Accuracy: 0.96 (+/- 0.01)


sgd_clf
[0.96469021 0.95936043 0.97333333 0.97398266 0.96664443]
Accuracy: 0.97 (+/- 0.01)


knn_clf
[0.90206529 0.88607595 0.908      0.8945964  0.89793195]
Accuracy: 0.90 (+/- 0.01)


voting_clf
[0.97534977 0.97201865 0.98533333 0.97931955 0.97731821]
Accuracy: 0.98 (+/- 0.01)




### Accuracy Scores for voting ensamble classifier (Soft voting)

In [65]:
log_clf = LogisticRegression(solver="liblinear", random_state=42,multi_class='auto')
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', Rando...bf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [66]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.9396
RandomForestClassifier 0.992
SVC 0.8208
VotingClassifier 0.974


## Bagging ensembles

In [24]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9852


In [26]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

0.9824
