## Supervised Machine Learning - Support Vector Machines (Star vs. QSO)

### Dependencies:

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import extract

import sqlite3

In [2]:
# Create db connection to sqlite3 and cursor to execute queries

conn = sqlite3.connect("stellar_class_db")
c = conn.cursor()

In [3]:
# Load the data from sqlite
data_df = pd.read_sql_query("SELECT * FROM quasars_stars", conn)
data_df

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,mjd,fiber_ID
0,1.237680e+18,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,5.658977e+18,QSO,1.424659,5026,55855,741
1,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.246262e+19,QSO,0.586455,11069,58456,113
2,1.237671e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,2.751763e+18,STAR,-0.000008,2444,54082,232
3,1.237681e+18,345.801874,32.672868,23.17274,20.14496,19.41948,19.22034,18.89359,8157,301,2,38,7.323011e+18,STAR,0.000072,6504,56540,574
4,1.237679e+18,353.201522,3.080796,24.54890,21.44267,20.95315,20.79360,20.48442,7712,301,5,284,4.822278e+18,STAR,-0.000429,4283,55864,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40550,1.237655e+18,134.347759,47.771911,24.17897,22.89025,21.26451,20.94941,19.87500,2243,301,4,128,8.458979e+18,QSO,0.219966,7513,56780,339
40551,1.237662e+18,222.761686,32.203212,20.23421,19.76480,19.46940,19.36135,19.21768,3900,301,1,583,4.356138e+18,QSO,0.398574,3869,55273,112
40552,1.237656e+18,259.504325,31.462416,24.86685,23.22772,21.82982,21.80359,21.56733,2335,301,5,107,1.312146e+19,QSO,1.239638,11654,58543,812
40553,1.237661e+18,217.958430,52.316738,24.15617,22.05986,21.80826,21.95129,21.24179,3705,301,2,124,7.916218e+18,QSO,1.067543,7031,56449,58


In [4]:
data_df.set_index(['spec_obj_ID'], inplace = True)

In [5]:
data_df

Unnamed: 0_level_0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,class,redshift,plate,mjd,fiber_ID
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5.658977e+18,1.237680e+18,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,QSO,1.424659,5026,55855,741
1.246262e+19,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,QSO,0.586455,11069,58456,113
2.751763e+18,1.237671e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,STAR,-0.000008,2444,54082,232
7.323011e+18,1.237681e+18,345.801874,32.672868,23.17274,20.14496,19.41948,19.22034,18.89359,8157,301,2,38,STAR,0.000072,6504,56540,574
4.822278e+18,1.237679e+18,353.201522,3.080796,24.54890,21.44267,20.95315,20.79360,20.48442,7712,301,5,284,STAR,-0.000429,4283,55864,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8.458979e+18,1.237655e+18,134.347759,47.771911,24.17897,22.89025,21.26451,20.94941,19.87500,2243,301,4,128,QSO,0.219966,7513,56780,339
4.356138e+18,1.237662e+18,222.761686,32.203212,20.23421,19.76480,19.46940,19.36135,19.21768,3900,301,1,583,QSO,0.398574,3869,55273,112
1.312146e+19,1.237656e+18,259.504325,31.462416,24.86685,23.22772,21.82982,21.80359,21.56733,2335,301,5,107,QSO,1.239638,11654,58543,812
7.916218e+18,1.237661e+18,217.958430,52.316738,24.15617,22.05986,21.80826,21.95129,21.24179,3705,301,2,124,QSO,1.067543,7031,56449,58


In [6]:
 new_data_df=data_df.drop(['alpha','delta','run_ID','rerun_ID', 'cam_col', 'field_ID', "obj_ID",'plate', 'mjd', 'fiber_ID'], axis = 1 )

In [7]:
new_data_df

Unnamed: 0_level_0,u,g,r,i,z,class,redshift
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5.658977e+18,23.48827,23.33776,21.32195,20.25615,19.54544,QSO,1.424659
1.246262e+19,21.46973,21.17624,20.92829,20.60826,20.42573,QSO,0.586455
2.751763e+18,21.74669,20.03493,19.17553,18.81823,18.65422,STAR,-0.000008
7.323011e+18,23.17274,20.14496,19.41948,19.22034,18.89359,STAR,0.000072
4.822278e+18,24.54890,21.44267,20.95315,20.79360,20.48442,STAR,-0.000429
...,...,...,...,...,...,...,...
8.458979e+18,24.17897,22.89025,21.26451,20.94941,19.87500,QSO,0.219966
4.356138e+18,20.23421,19.76480,19.46940,19.36135,19.21768,QSO,0.398574
1.312146e+19,24.86685,23.22772,21.82982,21.80359,21.56733,QSO,1.239638
7.916218e+18,24.15617,22.05986,21.80826,21.95129,21.24179,QSO,1.067543


### Variable engineering

In [8]:
frequency = [1, 2, 3]

u_size_bins = [-10000, -6600,-3310, 33]
new_data_df["binned_u"] = pd.cut(new_data_df["u"], u_size_bins, labels = frequency)

g_size_bins = [-10000, -6600,-3310, 33]
new_data_df["binned_g"] = pd.cut(new_data_df["g"], g_size_bins, labels = frequency)

r_size_bins = [9,16,23, 30]
new_data_df["binned_r"] = pd.cut(new_data_df["r"], r_size_bins, labels = frequency)

i_size_bins = [9,17,25, 33]
new_data_df["binned_i"] = pd.cut(new_data_df["i"], i_size_bins, labels = frequency)

z_size_bins = [-10000, -6600,-3310, 30]
new_data_df["binned_z"] = pd.cut(new_data_df["z"], z_size_bins, labels = frequency)

new_data_df.sample(30)

Unnamed: 0_level_0,u,g,r,i,z,class,redshift,binned_u,binned_g,binned_r,binned_i,binned_z
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
9.275216e+18,22.22902,21.5462,21.4015,21.06582,20.80958,QSO,1.678098,3,3,2,2,3
8.839637e+18,24.0829,22.48999,21.78066,21.58501,21.40499,STAR,-0.000274,3,3,2,2,3
1.46713e+18,19.37807,19.05975,19.06641,18.95803,19.10008,QSO,0.653966,3,3,2,2,3
3.090646e+18,20.25166,18.45223,17.47678,17.1051,16.75995,STAR,-0.003555,3,3,2,2,3
6.510056e+18,22.33583,21.69463,21.52573,21.43511,21.76298,STAR,0.000406,3,3,2,2,3
9.836065e+18,21.99281,21.40838,21.15975,20.93182,20.92116,QSO,2.098777,3,3,2,2,3
2.588575e+18,16.89916,15.75342,15.41821,15.30415,15.2818,STAR,-6.8e-05,3,3,1,1,3
2.766362e+18,18.49339,17.00476,16.47646,16.29245,16.20384,STAR,6.4e-05,3,3,2,1,3
6.676752e+17,24.06136,21.08086,20.82272,20.16796,19.93058,QSO,1.617416,3,3,2,2,3
2.157388e+18,20.37372,18.46994,17.66082,17.37993,17.22625,STAR,6.2e-05,3,3,2,2,3


### Test and Train Data

In [9]:
# Create our features
X = new_data_df.drop('class', axis=1)

# Create our target
y = new_data_df['class']

print(X.shape)
print(y.shape)

(40555, 11)
(40555,)


In [10]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
Counter(y_train)

Counter({'QSO': 14221, 'STAR': 16195})

### Random Oversampling

In [11]:
# Resample the training data with the RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'QSO': 16195, 'STAR': 16195})

### Support Vector Machine (Star vs. QSO) - Oversampling

In [12]:
# Train the Support Vector Machine Model using the resampled data
model = SVC(kernel='linear')
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [13]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.998945147679325

In [14]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test,y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        QSO       1.00      1.00      1.00      1.00      1.00      1.00      4740
       STAR       1.00      1.00      1.00      1.00      1.00      1.00      5399

avg / total       1.00      1.00      1.00      1.00      1.00      1.00     10139



In [15]:
# Display the confusion matrix
c= confusion_matrix(y_test,y_pred)
cmros = pd.DataFrame(c, index=["Actual - Galaxy", "Actual - QSO"], columns=["Predicted - Galaxy", "Predicted - QSO"])
cmros

Unnamed: 0,Predicted - Galaxy,Predicted - QSO
Actual - Galaxy,4730,10
Actual - QSO,0,5399


### Random Undersampling

In [16]:
# Resample the training data with the RandomUnderSampler
rus = RandomUnderSampler(random_state=1)
X_resampled1, y_resampled1 = rus.fit_resample(X_train, y_train)
Counter(y_resampled1)

Counter({'QSO': 14221, 'STAR': 14221})

### Support Vector Machine (Star vs. QSO) - Undersampling

In [17]:
model2 = SVC(kernel='linear')
model2.fit(X_resampled1, y_resampled1)

SVC(kernel='linear')

In [18]:
# Calculated the balanced accuracy score
y_pred1 = model2.predict(X_test)
balanced_accuracy_score(y_test, y_pred1)

0.9988396624472573

In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test,y_pred1))

                   pre       rec       spe        f1       geo       iba       sup

        QSO       1.00      1.00      1.00      1.00      1.00      1.00      4740
       STAR       1.00      1.00      1.00      1.00      1.00      1.00      5399

avg / total       1.00      1.00      1.00      1.00      1.00      1.00     10139



In [20]:
# Display the confusion matrix
c= confusion_matrix(y_test,y_pred1)
cmrus = pd.DataFrame(c, index=["Actual - Galaxy", "Actual - QSO"], columns=["Predicted - Galaxy", "Predicted - QSO"])
cmrus

Unnamed: 0,Predicted - Galaxy,Predicted - QSO
Actual - Galaxy,4729,11
Actual - QSO,0,5399


In [21]:
# Write new df to new SQLite table -
new_data_df.to_sql("sq_svm", conn, if_exists="replace")
conn.close()

In [22]:
import pandas as pd
from config import password
from sqlalchemy import create_engine
engine = create_engine("postgresql://postgres:" + password + "@localhost:5432/Stellar_Classification")

In [23]:
# Create table with results in postgres 

new_data_df.to_sql("sq_svm", engine, if_exists="replace")