## Supervised Machine Learning - Logistic Regression Algorithm (Star vs. QSO)

### Dependencies:

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import extract

In [2]:
# Names of every column in the raw table except the label column.
columns = [
    "obj_ID", "alpha", "delta",
    "u", "g", "r", "i", "z",
    "run_ID", "rerun_ID", "cam_col", "field_ID",
    "spec_obj_ID", "redshift", "plate", "MJD", "fiber_ID",
]
# Name of the label column, kept as a one-element list.
target = ["class"]

In [3]:
# NOTE(review): disabled SQLite loading path. The connection string below is
# incomplete (no database file after "sqlite:///"); the data is read from a CSV
# file in a later cell instead.
# # Create engine
# engine = create_engine("sqlite:///

In [4]:
# NOTE(review): disabled automap reflection. The Measurement / Station table
# names do not correspond to any column or file used in this notebook and look
# like leftovers from another project -- confirm before re-enabling.
# # reflect an existing database into a new model
# Base = automap_base()
# # reflect the tables
# Base.prepare(engine, reflect=True)
# # Save references to each table
# Measurement = Base.classes.measurement
# Station = Base.classes.station

In [5]:
# NOTE(review): disabled. This would open a SQLAlchemy session on `engine`,
# which is never created while the engine cell stays commented out.
# # Create our session (link) from Python to the DB
# session = Session(engine)

In [6]:
# Load the star/QSO table from the CSV file under Resources/.
# Path() only takes path segments. The former `index=False` keyword had no
# effect on older Python releases, is deprecated since Python 3.12 and is
# scheduled for removal in 3.14, so it is no longer passed.
file_path = Path('Resources/star_qso.csv')
data_df = pd.read_csv(file_path)
data_df

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237680e+18,340.995120,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,5.658980e+18,QSO,1.424659,5026,55855,741
1,1.237680e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.250000e+19,QSO,0.586455,11069,58456,113
2,1.237670e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,2.751760e+18,STAR,-0.000008,2444,54082,232
3,1.237680e+18,345.801874,32.672868,23.17274,20.14496,19.41948,19.22034,18.89359,8157,301,2,38,7.323010e+18,STAR,0.000072,6504,56540,574
4,1.237680e+18,353.201522,3.080796,24.54890,21.44267,20.95315,20.79360,20.48442,7712,301,5,284,4.822280e+18,STAR,-0.000429,4283,55864,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26221,1.237680e+18,353.951560,19.899651,21.61376,21.72610,21.39746,21.15175,21.44469,8096,301,4,190,8.559290e+18,QSO,1.954003,7602,56954,716
26222,1.237660e+18,128.395984,25.467238,19.77835,19.37797,19.28145,19.12017,19.05171,4335,301,5,123,2.170890e+18,QSO,0.465272,1928,53327,562
26223,1.237660e+18,247.224915,38.037879,21.81185,21.44972,21.19963,21.36909,20.80581,3225,301,2,246,1.207990e+19,QSO,0.943422,10729,58248,324
26224,1.237660e+18,136.329001,6.426314,18.96920,18.76108,18.46569,18.48491,18.35552,3031,301,3,109,1.342220e+18,QSO,1.029967,1192,52649,530


In [7]:
# Promote spec_obj_ID from a regular column to the row index. This mutates
# data_df in place, so re-running the cell without reloading the CSV raises
# KeyError because the column no longer exists.
data_df.set_index(keys=["spec_obj_ID"], inplace=True)

In [8]:
# Display the frame again to check the result of the set_index call above.
data_df

Unnamed: 0_level_0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,class,redshift,plate,MJD,fiber_ID
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5.658980e+18,1.237680e+18,340.995120,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,QSO,1.424659,5026,55855,741
1.250000e+19,1.237680e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,QSO,0.586455,11069,58456,113
2.751760e+18,1.237670e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,STAR,-0.000008,2444,54082,232
7.323010e+18,1.237680e+18,345.801874,32.672868,23.17274,20.14496,19.41948,19.22034,18.89359,8157,301,2,38,STAR,0.000072,6504,56540,574
4.822280e+18,1.237680e+18,353.201522,3.080796,24.54890,21.44267,20.95315,20.79360,20.48442,7712,301,5,284,STAR,-0.000429,4283,55864,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8.559290e+18,1.237680e+18,353.951560,19.899651,21.61376,21.72610,21.39746,21.15175,21.44469,8096,301,4,190,QSO,1.954003,7602,56954,716
2.170890e+18,1.237660e+18,128.395984,25.467238,19.77835,19.37797,19.28145,19.12017,19.05171,4335,301,5,123,QSO,0.465272,1928,53327,562
1.207990e+19,1.237660e+18,247.224915,38.037879,21.81185,21.44972,21.19963,21.36909,20.80581,3225,301,2,246,QSO,0.943422,10729,58248,324
1.342220e+18,1.237660e+18,136.329001,6.426314,18.96920,18.76108,18.46569,18.48491,18.35552,3031,301,3,109,QSO,1.029967,1192,52649,530


In [9]:
 # Drop the identifier / position / bookkeeping columns, leaving
 # u, g, r, i, z, class and redshift. drop() returns a new frame, so data_df
 # itself is left unchanged.
 new_data_df = data_df.drop(
     columns=[
         "obj_ID", "alpha", "delta",
         "run_ID", "rerun_ID", "cam_col", "field_ID",
         "plate", "MJD", "fiber_ID",
     ]
 )

In [10]:
# Inspect the reduced frame (the columns remaining after the drop above).
new_data_df

Unnamed: 0_level_0,u,g,r,i,z,class,redshift
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5.658980e+18,23.48827,23.33776,21.32195,20.25615,19.54544,QSO,1.424659
1.250000e+19,21.46973,21.17624,20.92829,20.60826,20.42573,QSO,0.586455
2.751760e+18,21.74669,20.03493,19.17553,18.81823,18.65422,STAR,-0.000008
7.323010e+18,23.17274,20.14496,19.41948,19.22034,18.89359,STAR,0.000072
4.822280e+18,24.54890,21.44267,20.95315,20.79360,20.48442,STAR,-0.000429
...,...,...,...,...,...,...,...
8.559290e+18,21.61376,21.72610,21.39746,21.15175,21.44469,QSO,1.954003
2.170890e+18,19.77835,19.37797,19.28145,19.12017,19.05171,QSO,0.465272
1.207990e+19,21.81185,21.44972,21.19963,21.36909,20.80581,QSO,0.943422
1.342220e+18,18.96920,18.76108,18.46569,18.48491,18.35552,QSO,1.029967


### Variable engineering

In [11]:
# Bucket each of the u, g, r, i, z columns into three ordinal bins labelled
# 1-3 and store the result as binned_<band>. pd.cut uses right-closed
# intervals and yields NaN for any value outside the outermost edges.
frequency = [1, 2, 3]

# NOTE(review): in the saved sample output every binned_u / binned_g / binned_z
# value is 3, i.e. these edges put (almost) all rows in the top bin. The very
# low first edge presumably exists to cover a sentinel value -- TODO confirm
# and consider edges that actually split the data.
u_size_bins = [-10000, -6600, -3310, 33]
g_size_bins = [-10000, -6600, -3310, 33]
r_size_bins = [9, 16, 23, 30]
i_size_bins = [9, 17, 25, 33]
z_size_bins = [-10000, -6600, -3310, 30]

# One loop instead of five copy-pasted statements; the tuple order fixes the
# order in which the new columns are appended (u, g, r, i, z).
for band, edges in (
    ("u", u_size_bins),
    ("g", g_size_bins),
    ("r", r_size_bins),
    ("i", i_size_bins),
    ("z", z_size_bins),
):
    new_data_df[f"binned_{band}"] = pd.cut(new_data_df[band], edges, labels=frequency)

# Seeded like every other random step in this notebook so the preview is
# reproducible across runs.
new_data_df.sample(30, random_state=1)

Unnamed: 0_level_0,u,g,r,i,z,class,redshift,binned_u,binned_g,binned_r,binned_i,binned_z
spec_obj_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3.5197e+18,20.33523,19.36324,19.06688,18.98289,18.90323,STAR,-0.000303,3,3,2,2,3
8.55929e+18,21.61376,21.7261,21.39746,21.15175,21.44469,QSO,1.954003,3,3,2,2,3
3.66491e+18,19.22676,17.92466,18.16374,18.35194,18.46638,STAR,0.000472,3,3,2,2,3
9.21778e+18,20.43789,20.81296,20.40849,20.42461,20.92204,QSO,1.158579,3,3,2,2,3
2.53901e+18,17.75926,16.70221,16.27975,16.08988,16.00259,STAR,-0.000169,3,3,2,1,3
8.02783e+17,20.80119,20.42105,19.94262,19.89781,19.88374,QSO,1.139278,3,3,2,2,3
3.59729e+18,19.34194,18.32243,18.03406,17.95591,17.91968,STAR,0.000142,3,3,2,2,3
7.31877e+17,17.54492,16.79643,16.25783,15.81848,15.59751,QSO,0.10282,3,3,2,1,3
2.59196e+18,20.87849,19.1178,18.29162,17.97499,17.77678,STAR,-3.7e-05,3,3,2,2,3
1.27006e+18,21.46335,19.21055,18.07038,17.59661,17.29424,STAR,-9e-06,3,3,2,2,3


### Test and Train Data

In [25]:
# Features: every column except the label.
X = new_data_df.drop(columns="class")

# Target: the label column on its own.
y = new_data_df["class"]

# Report both shapes, then the per-class row counts.
print(X.shape)
print(y.shape)
Counter(y)

(26226, 11)
(26226,)


Counter({'QSO': 12137, 'STAR': 14089})

In [13]:
# Hold out a quarter of the rows for testing (scikit-learn's default
# proportion, spelled out here); the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
Counter(y_train)

Counter({'STAR': 10599, 'QSO': 9070})

### Random Oversampling

In [14]:
# Balance the training classes with RandomOverSampler ("auto" is the library
# default strategy, written out explicitly). Only the training split is
# resampled; the test split is left as-is.
ros = RandomOverSampler(sampling_strategy="auto", random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'STAR': 10599, 'QSO': 10599})

### Logistic Regression (Star vs. QSO) - Oversampling

In [15]:
# Train the unpenalised Logistic Regression model on the oversampled data.
# NOTE(review): the string 'none' was deprecated in scikit-learn 1.2 and
# removed in 1.4 in favour of penalty=None. It is kept here because the saved
# output comes from a release that only accepts the string -- switch to None
# when upgrading.
model = LogisticRegression(
    penalty='none',
    solver='lbfgs',
    max_iter=10000,
    random_state=1,
)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=10000, penalty='none', random_state=1)

In [16]:
# Predict the held-out test rows and compute the balanced accuracy score.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9998369742419302

In [17]:
# Per-class report for the oversampled model; print() keeps the plain-text
# table layout of the returned string.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        QSO       1.00      1.00      1.00      1.00      1.00      1.00      3067
       STAR       1.00      1.00      1.00      1.00      1.00      1.00      3490

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      6557



In [28]:
# Display the confusion matrix for the oversampled model.
# The label order is pinned explicitly so the hand-written captions below
# always line up with the matrix (rows = actual class, columns = predicted
# class) instead of relying on the implicit sorted order of the labels.
c = confusion_matrix(y_test, y_pred, labels=["QSO", "STAR"])
cmros = pd.DataFrame(
    c,
    index=["Actual - QSO", "Actual - Star"],
    columns=["Predicted - QSO", "Predicted - Star"],
)
cmros

Unnamed: 0,Predicted - QSO,Predicted - Star
Actual - QSO,3066,1
Actual - Star,0,3490


### Random Undersampling

In [19]:
# Balance the training classes with RandomUnderSampler ("auto" is the library
# default strategy, written out explicitly). Only the training split is
# resampled; the test split is left as-is.
rus = RandomUnderSampler(sampling_strategy="auto", random_state=1)
X_resampled1, y_resampled1 = rus.fit_resample(X_train, y_train)
Counter(y_resampled1)

Counter({'QSO': 9070, 'STAR': 9070})

### Logistic Regression (Star vs. QSO) - Undersampling

In [20]:
# Train a second unpenalised Logistic Regression model, this time on the
# undersampled training data.
# NOTE(review): the string 'none' was deprecated in scikit-learn 1.2 and
# removed in 1.4 in favour of penalty=None. It is kept here because the saved
# output comes from a release that only accepts the string -- switch to None
# when upgrading.
model2 = LogisticRegression(
    penalty='none',
    solver='lbfgs',
    max_iter=10000,
    random_state=1,
)
model2.fit(X_resampled1, y_resampled1)

LogisticRegression(max_iter=10000, penalty='none', random_state=1)

In [21]:
# Predict the held-out test rows with the undersampled model and compute the
# balanced accuracy score.
y_pred1 = model2.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred1)

0.9998369742419302

In [22]:
# Per-class report for the undersampled model; print() keeps the plain-text
# table layout of the returned string.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred1))

                   pre       rec       spe        f1       geo       iba       sup

        QSO       1.00      1.00      1.00      1.00      1.00      1.00      3067
       STAR       1.00      1.00      1.00      1.00      1.00      1.00      3490

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      6557



In [27]:
# Display the confusion matrix for the undersampled model.
# The label order is pinned explicitly so the hand-written captions below
# always line up with the matrix (rows = actual class, columns = predicted
# class) instead of relying on the implicit sorted order of the labels.
c = confusion_matrix(y_test, y_pred1, labels=["QSO", "STAR"])
cmrus = pd.DataFrame(
    c,
    index=["Actual - QSO", "Actual - Star"],
    columns=["Predicted - QSO", "Predicted - Star"],
)
cmrus

Unnamed: 0,Predicted - QSO,Predicted - Star
Actual - QSO,3066,1
Actual - Star,0,3490
