In [2]:
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier

from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from sqlalchemy import extract

In [5]:
columns = ["obj_ID",'alpha','delta', 'u', 'g', 'r', 'i', 'z', 'run_ID',
       'rerun_ID', 'cam_col', 'field_ID', 'spec_obj_ID', 'redshift',
       'plate', 'MJD', 'fiber_ID']
target = ["class"]

In [None]:
# Create engine
engine = create_engine("sqlite:///

In [None]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)
# Save references to each table
Measurement = Base.classes.measurement
Station = Base.classes.station

In [None]:
# Create our session (link) from Python to the DB
session = Session(engine)

In [8]:
# Load the data
file_path = Path('sample_data.csv', index=False)
df = pd.read_csv(file_path)
# df = df.loc[:, columns].copy()
df

Unnamed: 0.1,Unnamed: 0,obj_id,alpha,delta,u,g,r,i,z,run_id,rerun_id,cam_col,field_id,spec_obj_id,class,redshift,plate,mjd,fiber_id
0,0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,1.237666e+18,348.335815,25.685581,23.93912,22.92743,21.78178,20.69582,19.83773,4822,301,3,437,8.675069e+18,GALAXY,0.686254,7705,57332,35
96,96,1.237671e+18,42.048515,32.466019,21.80150,18.61152,17.18664,16.54847,16.14959,5934,301,4,155,3.614186e+18,STAR,0.000037,3210,54876,173
97,97,1.237662e+18,169.417236,7.719850,25.08139,22.03635,21.83119,21.71758,21.24758,3841,301,5,31,6.045073e+18,STAR,0.000266,5369,56272,422
98,98,1.237662e+18,243.498908,4.309792,21.23949,20.39086,20.35097,20.25421,19.90627,3910,301,4,222,5.412459e+18,QSO,2.290692,4807,55687,940


In [9]:
# Create our features
X = df.drop('class', axis=1)

# Create our target
y = df['class']

In [10]:
X.describe()

Unnamed: 0.1,Unnamed: 0,obj_id,alpha,delta,u,g,r,i,z,run_id,rerun_id,cam_col,field_id,spec_obj_id,redshift,plate,mjd,fiber_id
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,49.5,1.237666e+18,190.916135,22.497345,22.867176,21.199389,20.084953,19.42221,18.998426,4864.34,301.0,3.55,172.13,6.113754e+18,0.575204,5429.99,55770.3,466.61
std,29.011492,8299307000000.0,104.182062,16.340209,1.873193,1.561941,1.589658,1.500648,1.535673,1932.349474,0.0,1.250252,115.109053,3.229987e+18,0.597542,2868.790948,1716.179763,277.20954
min,0.0,1.237652e+18,1.494389,-8.043402,18.75124,17.35669,16.49747,15.97711,15.54461,1412.0,301.0,1.0,12.0,4.234019e+17,-0.000608,376.0,51929.0,10.0
25%,24.75,1.237661e+18,132.8994,7.680188,21.71978,20.371858,19.008765,18.507205,17.992513,3688.5,301.0,3.0,91.25,4.299382e+18,0.151202,3818.5,55498.75,222.25
50%,49.5,1.237663e+18,180.671973,22.094394,22.78161,21.48681,20.32061,19.489825,18.96661,4059.5,301.0,4.0,152.0,5.652657e+18,0.482437,5020.5,55893.5,426.0
75%,74.25,1.237674e+18,252.680626,32.785846,24.407297,22.319085,21.105117,20.323055,19.850908,6573.25,301.0,4.0,223.5,8.237289e+18,0.686348,7316.0,56710.0,742.0
max,99.0,1.237681e+18,357.151519,67.74745,27.0971,24.89395,24.8026,23.17828,22.82647,8157.0,301.0,6.0,518.0,1.278691e+19,2.689002,11357.0,58522.0,997.0


In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'GALAXY': 51, 'STAR': 51, 'QSO': 51})

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [18]:
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5925925925925926