In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler

data = pd.read_csv('datasets\skyserver_sdss_dr16_100k.csv', skiprows=1)
data.head()

# Drop columns with technical IDs and dates
data = data.drop(columns=['objid','run','rerun','camcol','field','plate','fiberid','mjd'])
data.head()

data.describe()

Unnamed: 0,ra,dec,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,redshift
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,192.303455,12.615915,16.612144,15.273162,15.593304,14.891084,13.948664,0.394117
std,87.481527,19.466995,230.646953,230.616205,202.830339,207.704427,221.707107,0.589979
min,0.003671,-11.250991,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-0.010875
25%,150.616732,-0.611347,20.662007,19.237815,18.482508,18.092835,17.753033,0.060255
50%,192.91017,1.181271,21.712175,20.19815,19.41591,19.028715,18.730685,0.15116
75%,240.48878,23.798464,23.454183,22.343803,21.109413,20.34466,19.90434,0.536983
max,359.99817,75.508784,28.93543,30.07198,26.01435,25.42449,24.32077,7.011245


In [2]:
# Remove outliers from psfMag_? columns
for c in ['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z']: 
    lower = data[c].quantile(0.01)
    upper = data[c].quantile(0.99)

    data = data[(data[c] < upper) & (data[c] > lower)]

data.describe()

# number of records per class is equal
records = len(data[data['class'] == 'QSO'])
print(records)

data = data.groupby('class').sample(n=records)
data.describe()


9663


Unnamed: 0,ra,dec,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,redshift
count,28989.0,28989.0,28989.0,28989.0,28989.0,28989.0,28989.0,28989.0
mean,192.829143,14.259254,21.410581,20.25366,19.61997,19.256534,19.000142,0.641399
std,84.618686,19.685402,1.849498,1.648418,1.497768,1.40919,1.381569,0.920451
min,0.0053,-11.250991,17.31125,16.60714,16.47349,16.42101,16.36376,-0.010875
25%,149.6984,-0.327445,19.98256,19.04078,18.49875,18.18317,17.91581,0.000122
50%,192.84388,5.425955,21.2482,20.00952,19.41673,19.09884,18.90336,0.156842
75%,240.5452,29.299366,22.7978,21.48546,20.84328,20.35103,20.02918,0.924094
max,359.99328,75.508784,25.73573,24.37864,23.35278,22.49473,22.26959,7.011245


In [3]:
#training the model

#declare variables for training
x_data = data.drop(columns=['class'])
y_data = data['class']

#transforming class names to numerical values
le = preprocessing.LabelEncoder()
y_data=le.fit_transform(y_data)

#standardizing the feature values
ss = preprocessing.StandardScaler()
x_data = ss.fit_transform(data.drop(columns='class'))
print(x_data)

#splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

#creating the model
model = LogisticRegression(multi_class='ovr', solver='liblinear').fit(x_train, y_train)

#evauling the model
accuracy = model.score(x_test, y_test)
print('Model accuracy:', accuracy)

#getting the first 15 rows of the dataset
x_test_new = x_test[:15]

y_hat = model.predict(x_test_new)

print(y_hat)

y_hat_new =le.inverse_transform(y_hat)

print(y_hat_new)

[[-2.20649024 -0.89767086  0.98741412 ...  0.28690451  0.06734315
  -0.3068381 ]
 [-0.68325842  0.31133401 -0.09015105 ... -0.1331383  -0.20535878
  -0.50145649]
 [-0.39792988 -0.04172071 -0.85409595 ... -0.55835437 -0.49494036
  -0.58141139]
 ...
 [ 1.88136782 -0.72367284  1.27074998 ... -0.07880829 -0.51030721
  -0.6969105 ]
 [-0.8267519  -1.1282803  -0.74189599 ... -1.1955682  -1.11643293
  -0.69653105]
 [ 0.03927637 -0.68233458 -1.57853755 ... -1.49554707 -1.32500376
  -0.69641348]]
Model accuracy: 0.9043348281016442
[2 1 1 1 0 2 0 2 2 2 1 0 0 1 0]
['STAR' 'QSO' 'QSO' 'QSO' 'GALAXY' 'STAR' 'GALAXY' 'STAR' 'STAR' 'STAR'
 'QSO' 'GALAXY' 'GALAXY' 'QSO' 'GALAXY']
