# Training the XG-Boost

Loading relevant modules:

In [1]:
from astropy.io import fits
import numpy as np
import matplotlib.pyplot as plt
import os, sys
from astropy.table import Table
from astropy.io import fits
from os import listdir
import pandas as pd
from astropy import units as u
from astropy.table import Table
from itertools import chain
import re
from astropy.coordinates import SkyCoord
from astroML.crossmatch import crossmatch_angular
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

In [2]:
main_path = sys.path[0].replace('/code','')
os.chdir(main_path)

Loading the dataset:

In [3]:
alldata = Table.read(main_path+'/data/final_cat_allred.fits')
alldata = alldata.to_pandas()

In [4]:
print(alldata.describe())

                qid           RAd          DECd        z_spec  \
count  7.809017e+06  7.809018e+06  7.809018e+06  38495.000000   
mean   1.743823e+07  1.981461e+02 -7.789233e+00      0.633401   
std    9.483574e+06  1.026036e+02  1.282100e+01      0.858831   
min    9.369000e+03  1.310232e-05 -3.095775e+01     -0.001600   
25%    9.172577e+06  1.373865e+02 -1.882673e+01      0.104200   
50%    1.740215e+07  2.115137e+02 -7.945979e+00      0.162200   
75%    2.589552e+07  3.031893e+02  3.149877e+00      0.971000   
max    3.337563e+07  3.599999e+02  1.500000e+01      5.170000   

       Q1_SkyM11_mag_u  Q1_SkyM11_mag_v  Q1_SkyM11_mag_g  Q1_SkyM11_mag_r  \
count     28665.000000     27263.000000    114395.000000    139528.000000   
mean         17.561028        17.160528        17.253969        16.957958   
std           0.787743         0.818146         1.176948         1.248276   
min          12.686000        12.855000        13.327000        13.731000   
25%          16.958000       

In [5]:
alldata.objtype = alldata.objtype.str.decode("utf-8")

Labelling stars:

In [6]:

alldata.objtype[np.abs(alldata.parallax_over_error) > 3*np.std(alldata.parallax_over_error)] = 'Star'
alldata.objtype[np.abs(alldata.pm/alldata.pmra_error) > 3*np.std(alldata.pm/alldata.pmra_error)] = 'Star'
alldata.objtype[np.abs(alldata.pm/alldata.pmdec_error) > 3*np.std(alldata.pm/alldata.pmdec_error)] = 'Star'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alldata.objtype[np.abs(alldata.parallax_over_error) > 3*np.std(alldata.parallax_over_error)] = 'Star'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alldata.objtype[np.abs(alldata.pm/alldata.pmra_error) > 3*np.std(alldata.pm/alldata.pmra_error)] = 'Star'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alldata.objtype[np.abs(alldata.pm/alldata.pmdec_error) > 3*np.std(alldata.pm/alldata.pmdec_error)] = 'Star'


Check the different classes within the data:

In [7]:
print(np.unique(alldata["objtype"]))
for u, uni in enumerate(np.unique(alldata["objtype"])):
    alldata[alldata["objtype"] == uni] = u -1
print(np.unique(alldata["objtype"]))

[' ' 'BLLac' 'EmLines' 'Extended?' 'Galaxy' 'QSO' 'Star' 'Type2'
 'Uncertain']
[-1 0 1 2 3 4 5 6 7]


Split the data into unclassified and classified data:

In [8]:
unclass_data = alldata[alldata.objtype == -1]
class_data = alldata[alldata.objtype != -1]
print(len(unclass_data), len(class_data))
print(class_data.keys().to_list())

7371992 437026
['qid', 'RAd', 'DECd', 'objtype', 'z_spec', 'Q1_SkyM11_mag_u', 'Q1_SkyM11_mag_v', 'Q1_SkyM11_mag_g', 'Q1_SkyM11_mag_r', 'Q1_SkyM11_mag_i', 'Q1_SkyM11_mag_z', 'Q1_SkyM3_mag_u', 'Q1_SkyM3_mag_v', 'Q1_SkyM3_mag_g', 'Q1_SkyM3_mag_r', 'Q1_SkyM3_mag_i', 'Q1_SkyM3_mag_z', 'PanSTARRS1DR2_mag_g', 'PanSTARRS1DR2_mag_r', 'PanSTARRS1DR2_mag_i', 'PanSTARRS1DR2_mag_z', 'PanSTARRS1DR2_mag_Y', 'DES_Y3_Gold_mag_g', 'DES_Y3_Gold_mag_r', 'DES_Y3_Gold_mag_i', 'DES_Y3_Gold_mag_z', 'DES_Y3_Gold_mag_Y', 'gaia_G', 'gaia_RP', 'gaia_BP', 'wise_w1', 'wise_w2', 'wise_w3', 'wise_w4', 'wise12_w1', 'wise12_w2', 'gaia_match', 'gaia_double', 'panstarrs_match', 'panstarrs_double', 'wise_match', 'wise_double', '_id_x', 'ra_x', 'dec_x', 'ra_error', 'dec_error', 'source_id_x', 'pmra_x', 'pmdec_x', 'pm', 'pmra_error', 'pmdec_error', 'parallax', 'parallax_error', 'parallax_over_error', 'phot_bp_mean_mag', 'phot_bp_mean_flux', 'phot_bp_mean_flux_error', 'phot_rp_mean_mag', 'phot_rp_mean_flux', 'phot_rp_mean_fl

In [9]:
features = ['w1mpro','w2mpro','w3mpro',
            'gPSFMag','rPSFMag','iPSFMag','zPSFMag','yPSFMag',
            'phot_rp_mean_mag','phot_g_mean_mag','phot_bp_mean_mag']
target = 'objtype'
for f in features:
    class_data = class_data[np.abs(class_data[f]) < 50]
class_features = class_data[features].to_numpy()
class_targets = class_data[target].to_numpy()

In [10]:
rs = 27
model = xgb.XGBClassifier()
Xtrain, Xtest, ytrain, ytest = train_test_split(class_features, class_targets, 
                                                random_state=rs, test_size=0.2, train_size=0.8)
print(len(Xtest),len(ytest))
print(np.unique(ytest))

87406 87406
[0 1 2 3 4 5 6 7]


In [11]:
num_class = len(np.unique(class_data["objtype"]))
print(num_class)
param = {'max_depth': 20, 
         'objective': 'multi:softprob', 
         'eval_metric':'mlogloss',
         'num_class': num_class}

8


In [12]:
Dtrain = xgb.DMatrix(Xtrain, label=ytrain)
Dtest = xgb.DMatrix(Xtest, label=ytest)
print(np.unique(ytrain))

[0 1 2 3 4 5 6 7]


In [13]:
model = xgb.train(dtrain=Dtrain,num_boost_round=20,params=param)

In [14]:
preds = model.predict(Dtest)
best_preds = [np.argmax(line) for line in preds]

cf = confusion_matrix(ytest.tolist(), best_preds)
print(cf)

print("Precision = {}".format(precision_score(ytest.tolist(), best_preds, average='macro')))
print("Recall = {}".format(recall_score(ytest.tolist(), best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(ytest.tolist(), best_preds)))

[[   35     0     0     0     0     0     0     0]
 [    0     9     0     0     0     0     0     0]
 [    0     0 24671     0     0     0     0     0]
 [    0     0     0  3813     0     0     0     0]
 [    0     0     0     0  3473     0     0     0]
 [    0     0     0     0     0 54996     0     0]
 [    0     0     0     0     0     0   408     0]
 [    0     0     0     0     0     0     0     1]]
Precision = 1.0
Recall = 1.0
Accuracy = 1.0
