**ALL ABOUT THE DATASET:**

The “Stellar Classification Dataset — SDSS17” by FEDESORIANO is made from the “Sloan Digital
Sky Survey” project and contains spectroscopic observations of celestial objects in the night sky.
The dataset consists of 100,000 observations, each described by 17 feature columns and 1 class
column that identifies it as a star, galaxy, or quasar.
Quasars, short for “quasi-stellar radio sources,” are incredibly bright and distant astronomical
objects found at the centers of galaxies. Quasars are not stars but the active cores of distant
galaxies powered by supermassive black holes.

**Data Dictionary**

obj_ID = Object Identifier, the unique value that identifies the object in the image catalog used by the CAS ( Target variable)

alpha = Right Ascension angle (at J2000 epoch)

delta = Declination angle (at J2000 epoch)

u = Ultraviolet filter in the photometric system

g = Green filter in the photometric system

r = Red filter in the photometric system

i = Near Infrared filter in the photometric system

z = Infrared filter in the photometric system

run_ID = Run Number used to identify the specific scan

rereun_ID = Rerun Number to specify how the image was processed

cam_col = Camera column to identify the scanline within the run

field_ID = Field number to identify each field

spec_obj_ID = Unique ID used for optical spectroscopic objects (this means that 2 different observations with the same spec_obj_ID must share the output class)

class = object class (galaxy, star or quasar object) – target variable – 3 class classifier

redshift = redshift value based on the increase in wavelength

plate = plate ID, identifies each plate in SDSS

MJD = Modified Julian Date, used to indicate when a given piece of SDSS data was taken

fiber_ID = fiber ID that identifies the fiber that pointed the light at the focal plane in each observation

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from matplotlib import pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import plotly.figure_factory as ff
from sklearn.preprocessing import label_binarize

In [None]:
!pip install summarytools

Collecting summarytools
  Downloading summarytools-0.2.3.tar.gz (11 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython>=7.20.0->summarytools)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: summarytools
  Building wheel for summarytools (pyproject.toml) ... [?25l[?25hdone
  Created wheel for summarytools: filename=summarytools-0.2.3-py3-none-any.whl size=8572 sha256=d38894fd531e8dc44f6b8aa2d22e7e4508766436504889a742a0e050aa3341d1
  Stored in directory: /root/.cache/pip/wheels/d5/8a/5d/008e2127a7f9dd64d066f16e5ea268fa52823dc929c982d6f1
Successfully built summarytools
Installing collected packages: jedi, summarytools
Successfully installed jedi-0.19.1 summarytools-0

In [None]:
from summarytools import dfSummary

In [None]:
from google.colab import drive
drive.mount('/content/drive')#Mount the drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/My Drive/Advanced Applied machine learning/star_classification.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

Exploratory Data Analysis (EDA)

In [None]:
df.sample(8)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
41875,1.237659e+18,239.567384,39.402618,24.63467,20.72572,19.03342,18.33362,18.20663,3180,301,3,211,5.845886e+18,GALAXY,0.226434,5192,56066,778
95209,1.237661e+18,183.891225,46.268753,19.50289,18.49177,18.19354,18.0725,18.00694,3698,301,1,164,8.359954e+18,STAR,0.00034,7425,56777,534
98722,1.237661e+18,145.450266,39.118061,20.8966,19.96199,19.89736,19.89189,19.99044,3530,301,1,189,3.630054e+18,STAR,0.000629,3224,54849,557
56514,1.237658e+18,135.05035,4.275687,23.76919,23.75406,21.24608,20.0053,19.56291,3015,301,2,140,4.294446e+18,GALAXY,0.571624,3814,55535,961
8688,1.237659e+18,208.82363,51.942133,22.99855,21.18974,20.97836,20.93164,20.58851,3180,301,2,47,7.58884e+18,STAR,0.00035,6740,56401,998
95224,1.237679e+18,345.298478,22.973691,21.82521,20.35423,20.2457,20.15662,19.60457,7708,301,2,115,7.421013e+18,QSO,2.37508,6591,56535,752
46816,1.237679e+18,25.524589,11.618876,21.10785,21.15398,20.814,20.74949,20.78499,7773,301,3,477,1.245362e+19,QSO,1.223312,11061,58428,155
33466,1.237662e+18,168.682439,42.223693,19.8504,19.68569,19.37961,19.36574,19.36136,3840,301,6,95,9.420612e+18,QSO,1.17088,8367,57429,755


In [None]:
# Check for missing values
print(df.isnull().sum())
# Basic statistics of the dataset
print(df.describe())
display(df)

obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64
             obj_ID          alpha          delta              u  \
count  1.000000e+05  100000.000000  100000.000000  100000.000000   
mean   1.237665e+18     177.629117      24.135305      21.980468   
std    8.438560e+12      96.502241      19.644665      31.769291   
min    1.237646e+18       0.005528     -18.785328   -9999.000000   
25%    1.237659e+18     127.518222       5.146771      20.352353   
50%    1.237663e+18     180.900700      23.645922      22.179135   
75%    1.237668e+18     233.895005      39.901550      23.687440   
max    1.237681e+18     359.999810      83.000519      32.781390   

                   g              r              i              z  \

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [None]:
dfSummary(df)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,obj_ID [float64],Mean (sd) : 1237664721814903296.0 (8438559894562.6) min < med < max: 1237645942904389888.0 < 1237663463144292864.0 < 1237680531356386304.0 IQR (CV) : 9189091327744.0 (146667.8),"78,053 distinct values",,0 (0.0%)
2,alpha [float64],Mean (sd) : 177.6 (96.5) min < med < max: 0.0 < 180.9 < 360.0 IQR (CV) : 106.4 (1.8),"99,999 distinct values",,0 (0.0%)
3,delta [float64],Mean (sd) : 24.1 (19.6) min < med < max: -18.8 < 23.6 < 83.0 IQR (CV) : 34.8 (1.2),"99,999 distinct values",,0 (0.0%)
4,u [float64],Mean (sd) : 22.0 (31.8) min < med < max: -9999.0 < 22.2 < 32.8 IQR (CV) : 3.3 (0.7),"93,748 distinct values",,0 (0.0%)
5,g [float64],Mean (sd) : 20.5 (31.8) min < med < max: -9999.0 < 21.1 < 31.6 IQR (CV) : 3.2 (0.6),"92,651 distinct values",,0 (0.0%)
6,r [float64],Mean (sd) : 19.6 (1.9) min < med < max: 9.8 < 20.1 < 29.6 IQR (CV) : 2.9 (10.6),"91,901 distinct values",,0 (0.0%)
7,i [float64],Mean (sd) : 19.1 (1.8) min < med < max: 9.5 < 19.4 < 32.1 IQR (CV) : 2.7 (10.9),"92,019 distinct values",,0 (0.0%)
8,z [float64],Mean (sd) : 18.7 (31.7) min < med < max: -9999.0 < 19.0 < 29.4 IQR (CV) : 2.5 (0.6),"92,007 distinct values",,0 (0.0%)
9,run_ID [int64],Mean (sd) : 4481.4 (1964.8) min < med < max: 109.0 < 4188.0 < 8162.0 IQR (CV) : 2139.0 (2.3),430 distinct values,,0 (0.0%)
10,rerun_ID [int64],1. 301,"100,000 (100.0%)",,0 (0.0%)


Use px.histogram to visualize the distribution of your target variable 'class'. This gives you an idea of the balance between different classes.

In [None]:
class_distribution = px.histogram(df, x='class', color='class', title='Class Distribution')
class_distribution.show()

To understand the relationship between different features, you can create a correlation heatmap using Plotly.

In [None]:
import plotly.graph_objects as go

corr_matrix = df.corr()
fig = go.Figure(data=go.Heatmap(
                   z=corr_matrix,
                   x=corr_matrix.columns,
                   y=corr_matrix.columns,
                   hoverongaps=False,
                   colorscale='blues'))
fig.update_layout(title='Feature Correlation Matrix', xaxis_title="Features", yaxis_title="Features")
fig.show()

Visualize the distribution of individual features using histograms. This can help identify outliers, skewness, etc.

In [None]:
feature_example1 = px.histogram(df, x=df.columns[1], title=f'Distribution of {df.columns[1]}')
feature_example1.show()


In [None]:
feature_example2 = px.histogram(df, x=df.columns[2], title=f'Distribution of {df.columns[2]}')
feature_example2.show()

In [None]:
# mapping from class names to numerical values
class_mapping = {'GALAXY': 0, 'QSO': 1, 'STAR': 2}
# changing the class names to numerical values in the 'class' column
df['class'] = df['class'].replace(class_mapping)

In [None]:

# For this dataset, 'class' is the target and already encoded

# Splitting the dataset into features and target variable
X = df.drop(['class'], axis=1)
y = df['class']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Instantiate the XGBoost classifier for multi-class classification
model = XGBClassifier(booster='gbtree', objective='multi:softmax', num_class=3, random_state=2)

# Specify the evaluation set
eval_set = [(X_test, y_test)]

# Use a suitable evaluation metric for multi-class classification
eval_metric = 'mlogloss'  # or 'merror'
model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set)

# Make predictions for test data
y_pred = model.predict(X_test)

# Calculate the F1 score with average parameter for multi-class
f1 = f1_score(y_test, y_pred, average='weighted')  # 'weighted' accounts for label imbalance

# Print the F1 score
print("F1 Score: %.2f" % f1)

[0]	validation_0-mlogloss:0.73090
[1]	validation_0-mlogloss:0.52384
[2]	validation_0-mlogloss:0.39080
[3]	validation_0-mlogloss:0.30168
[4]	validation_0-mlogloss:0.23917
[5]	validation_0-mlogloss:0.19489
[6]	validation_0-mlogloss:0.16306
[7]	validation_0-mlogloss:0.14006
[8]	validation_0-mlogloss:0.12333
[9]	validation_0-mlogloss:0.11128
[10]	validation_0-mlogloss:0.10224
[11]	validation_0-mlogloss:0.09573
[12]	validation_0-mlogloss:0.09094
[13]	validation_0-mlogloss:0.08734
[14]	validation_0-mlogloss:0.08489
[15]	validation_0-mlogloss:0.08300
[16]	validation_0-mlogloss:0.08152
[17]	validation_0-mlogloss:0.08040
[18]	validation_0-mlogloss:0.07933
[19]	validation_0-mlogloss:0.07852
[20]	validation_0-mlogloss:0.07777
[21]	validation_0-mlogloss:0.07727
[22]	validation_0-mlogloss:0.07688
[23]	validation_0-mlogloss:0.07638
[24]	validation_0-mlogloss:0.07610
[25]	validation_0-mlogloss:0.07568
[26]	validation_0-mlogloss:0.07552
[27]	validation_0-mlogloss:0.07536
[28]	validation_0-mlogloss:0.0

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import numpy as np

def tune_and_train_xgboost_model(X_train, y_train, X_test, y_test):
    # Expanded parameter grid for XGBoost
    xgb_param_grid = {
        'n_estimators': np.arange(50, 400, 50),
        'max_depth': np.arange(3, 15),
        'learning_rate': np.linspace(0.01, 0.3, 10),
        'subsample': np.linspace(0.6, 1.0, 5),
        'min_child_weight': np.arange(1, 10, 2),
        'gamma': np.linspace(0, 0.5, 5),
        'colsample_bytree': np.linspace(0.6, 1.0, 5),
        'reg_alpha': np.linspace(0, 1, 5)
    }

    # Initialize the XGBoost classifier with additional settings to handle warnings, etc.
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    # Perform Randomized Search with the expanded grid
    xgb_random_search = RandomizedSearchCV(xgb, xgb_param_grid, n_iter=10, scoring='f1_macro', cv=5, verbose=2, random_state=42, n_jobs=-1)
    xgb_random_search.fit(X_train, y_train)

    # Extract the best model
    best_model = xgb_random_search.best_estimator_

    # Make predictions with the best model
    y_pred = best_model.predict(X_test)

    # Calculate F1 score
    f1 = f1_score(y_test, y_pred, average='macro')

    print('Best parameters:', xgb_random_search.best_params_)
    print('Best F1 Score:', f1)

    return best_model


best_model = tune_and_train_xgboost_model(X_train, y_train, X_test, y_test)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'subsample': 1.0, 'reg_alpha': 0.5, 'n_estimators': 200, 'min_child_weight': 7, 'max_depth': 12, 'learning_rate': 0.07444444444444444, 'gamma': 0.125, 'colsample_bytree': 1.0}
Best F1 Score: 0.9738236926855697


since the original function already returns the best-trained model (best_model), directly using best_model for predictions or further evaluations is the most straightforward approach.

In [None]:
# Step 1: Get the best model (already trained with the best parameters)
best_model = tune_and_train_xgboost_model(X_train, y_train, X_test, y_test)

# If you need to retrain the model or want to explicitly show the training with the best parameters, you can do the following:
# Extract the best parameters from the model (Not necessary if using the model as is)
best_params = best_model.get_params()

# Initialize a new XGBoost model with these best parameters
new_best_xgb_model = XGBClassifier(**best_params)

# Train this new model (Optional if you are using the model returned by the function)
new_best_xgb_model.fit(X_train, y_train)

# Now, `new_best_xgb_model` is your trained XGBoost model with the best parameters found


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'subsample': 1.0, 'reg_alpha': 0.5, 'n_estimators': 200, 'min_child_weight': 7, 'max_depth': 12, 'learning_rate': 0.07444444444444444, 'gamma': 0.125, 'colsample_bytree': 1.0}
Best F1 Score: 0.9738236926855697


To visualize the outputs for the best XGBoost model, including the classification report, confusion matrices for both the test and train sets, feature importance plot, and ROC curve, you would typically use a combination of Matplotlib, Seaborn, or Plotly for plotting and sklearn for model evaluation metrics. However, Plotly does not directly support text-based outputs like classification reports, so for that part, we'll focus on generating visual representations for the confusion matrix, feature importance, and ROC curve.

label_binarize is used to binarize the output labels for a multi-class ROC curve.
A ROC curve is computed for each class separately using a One-vs-Rest approach.
All ROC curves are plotted with different colors for each class.

In [None]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize
from itertools import cycle



# Making predictions
predictions_test = best_model.predict(X_test)
predictions_train = best_model.predict(X_train)

# Generating confusion matrices
confusion_matrix_test = confusion_matrix(y_test, predictions_test)
confusion_matrix_train = confusion_matrix(y_train, predictions_train)

# Displaying the classification report
print(classification_report(y_test, predictions_test))

# Plotting Confusion Matrix for the Test Set
fig_cm_test = ff.create_annotated_heatmap(z=confusion_matrix_test, colorscale='Viridis')
fig_cm_test.update_layout(title_text='Confusion Matrix for Test Set', xaxis_title="Predicted", yaxis_title="Actual")
fig_cm_test.show()

# Plotting Confusion Matrix for the Train Set
fig_cm_train = ff.create_annotated_heatmap(z=confusion_matrix_train, colorscale='Viridis')
fig_cm_train.update_layout(title_text='Confusion Matrix for Train Set', xaxis_title="Predicted", yaxis_title="Actual")
fig_cm_train.show()

# Plotting Feature Importance
feature_names = best_model.get_booster().feature_names
feature_importances = best_model.feature_importances_
fig_importances = go.Figure([go.Bar(x=feature_names, y=feature_importances)])
fig_importances.update_layout(title='Feature Importance', xaxis_title="Features", yaxis_title="Importance Score")
fig_importances.show()

y_bin = label_binarize(y_test, classes=np.unique(y_train))
n_classes = y_bin.shape[1]

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

# Predict probabilities for each class
y_score = best_model.predict_proba(X_test)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves
fig = go.Figure()

colors = cycle(['blue', 'red', 'green', 'cyan', 'magenta', 'yellow', 'black', 'pink', 'lightblue', 'orange'])

for i, color in zip(range(n_classes), colors):
    fig.add_trace(go.Scatter(x=fpr[i], y=tpr[i], mode='lines',
                             name=f'ROC curve of class {i} (area = {roc_auc[i]:.2f})',
                             line=dict(color=color)))

fig.update_layout(title='Multiclass ROC Curve',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate')

fig.show()

              precision    recall  f1-score   support

           0       0.98      0.99      0.98     11860
           1       0.97      0.93      0.95      3797
           2       0.99      1.00      0.99      4343

    accuracy                           0.98     20000
   macro avg       0.98      0.97      0.97     20000
weighted avg       0.98      0.98      0.98     20000



In [None]:
from google.colab import drive
import joblib

# Mount Google Drive
drive.mount('/content/drive')

# Specify the Google Drive path where you want to save the model
google_drive_path = '/content/drive/MyDrive/Advanced Applied machine learning/'

# Save the final XGBoost model as a pickle file in Google Drive
model_filename = 'Assignment-1.pkl'
print(f"Assignment-1: {best_model}")

joblib.dump(best_model, google_drive_path + model_filename)

print(f"Assignmen1 saved to Drive at '{google_drive_path + model_filename}'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Assignment-1: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0.125, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.07444444444444444, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=12, max_leaves=None,
              min_child_weight=7, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Assignmen1 saved to Drive at '/content/drive/MyDrive/Advanced 