In [1]:
import pandas as pd
import numpy as np
from astropy.table import Table
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from umap import UMAP
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.cluster import HDBSCAN
import matplotlib.cm as cm
import time
import warnings
import pickle
from sklearn.metrics import ConfusionMatrixDisplay
warnings.filterwarnings('ignore')




## `Helper Function`

In [3]:
def calculate_pairwise_differences_any(data):
    """Return ``data`` extended with the difference of every pair of its columns.

    For each pair of column positions ``i < j`` a column named
    ``f"{data.columns[i]}{data.columns[j]}"`` is appended, holding
    ``data[col_i] - data[col_j]``. The original columns come first, followed
    by the difference columns in ``(i, j)`` order.

    The input frame is NOT modified: a new DataFrame is returned. This keeps
    the function safe to call repeatedly on the same frame (e.g. when a
    notebook cell is re-run) and avoids inserting columns one at a time.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns support subtraction.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus one column per pair.
    """
    columns = list(data.columns)

    # A dict keeps the first-seen position when two pairs concatenate to the
    # same name and lets the later pair overwrite the values.
    # NOTE(review): assumes a concatenated name never equals an existing
    # column name -- confirm for the input table.
    differences = {}
    for i, left in enumerate(columns):
        for right in columns[i + 1:]:
            differences[f"{left}{right}"] = data[left] - data[right]

    pieces = [data]
    pieces.extend(series.rename(name) for name, series in differences.items())

    # Single concat instead of repeated column insertion.
    return pd.concat(pieces, axis=1)

## `Data`

In [4]:
# Read the FITS table into pandas and discard its last four columns.
filename = 'FP18_data.fit'
df = Table.read(filename).to_pandas().iloc[:, :-4]

# Keep the first five columns as they are; every remaining column is expanded
# with all of its pairwise differences.
df = pd.concat(
    [
        df.iloc[:, :5],
        calculate_pairwise_differences_any(df.iloc[:, 5:]),
    ],
    axis=1,
)

# Standardise everything from column 25 onward.
# NOTE(review): presumably these are exactly the derived difference columns
# (5 leading + 20 source columns) -- confirm against the table layout.
st = StandardScaler()
result1 = st.fit_transform(df.iloc[:, 25:])
standard_df = pd.DataFrame(result1, columns=df.columns[25:])

## `UMAP+HDBSCAN Method`

In [5]:
# UMAP is stochastic: without a fixed seed every run produces a different
# embedding, and therefore potentially different HDBSCAN cluster ids and sizes.
# Fixing random_state makes this cell reproducible (umap-learn runs
# single-threaded when a seed is set, so expect a slower fit).
umap = UMAP(n_components=10, n_neighbors=200, random_state=42)
model = umap.fit_transform(standard_df)

# Density-based clustering on the 10-component embedding.
# Cluster ids are arbitrary integers: re-check any code that hard-codes them
# whenever the seed or the parameters above change.
hdb = HDBSCAN(min_cluster_size=3000, min_samples=500)
clusters = hdb.fit_predict(model)

In [6]:
# Tally how many objects received each label (-1 is noise)
unique_labels, counts = np.unique(clusters, return_counts=True)

# Boolean masks: noise points versus objects assigned to a cluster
noise_mask = np.equal(clusters, -1)
cluster_mask = np.logical_not(noise_mask)

# Overall totals
print(f"Total objects: {len(clusters)}")
print(f"Noise points: {np.sum(noise_mask)} ({np.mean(noise_mask)*100:.1f}%)")
print(f"Clustered objects: {np.sum(cluster_mask)} ({np.mean(cluster_mask)*100:.1f}%)")

# Per-label breakdown; the noise label is reported without a percentage
for label, count in zip(unique_labels, counts):
    if label != -1:
        print(f"Cluster {label}: {count} objects ({count/len(clusters)*100:.1f}%)")
        continue
    print(f"Noise: {count} objects")

Total objects: 48686
Noise points: 0 (0.0%)
Clustered objects: 48686 (100.0%)
Cluster 0: 7650 objects (15.7%)
Cluster 1: 9770 objects (20.1%)
Cluster 2: 3704 objects (7.6%)
Cluster 3: 27562 objects (56.6%)


In [8]:
# Using Spectroscopy data for identifying classes
# HDBSCAN cluster ids are arbitrary and can be numbered differently from one
# run to the next, so the cluster -> class mapping is derived from the data
# instead of hard-coding ids: each cluster takes the most frequent 'Hclass'
# among its members (ties resolve to the smallest class value).
cluster_to_class = (
    df['Hclass']
    .groupby(clusters)
    .agg(lambda labels: labels.mode().iloc[0])
    .to_dict()
)
# Noise points (-1) are not a class; keep them labelled as -1.
cluster_to_class[-1] = -1
df['hdb_label'] = pd.Series(clusters, index=df.index).map(cluster_to_class)

In [9]:
# Compare the cluster-derived labels against the reference 'Hclass' column,
# reporting precision/recall/F1 to four decimals.
print(
    classification_report(
        y_true=df['Hclass'],
        y_pred=df['hdb_label'],
        digits=4,
    )
)

              precision    recall  f1-score   support

           0     0.9965    0.9860    0.9912      7731
           1     0.9795    0.9946    0.9870     36763
           3     0.9506    0.8399    0.8918      4192

    accuracy                         0.9799     48686
   macro avg     0.9755    0.9402    0.9567     48686
weighted avg     0.9797    0.9799    0.9795     48686

