In [1]:
import os
import spender
import torch
from accelerate import Accelerator
import shutil 
from tqdm import tqdm 

# Hardware optimization
accelerator = Accelerator(mixed_precision='fp16')

# Get code, instrument, and pretrained spectrum model from the hub
sdss, model = spender.hub.load('sdss_II', map_location=accelerator.device)

# Move the model to the accelerator device (CUDA)
model.to(accelerator.device)

# Get some SDSS spectra from the ids, store locally in data_path
data_path = "./DATA"

root_dir = r"C:\Users\tkiker\Documents\GitHub\AGN-UMAP\data"

# Initialize an empty list to store the IDs
ids = []

# Walk through the directory and its subdirectories
for subdir, dirs, files in os.walk(root_dir):
    for file in files:
        # Check if the file is in the expected format
        if file.startswith("spec-") and file.endswith(".fits"):
            # Extract the plate, mjd, and fiberID from the file name
            parts = file.split('-')
            plate = int(parts[1])
            mjd = int(parts[2])
            fiberID = int(parts[3].split('.')[0])
            # Append the tuple to the list
            ids.append((plate, mjd, fiberID))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
spec, w, z, norm, zerr = sdss.make_batch(data_path, ids)

# Move tensors to the accelerator device
spec = spec.to(accelerator.device)
z = z.to(accelerator.device)

# For more fine-grained control, run spender's internal _forward method
# which return the latents s, the model for the restframe, and the observed spectrum
with torch.no_grad():
    s, spec_rest, spec_reco = model._forward(spec, instrument=sdss, z=z)

In [11]:
import numpy as np
import umap
import hdbscan
import matplotlib.pyplot as plt

# Assuming 's' is your output tensor from PyTorch
# Move the tensor to the CPU and convert it to a NumPy array
s_np = s.cpu().detach().numpy()

# Check for NaN values and remove rows with NaN values
s_np = s_np[~np.isnan(s_np).any(axis=1)]

# Step 1: Use UMAP to reduce dimensions to 2
reducer = umap.UMAP(n_components=2)
s_umap = reducer.fit_transform(s_np)

# Step 2: Use HDBSCAN to cluster the reduced dimensions
clusterer = hdbscan.HDBSCAN()
labels = clusterer.fit_predict(s_umap)

# Step 3: Plot the results with each cluster in different color
plt.figure(figsize=(10, 8))
plt.title('UMAP Dimensionality Reduction and HDBSCAN Clustering')

# Generate a scatter plot with different colors for each cluster
unique_labels = np.unique(labels)
for label in unique_labels:
    if label == -1:
        # HDBSCAN labels noise points as -1
        color = 'black'
        label_name = 'Noise'
    else:
        color = plt.cm.Spectral(float(label) / len(unique_labels))
        label_name = f'Cluster {label}'
    
    plt.scatter(s_umap[labels == label, 0], s_umap[labels == label, 1], 
                c=[color], label=label_name, alpha=0.5, edgecolors='w', s=100)

plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend()
plt.show()


ValueError: Found array with 0 sample(s) (shape=(0, 6)) while a minimum of 1 is required.

In [7]:
import numpy as np
import umap
import hdbscan
import matplotlib.pyplot as plt

# Assuming 's' is your output tensor from PyTorch
# Move the tensor to the CPU and convert it to a NumPy array
s_np = s.cpu().detach().numpy()

# Step 1: Use UMAP to reduce dimensions to 2
reducer = umap.UMAP(n_components=2)
s_umap = reducer.fit_transform(s_np)

# Step 2: Use HDBSCAN to cluster the reduced dimensions
clusterer = hdbscan.HDBSCAN()
labels = clusterer.fit_predict(s_umap)

# Step 3: Plot the results with each cluster in different color
plt.figure(figsize=(10, 8))
plt.title('UMAP Dimensionality Reduction and HDBSCAN Clustering')

# Generate a scatter plot with different colors for each cluster
unique_labels = np.unique(labels)
for label in unique_labels:
    if label == -1:
        # HDBSCAN labels noise points as -1
        color = 'black'
        label_name = 'Noise'
    else:
        color = plt.cm.Spectral(float(label) / len(unique_labels))
        label_name = f'Cluster {label}'
    
    plt.scatter(s_umap[labels == label, 0], s_umap[labels == label, 1], 
                c=[color], label=label_name, alpha=0.5, edgecolors='w', s=100)

plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend()
plt.show()


ValueError: Input contains NaN.