In [None]:
!uv pip install umap-learn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import umap
import seaborn as sns
from tqdm import tqdm
import plotly.express as px

#import dask.dataframe as dd
#from dask import delayed, compute
#from dask.diagnostics import ProgressBar
# from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

def load_features(fn, features_dir='/data/ECG_AF/encoder_9_features_it2_val'):
    """Load one saved feature array and reduce it to its mean over axis 0.

    Parameters
    ----------
    fn : str
        Name of the ``.npy`` file, relative to ``features_dir``.
    features_dir : str, optional
        Directory holding the feature files. Defaults to the location this
        notebook was written against, so ``load_features(fn)`` behaves as
        before; pass another directory to reuse the loader elsewhere.

    Returns
    -------
    list
        The axis-0 mean of the stored array, rounded to float16 precision and
        converted to plain Python values via ``ndarray.tolist()``.
    """
    features = np.load(f'{features_dir}/{fn}')
    # The float16 cast only reduces precision: `.tolist()` hands back ordinary
    # Python floats, so the float16 dtype itself is not carried to the caller.
    return features.mean(axis=0).astype(np.float16).tolist()

In [None]:
# Load the metadata CSV. Later cells rely on its `filename` column (to locate
# each feature file) and its `dataset` column (to colour the plots).
df = pd.read_csv('/data/ECG_AF/val_self_supervised_processed.csv')

In [None]:
# Rows to process. Currently the full table; note that `df_select` is the same
# object as `df`, not a copy. Re-enable `.sample(50000)` for a quicker run on
# a subset.
df_select = df #.sample(50000)
# Serial alternative to the dask-based loading cell, with a tqdm progress bar:
#fv = df_select.filename.progress_apply(load_features).values

In [None]:
# Load every file's feature vector in parallel with dask.
#
# The dask names are imported in this cell because the matching lines in the
# notebook's import cell are commented out, which made this cell fail with a
# NameError on a fresh kernel.
from dask import compute, delayed
from dask.diagnostics import ProgressBar

# One lazy task per file, in row order. The tasks are built straight from the
# filename column; a partitioned dask DataFrame is not needed just to wrap
# each filename in a Delayed object.
lazy_load_features = delayed(load_features)
fv_tasks = [lazy_load_features(fn) for fn in df_select['filename']]

# Run the tasks with a progress bar. `fv` is a tuple holding one feature list
# per row of `df_select`, in the same order as the rows.
with ProgressBar():
    fv = compute(*fv_tasks)

In [None]:
# Cache the metadata rows on disk; the analysis half of the notebook reads
# 'df_select.parquet' back instead of depending on the cells above.
df_select.to_parquet('df_select.parquet')

In [None]:
# Stack the per-file feature lists into one array whose first axis indexes
# files. load_features returns plain Python values, so NumPy infers the dtype
# here (presumably float64 — the next cell checks it).
fv_arr = np.array(list(fv))

In [None]:
# Inspect the dtype NumPy inferred for the stacked features.
fv_arr.dtype

In [None]:
# Cache the stacked features on disk as float16; the analysis half of the
# notebook reloads them from 'fv.npy'.
np.save("fv.npy", fv_arr.astype(np.float16))

---

In [None]:
# Reload the cached metadata rows, so the cells from here on can run without
# repeating the feature-loading cells above.
df_select = pd.read_parquet('df_select.parquet')

In [None]:
# Relabel the 'CPSC-EXTRA' rows as 'CPSC' so both are treated as a single
# dataset from here on.
cpsc_extra_rows = df_select['dataset'] == 'CPSC-EXTRA'
df_select.loc[cpsc_extra_rows, 'dataset'] = 'CPSC'

In [None]:
# Reload the cached feature array.
# NOTE(review): rows are assumed to line up one-to-one with `df_select`, which
# holds only if 'fv.npy' and 'df_select.parquet' were written by the same run
# — confirm before regenerating either cache on its own.
fv_arr = np.load('fv.npy')

In [None]:
# Project the feature vectors to 2-D with UMAP.
#
# UMAP is stochastic, so without a seed the embedding (and every plot derived
# from it) changes on each run. Fixing `random_state` makes the result
# reproducible; umap-learn gives up some parallelism when a seed is set, so
# drop the argument if a faster, non-reproducible exploratory run is wanted.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(fv_arr)

In [None]:
# Fixed ordering of the dataset names; it determines both the colour assigned
# to each dataset and the category order handed to the plots.
ds_order = [
    'CHAPMAN',
    'CPSC',
    'CPSC-EXTRA',
    'GEORGIA',
    'HEFEI',
    'Mimic',
    'NINGBO',
    'PTB',
    'PTB-XL',
    'RIBEIRO',
    'Samitrop',
]
# Dataset name -> its position in `ds_order`.
mapping = {name: position for position, name in enumerate(ds_order)}

In [None]:
# Display the dataset-name -> index mapping for a quick visual check.
mapping

'Mimic', 'Samitrop', 'Georgia', 'Ptb', 'Ningbo', 'Ribeiro', 'Hefei', 'Cpsc', 'Sph', 'Ptbxl'

In [None]:
# One viridis colour per entry of `ds_order`; `colors[i]` is the colour of the
# dataset whose value in `mapping` is `i`.
colors = sns.color_palette("viridis", n_colors=len(ds_order))

In [None]:
# Static scatter plot of the UMAP embedding, one colour per source dataset,
# saved to 'umap_projection.png'.
dataset_codes = df_select.dataset.map(mapping)

# A dataset name that is absent from `mapping` maps to NaN, which would
# otherwise surface below as an opaque list-index TypeError.
unmapped = df_select.dataset[dataset_codes.isna()].unique()
if len(unmapped) > 0:
    raise ValueError(f'datasets missing from ds_order: {list(unmapped)}')

umap_fig, umap_ax = plt.subplots(figsize=(8, 10))
umap_ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=[colors[code] for code in dataset_codes.astype(int)],
    s=20,
)

# Empty proxy artists give the legend one entry per dataset actually present,
# without changing how (or in which order) the real points are drawn.
datasets_present = set(df_select.dataset.unique())
for dataset_name in ds_order:
    if dataset_name in datasets_present:
        umap_ax.scatter(
            [], [], color=colors[mapping[dataset_name]], s=20, label=dataset_name
        )
umap_ax.legend(title='Dataset')

umap_ax.set_xlabel('UMAP 1')
umap_ax.set_ylabel('UMAP 2')
#umap_ax.set_aspect('equal', 'datalim')
umap_ax.set_title('UMAP projection', fontsize=20)
umap_fig.savefig('umap_projection.png', dpi=300)

In [None]:
# Store the 2-D coordinates next to the metadata so plotly can address every
# field by column name.
df_select['embedding0'] = embedding[:, 0]
df_select['embedding1'] = embedding[:, 1]

# Interactive version of the UMAP scatter.
# The DataFrame and column names are passed (rather than bare Series) so that
# the colour column is really called 'dataset'; that is the key used by
# `category_orders` and `labels`, so the legend follows `ds_order`.
fig = px.scatter(
    df_select,
    x='embedding0',
    y='embedding1',
    color='dataset',
    labels={
        'dataset': 'Dataset',
        'embedding0': 'UMAP 1',
        'embedding1': 'UMAP 2',
    },
    category_orders={'dataset': ds_order},
    color_discrete_sequence=px.colors.sequential.Viridis,
)
#fig.write_html("umap.html")

In [None]:
# Render the interactive plotly figure in the notebook.
fig.show()