In [1]:
from itertools import product
from pathlib import Path

import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Load the patient table, discard the patient identifier column and keep only
# rows without missing values.
df = (
    pd.read_csv('data/hf_patients_data.csv')
    .drop(columns=['id_global_pat'])
    .dropna()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73449 entries, 0 to 73449
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   los       73449 non-null  int64  
 1   Provider  73449 non-null  object 
 2   female    73449 non-null  float64
 3   age       73449 non-null  int64  
 4   comI10    73449 non-null  int64  
 5   comI20    73449 non-null  int64  
 6   comI25    73449 non-null  int64  
 7   comI48    73449 non-null  int64  
 8   comI60    73449 non-null  int64  
 9   comJ44    73449 non-null  int64  
 10  comE10    73449 non-null  int64  
 11  comN18    73449 non-null  int64  
 12  comE66    73449 non-null  int64  
 13  comI21    73449 non-null  int64  
 14  comI05    73449 non-null  int64  
 15  comI34    73449 non-null  int64  
 16  comI42    73449 non-null  int64  
 17  comN17    73449 non-null  int64  
 18  comC00    73449 non-null  int64  
 19  comI60_9  73449 non-null  int64  
 20  comF00    73449 non-null  int64  

In [4]:
# For quick tests: uncomment to work with only the first 100 rows
# df = df.iloc[:100, :]
# df.info()

In [5]:
# Names of every numeric column in df; draw_umap scales and embeds exactly
# these columns.
num_cols = df.select_dtypes(include='number').columns.tolist()
num_cols

['los',
 'female',
 'age',
 'comI10',
 'comI20',
 'comI25',
 'comI48',
 'comI60',
 'comJ44',
 'comE10',
 'comN18',
 'comE66',
 'comI21',
 'comI05',
 'comI34',
 'comI42',
 'comN17',
 'comC00',
 'comI60_9',
 'comF00',
 'comU07']

In [6]:
def draw_umap(df: pd.DataFrame, metric: str = 'euclidean', min_dist: float = 0.1, n_neighbors: int = 5,
              output_dir: str = './umap_plots') -> None:
    '''
    Embed the numeric columns of ``df`` in 2D with UMAP and save an interactive plot.

    The columns listed in the module-level ``num_cols`` are standardized with
    ``StandardScaler``, reduced to two components with UMAP, and drawn as a
    Plotly scatter plot coloured by the ``female`` column. The figure is saved as
    ``<output_dir>/UMAP_metric_<metric>_min-dist_<min_dist>_n-neighbors_<n_neighbors>.html``.

    ``df`` is not modified: the embedding coordinates are attached to a copy
    that is used only for plotting, so repeated calls always start from the
    same input frame.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain every column in ``num_cols`` and a ``female`` column.
    metric : str
        Distance metric passed to ``UMAP``.
    min_dist : float
        ``min_dist`` passed to ``UMAP``.
    n_neighbors : int
        ``n_neighbors`` passed to ``UMAP``.
    output_dir : str
        Directory for the HTML file; created if it does not exist.

    Returns
    -------
    None
        The only result is the HTML file written to ``output_dir``.
    '''

    # Create the output directory up front so a missing or unwritable path
    # fails before the expensive UMAP fit rather than after it.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Scale the data.
    # NOTE(review): the features are standardized regardless of `metric`; for
    # metrics that treat inputs as binary/non-zero indicators (e.g. 'jaccard')
    # this is presumably not meaningful -- confirm before relying on those plots.
    scaler = StandardScaler().set_output(transform='pandas')
    df_scaled = scaler.fit_transform(df[num_cols])

    sex = df['female'].astype('category')

    # Fixed seed and single-threaded execution, intended to keep the embedding
    # reproducible between runs.
    reducer = UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist, n_components=2,
                   random_state=42, n_jobs=1)
    embedding = reducer.fit_transform(df_scaled)

    # Attach the coordinates to a copy instead of writing 'x'/'y' into the
    # caller's frame.
    plot_df = df.assign(x=embedding[:, 0], y=embedding[:, 1])

    plot_title = f'UMAP with metric: {metric}, min_dist: {min_dist}, n_neighbors: {n_neighbors}'

    fig = px.scatter(plot_df, 'x', 'y', color=sex, labels={'color': 'female'}, title=plot_title,
                     width=1200, height=1000, template='simple_white',
                     hover_data=num_cols)
    fig.update_layout(margin=dict(l=50, r=50, t=50, b=50), paper_bgcolor="White")
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)

    out_path = out_dir / f'UMAP_metric_{metric}_min-dist_{min_dist}_n-neighbors_{n_neighbors}.html'
    fig.write_html(str(out_path))

In [7]:
# Hyperparameter grid for the UMAP sweep: one HTML plot is produced for every
# (metric, min_dist, n_neighbors) combination.
metric_list = ['cosine', 'hamming', 'jaccard']
min_dist_list = [0.1, 0.2, 0.5]
n_neighbors_list = [25, 50, 100, 150]

# Full Cartesian product, with the metric varying slowest and n_neighbors fastest.
params = [
    (metric, min_dist, n_neighbors)
    for metric in metric_list
    for min_dist in min_dist_list
    for n_neighbors in n_neighbors_list
]

for metric, min_dist, n_neighbors in tqdm(params):
    draw_umap(df, metric=metric, min_dist=min_dist, n_neighbors=n_neighbors)

  0%|          | 0/36 [00:00<?, ?it/s]

100%|██████████| 36/36 [2:53:51<00:00, 289.75s/it]  


Визуально выбрали для кластеризации 2D-представления данных, полученные при помощи UMAP со следующими гиперпараметрами:
1) metric='hamming', min_dist=0.2, n_neighbors=150;
2) metric='hamming', min_dist=0.5, n_neighbors=25.