In [1]:
from itertools import product
from pathlib import Path

import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings

# Silence the noisiest warning categories for the rest of the notebook.
for _category in (FutureWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter(action='ignore', category=_category)

In [3]:
df = pd.read_csv('data/hf_patients_data.csv')

# Set the patient identifier aside so it is not part of the feature table.
# It is captured before rows are dropped, so it still has one entry per CSV row.
id_global_path = df.pop('id_global_pat')

# Remove every row with a missing value in any of the remaining columns.
df.dropna(axis=0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73449 entries, 0 to 73449
Data columns (total 22 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   los       73449 non-null  int64  
 1   Provider  73449 non-null  object 
 2   female    73449 non-null  float64
 3   age       73449 non-null  int64  
 4   comI10    73449 non-null  int64  
 5   comI20    73449 non-null  int64  
 6   comI25    73449 non-null  int64  
 7   comI48    73449 non-null  int64  
 8   comI60    73449 non-null  int64  
 9   comJ44    73449 non-null  int64  
 10  comE10    73449 non-null  int64  
 11  comN18    73449 non-null  int64  
 12  comE66    73449 non-null  int64  
 13  comI21    73449 non-null  int64  
 14  comI05    73449 non-null  int64  
 15  comI34    73449 non-null  int64  
 16  comI42    73449 non-null  int64  
 17  comN17    73449 non-null  int64  
 18  comC00    73449 non-null  int64  
 19  comI60_9  73449 non-null  int64  
 20  comF00    73449 non-null  int64  

In [4]:
# For testing: uncomment to run the notebook on the first 100 rows only
# df = df.iloc[:100, :]
# df.info()

In [5]:
# Names of all numeric columns; used for scaling and as plot hover data.
num_cols = df.select_dtypes(include='number').columns.tolist()
num_cols

['los',
 'female',
 'age',
 'comI10',
 'comI20',
 'comI25',
 'comI48',
 'comI60',
 'comJ44',
 'comE10',
 'comN18',
 'comE66',
 'comI21',
 'comI05',
 'comI34',
 'comI42',
 'comN17',
 'comC00',
 'comI60_9',
 'comF00',
 'comU07']

In [6]:
# Scale the numeric features; the scaler is configured to return a DataFrame
# instead of a NumPy array.
scaler = StandardScaler()
scaler.set_output(transform='pandas')
df_scaled = scaler.fit_transform(df[num_cols])

# Preview the scaled features.
df_scaled.head()

Unnamed: 0,los,female,age,comI10,comI20,comI25,comI48,comI60,comJ44,comE10,...,comE66,comI21,comI05,comI34,comI42,comN17,comC00,comI60_9,comF00,comU07
0,2.011169,-1.2022,0.642192,-2.002283,-2.022729,-0.406233,-0.588519,-0.238338,-0.346668,-0.553816,...,-0.350745,-0.229434,-0.101228,-0.192177,-0.183277,-0.060967,-0.364021,-1.354256,-0.020877,-0.560509
1,-0.914888,-1.2022,1.633415,0.49943,0.494382,-0.406233,-0.588519,-0.238338,-0.346668,-0.553816,...,-0.350745,-0.229434,-0.101228,-0.192177,-0.183277,-0.060967,-0.364021,0.738413,-0.020877,-0.560509
2,1.150564,0.831809,1.303007,-2.002283,0.494382,-0.406233,-0.588519,-0.238338,-0.346668,1.805653,...,-0.350745,-0.229434,-0.101228,-0.192177,-0.183277,-0.060967,-0.364021,0.738413,-0.020877,-0.560509
3,0.634201,0.831809,-0.018624,0.49943,0.494382,-0.406233,1.699182,-0.238338,2.884604,-0.553816,...,-0.350745,-0.229434,-0.101228,5.203541,-0.183277,-0.060967,-0.364021,0.738413,-0.020877,-0.560509
4,-0.914888,-1.2022,0.229182,0.49943,0.494382,-0.406233,-0.588519,-0.238338,-0.346668,-0.553816,...,-0.350745,-0.229434,-0.101228,-0.192177,-0.183277,-0.060967,-0.364021,-1.354256,-0.020877,-0.560509


In [7]:
# Add UMAP coordinate columns to df for two dimensionality-reduction settings:
# projection 1 -> columns x1/y1, projection 2 -> columns x2/y2.
umap_settings = {
    1: dict(n_neighbors=150, min_dist=0.2),
    2: dict(n_neighbors=25, min_dist=0.5),
}

for projection_id, settings in umap_settings.items():
    # Everything except n_neighbors/min_dist is shared by both projections.
    umap = UMAP(metric='hamming', n_components=2, random_state=42, n_jobs=1, **settings)
    embedding = umap.fit_transform(df_scaled)
    df[f'x{projection_id}'] = embedding[:, 0]
    df[f'y{projection_id}'] = embedding[:, 1]

In [8]:
def draw_clusters_original_data(eps, min_samples, metric, log_dict,
                                output_dir='./dbscan_plots/original_data'):
    '''
    Cluster the scaled features with DBSCAN, log quality scores and save UMAP plots.

    DBSCAN is fitted on the notebook-level ``df_scaled``. The labels are then
    drawn on both UMAP projections stored in ``df`` (``x1``/``y1`` and
    ``x2``/``y2``) and each figure is written to an HTML file.

    Parameters
    ----------
    eps : float
        ``eps`` passed to DBSCAN.
    min_samples : int
        ``min_samples`` passed to DBSCAN.
    metric : str
        ``metric`` passed to DBSCAN.
    log_dict : dict
        Mutated in place. A ``(silhouette, calinski_harabasz, davies_bouldin)``
        tuple, each rounded to 4 decimals, is stored under a key describing
        the run. The tuple is ``('Error', 'Error', 'Error')`` when the scores
        cannot be computed.
    output_dir : str or pathlib.Path, optional
        Folder that receives the HTML plots; created when it does not exist.

    Returns
    -------
    None
    '''
    # Create the output folder up front so a missing directory fails before
    # the expensive clustering instead of after it.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Work on a copy so the shared frame never receives a 'cluster' column.
    df_cp = df.copy()

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=-1)
    dbscan.fit(df_scaled)
    df_cp['cluster'] = dbscan.labels_
    # Presumably cast to str so plotly colours clusters as categories - confirm.
    df_cp['cluster'] = df_cp['cluster'].astype('str')

    key = f'DBSCAN on scaled data with eps={eps}, min_samples={min_samples}, metric={metric}'

    # The labels are scored exactly as DBSCAN returns them, so points labelled
    # -1 (noise) are scored as if they formed one more cluster.
    # NOTE(review): silhouette_score is called with its default metric, not
    # with `metric` - confirm this is intended.
    try:
        scores = tuple(
            round(score_fn(df_scaled, dbscan.labels_), 4)
            for score_fn in (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
        )
    except ValueError as err:
        # Expected when the labelling is degenerate (e.g. a single label).
        # Report the reason instead of hiding it; other errors propagate.
        print(f'{key}: scores not computed ({err})')
        scores = ('Error', 'Error', 'Error')

    log_dict[key] = scores

    # One figure per UMAP projection, coloured by the same cluster labels.
    for ordinal, x_col, y_col in (('First', 'x1', 'y1'), ('Second', 'x2', 'y2')):
        plot_title = f'{ordinal} Umap plot with DBSCAN with eps={eps}, min_samples={min_samples}, metric={metric}'

        # NOTE(review): the key 'color' in `labels` does not match any column
        # plotted here, so the mapping looks like a no-op - confirm and remove.
        fig = px.scatter(df_cp, x_col, y_col, color='cluster', labels={'color': 'female'}, title=plot_title,
                         width=1200, height=1000, template='simple_white',
                         hover_data=num_cols)
        fig.update_layout(margin=dict(l=50, r=50, t=50, b=50), paper_bgcolor="White")
        fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
        fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)

        file_name = f'{ordinal}_UMAP_DBSCAN_eps_{eps}_min_samples_{min_samples}_metric_{metric}.html'
        fig.write_html(str(out_dir / file_name))

In [9]:
def draw_clusters_umapped_data(eps, min_samples, metric, log_dict,
                               output_dir='./dbscan_plots/umapped_data'):
    '''
    Cluster each UMAP projection with DBSCAN, log quality scores and save plots.

    For each of the two projections stored in the notebook-level ``df``
    (``x1``/``y1`` and ``x2``/``y2``), DBSCAN is fitted on the two coordinate
    columns. The scores are computed on the same coordinates, and the
    projection is plotted with the resulting labels and written to HTML.

    Parameters
    ----------
    eps : float
        ``eps`` passed to DBSCAN.
    min_samples : int
        ``min_samples`` passed to DBSCAN.
    metric : str
        ``metric`` passed to DBSCAN.
    log_dict : dict
        Mutated in place. For each projection a
        ``(silhouette, calinski_harabasz, davies_bouldin)`` tuple, each rounded
        to 4 decimals, is stored under a key describing the run. The tuple is
        ``('Error', 'Error', 'Error')`` when the scores cannot be computed.
    output_dir : str or pathlib.Path, optional
        Folder that receives the HTML plots; created when it does not exist.

    Returns
    -------
    None
    '''
    # Create the output folder up front so a missing directory fails before
    # the expensive clustering instead of after it.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Work on a copy so the shared frame never receives a 'cluster' column.
    df_cp = df.copy()

    for projection_id, ordinal in ((1, 'First'), (2, 'Second')):
        coord_cols = [f'x{projection_id}', f'y{projection_id}']

        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, n_jobs=-1)
        dbscan.fit(df_cp[coord_cols])
        # Overwrites the labels of the previous projection, if any.
        df_cp['cluster'] = dbscan.labels_
        # Presumably cast to str so plotly colours clusters as categories - confirm.
        df_cp['cluster'] = df_cp['cluster'].astype('str')

        key = f'DBSCAN on Umapped coords {projection_id} data with eps={eps}, min_samples={min_samples}, metric={metric}'

        # The labels are scored exactly as DBSCAN returns them, so points
        # labelled -1 (noise) are scored as if they formed one more cluster.
        # NOTE(review): silhouette_score is called with its default metric,
        # not with `metric` - confirm this is intended.
        try:
            scores = tuple(
                round(score_fn(df_cp[coord_cols], dbscan.labels_), 4)
                for score_fn in (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
            )
        except ValueError as err:
            # Expected when the labelling is degenerate (e.g. a single label).
            # Report the reason instead of hiding it; other errors propagate.
            print(f'{key}: scores not computed ({err})')
            scores = ('Error', 'Error', 'Error')

        log_dict[key] = scores

        plot_title = f'{ordinal} Umap plot with DBSCAN with eps={eps}, min_samples={min_samples}, metric={metric}'

        # NOTE(review): the key 'color' in `labels` does not match any column
        # plotted here, so the mapping looks like a no-op - confirm and remove.
        fig = px.scatter(df_cp, coord_cols[0], coord_cols[1], color='cluster', labels={'color': 'female'},
                         title=plot_title,
                         width=1200, height=1000, template='simple_white',
                         hover_data=num_cols)
        fig.update_layout(margin=dict(l=50, r=50, t=50, b=50), paper_bgcolor="White")
        fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
        fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)

        file_name = f'{ordinal}_UMAP_DBSCAN_eps_{eps}_min_samples_{min_samples}_metric_{metric}.html'
        fig.write_html(str(out_dir / file_name))

In [10]:
# DBSCAN parameter grid for clustering the scaled features (3 x 3 x 2 = 18 runs).
eps_list = [0.01, 0.05, 0.1]
min_samples_list = [10, 25, 42]
metric_list = ['cosine', 'euclidean']

# Filled by draw_clusters_original_data: run description -> score tuple.
log_dict = dict()

params = [*product(eps_list, min_samples_list, metric_list)]

for eps, min_samples, metric in tqdm(params, total=len(params)):
    draw_clusters_original_data(eps=eps, min_samples=min_samples, metric=metric, log_dict=log_dict)

100%|██████████| 18/18 [10:07<00:00, 33.75s/it]


In [11]:
# One row per logged run, one column per score; positions 0/1/2 of each logged
# tuple are given readable column names before the export.
score_columns = {0: 'Silhoulette score', 1: 'Calinski Harabasz score', 2: 'Davies Bouldin score'}
df_metrics = pd.DataFrame(log_dict).transpose().rename(columns=score_columns)
df_metrics.to_excel('./dbscan_plots/dbscan_original_data_metrics.xlsx')

In [12]:
# DBSCAN parameter grid for clustering the UMAP coordinates (3 x 4 x 2 = 24 runs).
eps_list = [0.0001, 0.001, 0.01]
min_samples_list = [2, 4, 8, 10]
metric_list = ['cosine', 'euclidean']

# Fresh log so these runs are kept apart from the scaled-data runs.
log_dict = dict()

params = [*product(eps_list, min_samples_list, metric_list)]

for eps, min_samples, metric in tqdm(params, total=len(params)):
    draw_clusters_umapped_data(eps=eps, min_samples=min_samples, metric=metric, log_dict=log_dict)

100%|██████████| 24/24 [20:43<00:00, 51.80s/it]


In [13]:
# One row per logged run, one column per score; positions 0/1/2 of each logged
# tuple are given readable column names before the export.
score_columns = {0: 'Silhoulette score', 1: 'Calinski Harabasz score', 2: 'Davies Bouldin score'}
df_metrics = pd.DataFrame(log_dict).transpose().rename(columns=score_columns)
df_metrics.to_excel('./dbscan_plots/dbscan_umapped_data_metrics.xlsx')

In [6]:
clusters_df = pd.read_excel('clusters.xlsx')

# id_global_path was captured before df.dropna(), so it still carries the
# original CSV row labels, including rows that were dropped. Assigning the
# Series directly would align it by label with the fresh index of the Excel
# frame and shift the ids after the first dropped row. Instead, take the ids of
# the rows kept in df and assign them by position.
# Assumes clusters.xlsx has one row per row of df, in the same order - the
# positional cluster-label assignments further down rely on the same thing.
assert len(clusters_df) == len(df), 'clusters.xlsx and df must have the same number of rows'
clusters_df['id_global_pat'] = id_global_path.loc[df.index].to_numpy()
clusters_df.head()

Unnamed: 0,los,Provider,female,age,comI10,comI20,comI25,comI48,comI60,comJ44,...,comI42,comN17,comC00,comI60_9,comF00,comU07,kmeans_5,dbscan_5,dbscan_7_with_outliers,id_global_pat
0,22,"СПб ГБУЗ ""Госпиталь для ветеранов войн""",0,82,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0b17c733-719e-47f6-844e-c081933f9a0d
1,5,"СПб ГБУЗ ""Городская Покровская больница""",0,94,1,1,0,0,0,0,...,0,0,0,1,0,0,1,1,1,4cfffa07-01e8-4805-939a-1b94bf26c437
2,17,"СПб ГБУЗ ""Госпиталь для ветеранов войн""",1,90,0,1,0,0,0,0,...,0,0,0,1,0,0,3,2,2,c9b4803d-71f2-4675-aa3f-8f7cb7accc1c
3,14,"СПб ГБУЗ ""Городская Покровская больница""",1,74,1,1,0,1,0,1,...,0,0,0,1,0,0,2,3,3,1a203e53-b076-4a18-a377-cd8e833230a6
4,5,"СПб ГБУЗ ""Городская Покровская больница""",0,77,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,1,d195dce1-a9a7-4743-b931-214e76817a16


In [18]:
# Final cluster labels: DBSCAN on the first UMAP projection with two eps
# values; each result is stored positionally in its own clusters_df column.
final_eps_by_column = {
    'dbscan_5': 0.01,
    'dbscan_7_with_outliers': 0.001,
}

for label_column, eps_value in final_eps_by_column.items():
    dbscan = DBSCAN(eps=eps_value, min_samples=10, metric='cosine', n_jobs=-1)
    dbscan.fit(df[['x1', 'y1']])
    clusters_df[label_column] = dbscan.labels_

In [19]:
# Preview the first rows after the cluster label columns were refreshed.
clusters_df.head(n=5)

Unnamed: 0,los,Provider,female,age,comI10,comI20,comI25,comI48,comI60,comJ44,...,comI34,comI42,comN17,comC00,comI60_9,comF00,comU07,kmeans_5,dbscan_5,dbscan_7_with_outliers
0,22,"СПб ГБУЗ ""Госпиталь для ветеранов войн""",0,82,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,"СПб ГБУЗ ""Городская Покровская больница""",0,94,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
2,17,"СПб ГБУЗ ""Госпиталь для ветеранов войн""",1,90,0,1,0,0,0,0,...,0,0,0,0,1,0,0,3,2,2
3,14,"СПб ГБУЗ ""Городская Покровская больница""",1,74,1,1,0,1,0,1,...,1,0,0,0,1,0,0,2,3,3
4,5,"СПб ГБУЗ ""Городская Покровская больница""",0,77,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [None]:
# Disabled export: uncomment to write clusters_df to 'clusters.xlsx'
# (the same file name that is read into clusters_df earlier in the notebook).
# clusters_df.to_excel('clusters.xlsx')