In [156]:
import pandas as pd
import numpy as np
from dask import dataframe as dd
from sklearn.metrics.pairwise import cosine_similarity, nan_euclidean_distances
from sklearn.preprocessing import LabelEncoder, normalize

In [114]:
# Load one parquet partition of the processed table (judging by the path, the
# CheXpert metadata); the path is relative to the notebook's working directory.
df = pd.read_parquet('../data/processed/ProcessChexpertDf/part.0.parquet')

In [116]:
# Keep an untouched copy: `df` is encoded below, while `df_raw` is used at the
# end of the notebook to show the original, human-readable values.
df_raw = df.copy()

In [117]:
# Working frame = raw frame without the image path column.
# Built from `df_raw` rather than dropped in place so the cell can be re-run:
# an in-place drop raises KeyError the second time because 'Path' is already gone.
df = df_raw.drop(columns=['Path'])

In [118]:
# Names of the object-dtype columns; these are the ones that need encoding
# before any numeric processing.
object_cols = df.columns[(df.dtypes == object).to_numpy()].values

In [119]:
# Encoder used by the next cell to turn the object columns into integer codes.
LE = LabelEncoder()

In [120]:
# Fit a separate encoder per column and keep it, so the integer codes can be
# mapped back to the original labels later (via `classes_` /
# `inverse_transform`). A single shared encoder only remembers the classes of
# the last column it was fitted on.
label_encoders = {}
for object_col in object_cols:
    # Rebinding LE on each pass leaves it, as before, fitted on the last column.
    LE = LabelEncoder()
    # Cast to str so the encoder receives uniformly typed values.
    df[object_col] = LE.fit_transform(df[object_col].astype(str))
    label_encoders[object_col] = LE

In [133]:
# Display the names of the object-dtype columns collected above.
object_cols

array(['Sex', 'Frontal/Lateral', 'AP/PA'], dtype=object)

In [130]:
def normalize(df, columns):
    """Min-max scale the selected columns of a DataFrame into [0, 1].

    NOTE: this definition shadows ``sklearn.preprocessing.normalize``, which
    is imported at the top of the notebook. Once this cell has run, the name
    ``normalize`` refers to this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of str
        Names of the columns to rescale; all other columns are copied as-is.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every listed column ``c`` is replaced by
        ``(c - c.min()) / (c.max() - c.min())``. Missing values stay missing.
        A constant column (``max == min``) is mapped to 0.0 instead of the
        NaN that the 0/0 division would produce.
    """
    result = df.copy()
    for feature_name in columns:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: avoid 0/0. Subtracting the minimum yields zeros
            # (and keeps NaNs); cast to float to match the dtype of the
            # regular branch.
            result[feature_name] = (column - min_value).astype(float)
        else:
            result[feature_name] = (column - min_value) / value_range
    return result

In [None]:
# Columns that were label-encoded. They have to be looked up on the untouched
# copy `df_raw`: after the encoding cell `df` has no object columns left, so
# `df.dtypes == object` would select nothing. 'Path' is excluded because it
# was dropped from `df`.
object_cols = df_raw.dtypes[df_raw.dtypes == object].index.drop('Path', errors='ignore').values
def normalize_my_df():
    """Min-max scale the encoded categorical columns and 'Age' of the global `df`.

    Rebinds the notebook-level `df` to the scaled frame and also returns it.
    Uses the notebook's `normalize` helper and the `object_cols` computed
    above.

    Returns
    -------
    pandas.DataFrame
        The scaled frame (the same object now bound to the global `df`).
    """
    # Without this declaration the assignments below make `df` a local name,
    # and reading it on the right-hand side raises UnboundLocalError.
    global df
    df = normalize(df, object_cols)
    df = normalize(df, ['Age'])
    return df

In [140]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0_level_0,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.0,0.755556,0.0,0.0,1.0,,,,,,,,,0.0,,,,1.0
1,0.0,0.966667,0.0,0.0,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,0.0,0.922222,0.0,0.0,,,,1.0,,,-1.0,,,,,,1.0,
3,0.0,0.922222,1.0,0.5,,,,1.0,,,-1.0,,,,,,1.0,
4,0.5,0.455556,0.0,0.0,,,,,,1.0,,,,0.0,,,,


In [141]:
# Among the rows with a positive 'Lung Lesion' label, find the one with the
# fewest missing values, i.e. a CXR with lots of features filled in.
df.loc[df['Lung Lesion'].eq(1)].isna().sum(axis='columns').idxmin()

37959

In [142]:
# Guard: the following cells hard-code 37959 as the reference row, so fail
# loudly if the data no longer yields that label.
assert df[df['Lung Lesion']==1].isna().sum(axis=1).idxmin() == 37959

In [143]:
# Inspect the reference row found above.
df.loc[37959]

Sex                           0.500000
Age                           0.666667
Frontal/Lateral               0.000000
AP/PA                         0.000000
No Finding                         NaN
Enlarged Cardiomediastinum    0.000000
Cardiomegaly                       NaN
Lung Opacity                  1.000000
Lung Lesion                   1.000000
Edema                         1.000000
Consolidation                 1.000000
Pneumonia                    -1.000000
Atelectasis                  -1.000000
Pneumothorax                  0.000000
Pleural Effusion              1.000000
Pleural Other                      NaN
Fracture                           NaN
Support Devices               1.000000
Name: 37959, dtype: float64

In [144]:
# Index label of the reference row that the other rows are compared against
# (the value checked by the assert above).
index_comparator = 37959

In [145]:
# Number of missing values in the reference row.
n_nanvalues = pd.isna(df.loc[index_comparator]).sum()

In [146]:
# The reference row's actual values; used further down as the query vector
# for the distance computation.
row_comparator_raw = df.loc[index_comparator]

In [147]:
# Missing-value mask of the reference row (True where the value is NaN).
row_comparator = ~df.loc[index_comparator].notna()

In [148]:
# Display the reference row's missing-value mask.
row_comparator

Sex                           False
Age                           False
Frontal/Lateral               False
AP/PA                         False
No Finding                     True
Enlarged Cardiomediastinum    False
Cardiomegaly                   True
Lung Opacity                  False
Lung Lesion                   False
Edema                         False
Consolidation                 False
Pneumonia                     False
Atelectasis                   False
Pneumothorax                  False
Pleural Effusion              False
Pleural Other                  True
Fracture                       True
Support Devices               False
Name: 37959, dtype: bool

In [149]:
# The same mask as a plain NumPy array.
row_comparator.values

array([False, False, False, False,  True, False,  True, False, False,
       False, False, False, False, False, False,  True,  True, False])

In [150]:
# Rows that have exactly as many missing values as the reference row
# (the same count, not necessarily in the same columns).
df.loc[df.isna().sum(axis='columns').eq(n_nanvalues)]

Unnamed: 0_level_0,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
15265,0.5,0.844444,0.0,0.75,,-1.0,-1.0,1.0,,0.0,0.0,,-1.0,0.0,-1.0,-1.0,,0.0
15266,0.5,0.844444,1.0,0.5,,-1.0,-1.0,1.0,,0.0,0.0,,-1.0,0.0,-1.0,-1.0,,0.0
19452,0.5,0.966667,0.0,0.75,,-1.0,1.0,-1.0,,0.0,0.0,,-1.0,0.0,1.0,-1.0,1.0,
19453,0.5,0.966667,1.0,0.5,,-1.0,1.0,-1.0,,0.0,0.0,,-1.0,0.0,1.0,-1.0,1.0,
37959,0.5,0.666667,0.0,0.0,,0.0,,1.0,1.0,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0
80312,0.5,0.466667,0.0,0.75,,-1.0,,1.0,,0.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,,1.0
80313,0.5,0.466667,1.0,0.5,,-1.0,,1.0,,0.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,,1.0
116061,0.0,0.711111,0.0,0.75,,0.0,0.0,1.0,0.0,0.0,0.0,,1.0,0.0,-1.0,-1.0,,
116062,0.0,0.711111,1.0,0.5,,0.0,0.0,1.0,0.0,0.0,0.0,,1.0,0.0,-1.0,-1.0,,
144069,0.5,0.688889,0.0,0.0,,1.0,-1.0,1.0,,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0


In [151]:
# Find the 100 rows whose set of mentioned features (non-missing columns) is
# most similar to the reference row's -- not necessarily the same findings.
# For each row, count the columns whose "is missing" flag agrees with the
# reference mask, then keep the labels of the 100 highest counts.
similar_features_idx = (
    df.isna()
    .eq(row_comparator, axis='columns')
    .sum(axis=1)
    .sort_values(ascending=False)
    .iloc[:100]
    .index
)

In [152]:
# Display the selected row labels.
similar_features_idx

Int64Index([ 37959, 152538,  35737, 210244, 145696, 132525,  28404, 113518,
            132526, 219420, 216617, 199087,   4811, 173799, 168283,   4810,
            219889,  35738, 195964,  92004, 136007, 136008,  87096,   6794,
            197859, 188287,  97683, 111571, 137648,  80312, 216046,  80313,
            137647, 197860, 184323, 147229,  97682, 169843, 168271,  42090,
            212838,  69982, 100999, 218450,  44112, 193922, 184511, 175053,
            175056, 189784, 154273, 153679, 211020,  25478, 103732, 199849,
            215835, 215838,  91258, 170707, 166470, 118165, 201499,  54867,
            191612, 154482, 168313, 194979, 113382,  23170, 114162, 144069,
            160623,  34363, 113383, 198671,  53057, 216306, 138084, 206976,
            128621,  69088, 213887, 213883,  24581, 192466, 192465, 207410,
            109698, 153047, 212012,  17419,  15509,  17421,  22580,  18492,
             59042,  59043, 216463,  18954],
           dtype='int64', name='__null_dask

In [153]:
# Show the rows selected above (those whose missing-value pattern best matches
# the reference row's).
df.loc[similar_features_idx]

Unnamed: 0_level_0,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
37959,0.5,0.666667,0.0,0.00,,0.0,,1.0,1.0,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0
152538,0.0,0.711111,0.0,0.00,,-1.0,,,1.0,1.0,1.0,1.0,1.0,0.0,1.0,,,1.0
35737,0.0,0.433333,0.0,0.75,,1.0,,1.0,1.0,-1.0,-1.0,-1.0,-1.0,,1.0,,,1.0
210244,0.0,0.855556,0.0,0.00,,1.0,,1.0,,0.0,-1.0,-1.0,-1.0,0.0,1.0,,,1.0
145696,0.0,0.811111,0.0,0.00,,-1.0,,1.0,,-1.0,-1.0,-1.0,-1.0,0.0,1.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18492,0.0,0.800000,0.0,0.00,,-1.0,,1.0,1.0,,,-1.0,-1.0,1.0,1.0,,,1.0
59042,0.5,0.633333,0.0,0.75,,0.0,,1.0,0.0,0.0,0.0,,1.0,0.0,0.0,,,
59043,0.5,0.633333,1.0,0.50,,0.0,,1.0,0.0,0.0,0.0,,1.0,0.0,0.0,,,
216463,0.0,0.733333,0.0,0.00,,-1.0,,1.0,,0.0,,-1.0,-1.0,1.0,1.0,,,1.0


In [164]:
# Rank the candidate rows by NaN-aware Euclidean distance to the reference row.
# The query is reshaped to a single-row 2-D array, so the result (and hence
# `argsorted`) has shape (1, number of candidates); positions are ordered from
# closest to farthest.
argsorted = np.argsort(
    nan_euclidean_distances(
        row_comparator_raw.to_numpy().reshape(1, -1),
        df.loc[similar_features_idx],
    )
)

In [171]:
# Row labels of the candidates, ordered from closest to farthest.
# `argsorted` is 2-D (one row per query), so take its single row before
# indexing: indexing a pandas Index with a 2-D array is deprecated (it emitted
# a FutureWarning) and raises in pandas 2.x.
similar_features_idx.to_numpy()[argsorted[0]]

  """Entry point for launching an IPython kernel.


array([ 37959,  17419, 160623, 189784, 144069, 173799,  15509, 197860,
       197859,  69088, 216617, 166470,  22580, 168313,  18954, 198671,
       219889,  69982, 193922,  18492, 201499, 211020, 191612,  91258,
       216463, 137647,  23170, 215838, 215835, 137648,  92004, 168283,
       184511, 213887, 213883, 100999, 114162,  54867, 154273, 153679,
       207410, 169843, 147229, 175053, 175056,   4810, 132525, 210244,
       195964,  28404,  17421, 111571,   6794,   4811, 132526, 103732,
        25478, 154482,  24581,  44112, 136007,  34363, 206976, 199849,
        87096, 188287,  42090, 219420, 136008, 212838, 216306, 170707,
       212012, 216046, 192465, 152538, 145696,  59042,  97682,  35737,
        59043,  97683,  53057, 168271, 192466,  35738, 184323, 153047,
        80312,  80313, 218450, 194979, 128621, 118165, 113518, 109698,
       113382, 138084, 113383, 199087], dtype=int64)

In [173]:
# Show the original (unencoded) records of the candidates, closest first.
# Use the single row of the 2-D `argsorted`: indexing a pandas Index with a
# 2-D array is deprecated (FutureWarning) and raises in pandas 2.x.
df_raw.loc[similar_features_idx[argsorted[0]]]

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
37959,CheXpert-v1.0-small/train/patient09313/study1/...,Male,60,Frontal,AP,,0.0,,1.0,1.0,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0
17419,CheXpert-v1.0-small/train/patient04352/study4/...,Male,60,Frontal,AP,,0.0,,,,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0
160623,CheXpert-v1.0-small/train/patient37548/study7/...,Male,49,Frontal,AP,,,,1.0,1.0,1.0,1.0,-1.0,,0.0,1.0,,,1.0
189784,CheXpert-v1.0-small/train/patient45426/study5/...,Female,65,Frontal,AP,,0.0,,1.0,,1.0,,-1.0,-1.0,0.0,1.0,,,1.0
144069,CheXpert-v1.0-small/train/patient34615/study19...,Male,62,Frontal,AP,,1.0,-1.0,1.0,,1.0,1.0,-1.0,-1.0,0.0,1.0,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109698,CheXpert-v1.0-small/train/patient26341/study13...,Female,49,Frontal,AP,,-1.0,,1.0,,-1.0,,-1.0,1.0,0.0,-1.0,,,1.0
113382,CheXpert-v1.0-small/train/patient27192/study1/...,Male,68,Frontal,PA,,0.0,,1.0,,,-1.0,1.0,1.0,1.0,1.0,,,1.0
138084,CheXpert-v1.0-small/train/patient33155/study17...,Female,64,Frontal,AP,,-1.0,,1.0,,-1.0,-1.0,,1.0,1.0,1.0,,,1.0
113383,CheXpert-v1.0-small/train/patient27192/study1/...,Male,68,Lateral,,,0.0,,1.0,,,-1.0,1.0,1.0,1.0,1.0,,,1.0
