# Malignancy Data Exploration


In [1]:
import pandas as pd

from luna16.settings import settings

%matplotlib inline

## Malignant Annotation file exploration

The `annotations_with_malignancy.csv` file contains information ???

In [3]:
annotations_malignancy_path = settings.DATA_DIR / "annotations_with_malignancy.csv"
annotations_malignancy = pd.read_csv(filepath_or_buffer=annotations_malignancy_path)
annotations_malignancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1182 entries, 0 to 1181
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   seriesuid          1182 non-null   object 
 1   coord_x            1182 non-null   float64
 2   coord_y            1182 non-null   float64
 3   coord_z            1182 non-null   float64
 4   diameter_mm        1182 non-null   float64
 5   is_malignant       1182 non-null   bool   
 6   malignant_details  1182 non-null   object 
 7   bboxLowX           1182 non-null   float64
 8   bboxLowY           1182 non-null   float64
 9   bboxLowZ           1182 non-null   float64
 10  bboxHighX          1182 non-null   float64
 11  bboxHighY          1182 non-null   float64
 12  bboxHighZ          1182 non-null   float64
 13  len_mal_details    1182 non-null   int64  
dtypes: bool(1), float64(10), int64(1), object(2)
memory usage: 121.3+ KB


In [4]:
annotations_malignancy.head()

Unnamed: 0,seriesuid,coord_x,coord_y,coord_z,diameter_mm,is_malignant,malignant_details,bboxLowX,bboxLowY,bboxLowZ,bboxHighX,bboxHighY,bboxHighZ,len_mal_details
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.699421,-175.319272,-298.387506,5.651471,True,"[4, 2, 4, 2]",-131.89648,-178.259761,-299.800004,-125.451167,-172.45898,-296.200004,4
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.265775376735...,-69.175741,218.052625,-668.501273,4.4412,False,"[3, 2, 3]",-71.815939,215.70982,-670.900024,-66.722189,220.80357,-666.900024,3
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.276556509002...,-67.459503,-247.744211,-211.09534,7.744222,True,"[5, 3, 4]",-73.027344,-253.847656,-213.100004,-61.074219,-242.558594,-209.500004,3
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.276556509002...,-97.690876,-134.068054,-187.094883,6.690036,False,"[3, 3, 3]",-102.910156,-137.636719,-189.700005,-92.949219,-130.332031,-186.100005,3
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.276710697414...,69.529141,87.036153,-149.190113,10.824515,False,"[3, 3, 5, 2]",63.731247,80.46875,-154.0,76.231247,93.75,-144.0,4


In [5]:
n_annotations_malignancy = len(annotations_malignancy)
n_unique_annotations_malignancy_seriesuids = len(
    annotations_malignancy.seriesuid.unique()
)
print(
    f"Annotations with malignancy dataframe has {n_annotations_malignancy} rows and {n_unique_annotations_malignancy_seriesuids} unique series uid values."
)

Annotations with malignancy dataframe has 1182 rows and 600 unique series uid values.


In [6]:
series_uid_values_count = annotations_malignancy["seriesuid"].value_counts()
n_same_seriesuid_mean_for_annotations_malignancy = series_uid_values_count.mean()
print(
    f"For {len(annotations_malignancy)} number of annotations with malignancy, the average number "
    f"of rows with same seriesuid is {n_same_seriesuid_mean_for_annotations_malignancy:.2f}."
)
series_uid_values_count

For 1182 number of annotations with malignancy, the average number of rows with same seriesuid is 1.97.


seriesuid
1.3.6.1.4.1.14519.5.2.1.6279.6001.176030616406569931557298712518    12
1.3.6.1.4.1.14519.5.2.1.6279.6001.219428004988664846407984058588     9
1.3.6.1.4.1.14519.5.2.1.6279.6001.195557219224169985110295082004     9
1.3.6.1.4.1.14519.5.2.1.6279.6001.202187810895588720702176009630     9
1.3.6.1.4.1.14519.5.2.1.6279.6001.328789598898469177563438457842     9
                                                                    ..
1.3.6.1.4.1.14519.5.2.1.6279.6001.321465552859463184018938648244     1
1.3.6.1.4.1.14519.5.2.1.6279.6001.320111824803959660037459294083     1
1.3.6.1.4.1.14519.5.2.1.6279.6001.317087518531899043292346860596     1
1.3.6.1.4.1.14519.5.2.1.6279.6001.316911475886263032009840828684     1
1.3.6.1.4.1.14519.5.2.1.6279.6001.174168737938619557573021395302     1
Name: count, Length: 600, dtype: int64

In [7]:
annotations_malignancy["is_malignant"].value_counts()

is_malignant
False    804
True     378
Name: count, dtype: int64

## Candidates File Exploration

In [31]:
candidates_path = settings.DATA_DIR / "candidates.csv"
candidates_v2_path = settings.DATA_DIR / "candidates_v2.csv"
candidates = pd.read_csv(filepath_or_buffer=candidates_v2_path)
candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754975 entries, 0 to 754974
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   seriesuid  754975 non-null  object 
 1   coordX     754975 non-null  float64
 2   coordY     754975 non-null  float64
 3   coordZ     754975 non-null  float64
 4   class      754975 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 28.8+ MB


In [41]:
n_is_nodules = len(candidates[candidates["class"] == 1])
n_not_nodules = len(candidates[candidates["class"] == 0])
percentage_of_malignant = (100 * n_is_nodules) / len(candidates)

print(f"LuNA 16 contains {n_is_nodules} nodules and {n_not_nodules} not nodules.")
print(f"Nodules are {percentage_of_malignant:.2f}%.")

candidates["class"].value_counts()

LuNA 16 contains 1557 nodules and 753418 not nodules.
Nodules are 0.21%.


class
0    753418
1      1557
Name: count, dtype: int64

In [42]:
n_candidates = len(candidates)
n_unique_candidates_seriesuids = len(candidates.seriesuid.unique())
print(
    f"Candidates dataframe has {n_candidates} rows and {n_unique_candidates_seriesuids} unique series uid values."
)

Candidates dataframe has 754975 rows and 888 unique series uid values.


In [43]:
series_uid_values_count = candidates["seriesuid"].value_counts()
n_same_seriesuid_mean_for_candidates = series_uid_values_count.mean()
print(
    f"For {len(candidates)} number of candidates, the average number "
    f"of rows with same seriesuid is {n_same_seriesuid_mean_for_candidates:.2f}."
)
series_uid_values_count

For 754975 number of candidates, the average number of rows with same seriesuid is 850.20.


seriesuid
1.3.6.1.4.1.14519.5.2.1.6279.6001.652347820272212119124022644822    2066
1.3.6.1.4.1.14519.5.2.1.6279.6001.167237290696350215427953159586    1973
1.3.6.1.4.1.14519.5.2.1.6279.6001.168737928729363683423228050295    1828
1.3.6.1.4.1.14519.5.2.1.6279.6001.241083615484551649610616348856    1723
1.3.6.1.4.1.14519.5.2.1.6279.6001.200725988589959521302320481687    1713
                                                                    ... 
1.3.6.1.4.1.14519.5.2.1.6279.6001.219349715895470349269596532320     215
1.3.6.1.4.1.14519.5.2.1.6279.6001.333319057944372470283038483725     208
1.3.6.1.4.1.14519.5.2.1.6279.6001.153536305742006952753134773630     207
1.3.6.1.4.1.14519.5.2.1.6279.6001.608029415915051219877530734559     185
1.3.6.1.4.1.14519.5.2.1.6279.6001.397202838387416555106806022938      66
Name: count, Length: 888, dtype: int64

# Concat Annotations  and Candidates

In [44]:
annotations_malignancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1182 entries, 0 to 1181
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   seriesuid          1182 non-null   object 
 1   coord_x            1182 non-null   float64
 2   coord_y            1182 non-null   float64
 3   coord_z            1182 non-null   float64
 4   diameter_mm        1182 non-null   float64
 5   is_malignant       1182 non-null   bool   
 6   malignant_details  1182 non-null   object 
 7   bboxLowX           1182 non-null   float64
 8   bboxLowY           1182 non-null   float64
 9   bboxLowZ           1182 non-null   float64
 10  bboxHighX          1182 non-null   float64
 11  bboxHighY          1182 non-null   float64
 12  bboxHighZ          1182 non-null   float64
 13  len_mal_details    1182 non-null   int64  
 14  is_nodule          1182 non-null   bool   
 15  is_annotated       1182 non-null   bool   
dtypes: bool(3), float64(10),

In [45]:
candidates["diameter_mm"] = 0.0
candidates["is_nodule"] = False
candidates["is_annotated"] = False
candidates["is_malignant"] = False
candidates.rename(
    columns={"coordX": "coord_x", "coordY": "coord_y", "coordZ": "coord_z"},
    inplace=True,
)
not_nodule_candidates = candidates[
    [
        "seriesuid",
        "diameter_mm",
        "is_nodule",
        "is_annotated",
        "is_malignant",
        "coord_x",
        "coord_y",
        "coord_z",
    ]
]
not_nodule_candidates

Unnamed: 0,seriesuid,diameter_mm,is_nodule,is_annotated,is_malignant,coord_x,coord_y,coord_z
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0.0,False,False,False,68.420000,-74.480000,-288.700000
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0.0,False,False,False,-95.209361,-91.809406,-377.426350
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0.0,False,False,False,-24.766755,-120.379294,-273.361539
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0.0,False,False,False,-63.080000,-65.740000,-344.240000
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,0.0,False,False,False,52.946688,-92.688873,-241.067872
...,...,...,...,...,...,...,...,...
754970,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,0.0,False,False,False,-33.400000,-64.200000,-115.560000
754971,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,0.0,False,False,False,56.236359,70.352400,-203.446236
754972,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,0.0,False,False,False,-97.104221,55.738289,-203.879785
754973,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,0.0,False,False,False,-65.470000,59.670000,-136.370000


In [46]:
annotations_malignancy["is_nodule"] = True
annotations_malignancy["is_annotated"] = True
nodule_annotations_malignancy = annotations_malignancy[
    [
        "seriesuid",
        "diameter_mm",
        "is_nodule",
        "is_annotated",
        "is_malignant",
        "coord_x",
        "coord_y",
        "coord_z",
    ]
]
nodule_annotations_malignancy

Unnamed: 0,seriesuid,diameter_mm,is_nodule,is_annotated,is_malignant,coord_x,coord_y,coord_z
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,5.651471,True,True,True,-128.699421,-175.319272,-298.387506
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.265775376735...,4.441200,True,True,False,-69.175741,218.052625,-668.501273
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.276556509002...,7.744222,True,True,True,-67.459503,-247.744211,-211.095340
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.276556509002...,6.690036,True,True,False,-97.690876,-134.068054,-187.094883
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.276710697414...,10.824515,True,True,False,69.529141,87.036153,-149.190113
...,...,...,...,...,...,...,...,...
1177,1.3.6.1.4.1.14519.5.2.1.6279.6001.299767339686...,11.801797,True,True,False,-57.725518,73.779494,-147.415580
1178,1.3.6.1.4.1.14519.5.2.1.6279.6001.299767339686...,4.087102,True,True,False,-72.287990,-77.481968,-160.486254
1179,1.3.6.1.4.1.14519.5.2.1.6279.6001.299767339686...,6.326436,True,True,False,96.625954,-46.154823,-88.229432
1180,1.3.6.1.4.1.14519.5.2.1.6279.6001.297433269262...,10.759349,True,True,False,33.104795,-63.116434,-108.725571


In [47]:
complete_candidates = pd.concat([not_nodule_candidates, nodule_annotations_malignancy])
complete_candidates.info()

<class 'pandas.core.frame.DataFrame'>
Index: 756157 entries, 0 to 1181
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   seriesuid     756157 non-null  object 
 1   diameter_mm   756157 non-null  float64
 2   is_nodule     756157 non-null  bool   
 3   is_annotated  756157 non-null  bool   
 4   is_malignant  756157 non-null  bool   
 5   coord_x       756157 non-null  float64
 6   coord_y       756157 non-null  float64
 7   coord_z       756157 non-null  float64
dtypes: bool(3), float64(4), object(1)
memory usage: 36.8+ MB


In [48]:
complete_candidates.value_counts(["is_nodule"])

is_nodule
False        754975
True           1182
Name: count, dtype: int64

In [49]:
complete_candidates.value_counts(["is_annotated"])

is_annotated
False           754975
True              1182
Name: count, dtype: int64

In [50]:
complete_candidates.value_counts(["is_malignant"])

is_malignant
False           755779
True               378
Name: count, dtype: int64

In [51]:
diameter_col = complete_candidates["diameter_mm"]
diameter_col.describe()

count    756157.000000
mean          0.012966
std           0.377756
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          32.270030
Name: diameter_mm, dtype: float64

In [52]:
series_uid_values_count = complete_candidates.value_counts("seriesuid")
n_same_seriesuid_mean_for_complete_candidates = series_uid_values_count.mean()
print(
    f"For {len(complete_candidates)} number of complete candidates, the average number "
    f"of rows with same seriesuid is {n_same_seriesuid_mean_for_complete_candidates:.2f}."
)
series_uid_values_count

For 756157 number of complete candidates, the average number of rows with same seriesuid is 851.53.


seriesuid
1.3.6.1.4.1.14519.5.2.1.6279.6001.652347820272212119124022644822    2067
1.3.6.1.4.1.14519.5.2.1.6279.6001.167237290696350215427953159586    1975
1.3.6.1.4.1.14519.5.2.1.6279.6001.168737928729363683423228050295    1829
1.3.6.1.4.1.14519.5.2.1.6279.6001.241083615484551649610616348856    1724
1.3.6.1.4.1.14519.5.2.1.6279.6001.200725988589959521302320481687    1715
                                                                    ... 
1.3.6.1.4.1.14519.5.2.1.6279.6001.219349715895470349269596532320     216
1.3.6.1.4.1.14519.5.2.1.6279.6001.153536305742006952753134773630     212
1.3.6.1.4.1.14519.5.2.1.6279.6001.333319057944372470283038483725     208
1.3.6.1.4.1.14519.5.2.1.6279.6001.608029415915051219877530734559     185
1.3.6.1.4.1.14519.5.2.1.6279.6001.397202838387416555106806022938      66
Name: count, Length: 888, dtype: int64

In [55]:
complete_candidates_path = settings.DATA_DIR / "complete_candidates.csv"
complete_candidates.to_csv(complete_candidates_path, index=False)