In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the candidates table from the CSV file under data/.
candidates = pd.read_csv("data/candidates.csv")

In [3]:
# Summarize the candidates table: column names, dtypes and non-null counts.
candidates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551065 entries, 0 to 551064
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   seriesuid  551065 non-null  object 
 1   coordX     551065 non-null  float64
 2   coordY     551065 non-null  float64
 3   coordZ     551065 non-null  float64
 4   class      551065 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 21.0+ MB


In [4]:
# Preview the first rows of the candidates table.
candidates.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0


In [5]:
# Show the distinct values that occur in the "class" column.
candidates["class"].unique()

array([0, 1])

In [6]:
# Display only the candidate rows whose "class" value is 1.
candidates.loc[candidates["class"].eq(1)]

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
13,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,104.164804,-211.685591,-227.011364,1
78,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.940000,-175.040000,-297.870000,1
1303,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,70.190000,-140.930000,877.680000,1
3050,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,89.320000,190.840000,-516.820000,1
3052,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,94.930000,153.030000,-429.910000,1
...,...,...,...,...,...
548674,1.3.6.1.4.1.14519.5.2.1.6279.6001.979083010707...,-102.840925,57.880915,-124.815520,1
550171,1.3.6.1.4.1.14519.5.2.1.6279.6001.994459772950...,-161.330000,-29.090000,-269.290000,1
550334,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,-23.719297,31.714489,-156.172233,1
550810,1.3.6.1.4.1.14519.5.2.1.6279.6001.997611074084...,43.129934,74.326554,-200.283918,1


In [7]:
# Load the annotations table from the CSV file under data/.
annotations = pd.read_csv("data/annotations.csv")

In [8]:
# Summarize the annotations table: column names, dtypes and non-null counts.
annotations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   seriesuid    1186 non-null   object 
 1   coordX       1186 non-null   float64
 2   coordY       1186 non-null   float64
 3   coordZ       1186 non-null   float64
 4   diameter_mm  1186 non-null   float64
dtypes: float64(4), object(1)
memory usage: 46.5+ KB


In [9]:
# Preview the first rows of the annotations table.
annotations.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,diameter_mm
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.699421,-175.319272,-298.387506,5.651471
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.783651,-211.925149,-227.12125,4.224708
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,69.639017,-140.944586,876.374496,5.786348
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,-24.013824,192.102405,-391.081276,8.143262
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,2.441547,172.464881,-405.493732,18.54515


In [10]:
from collections import namedtuple

In [11]:
# Lightweight record describing one candidate location. Field order matters:
# instances compare (and therefore sort) field by field in this order.
CandidateInfoTuple = namedtuple(
    "CandidateInfoTuple",
    ["isNodule_bool", "diameter_mm", "series_uid", "center_xyz"],
)

CandidateInfoTuple._fields

('isNodule_bool', 'diameter_mm', 'series_uid', 'center_xyz')

In [12]:
import os
import csv
import glob
import functools

In [13]:
@functools.lru_cache(1)
def getCandidateInfoList(requireOnDisk_bool=True):
    """Build the list of candidates, sorted in descending tuple order.

    Reads ``data/annotations.csv`` and ``data/candidates.csv`` (paths are
    relative to the current working directory) and produces one
    ``CandidateInfoTuple`` per candidate row. A candidate receives the
    diameter of the first annotation of the same series whose center is
    within a quarter of that annotation's diameter along each of the three
    axes; if no annotation qualifies, its diameter is ``0.0``.

    Args:
        requireOnDisk_bool: When true, candidates whose series has no
            matching ``data/subset*/<series_uid>.mhd`` file are skipped.

    Returns:
        A list of ``CandidateInfoTuple`` sorted with ``reverse=True``, so
        rows flagged as nodules come first, ordered by decreasing diameter.

    Note:
        The result is memoized by ``functools.lru_cache``: repeated calls
        with the same argument return the *same* list object, so callers
        should treat it as read-only.
    """
    # The directory scan is only needed when we actually filter on it.
    presentOnDisk_set = set()
    if requireOnDisk_bool:
        presentOnDisk_set = {
            os.path.split(p)[-1][:-4] for p in glob.glob("data/subset*/*.mhd")
        }

    # series_uid -> [(center_xyz, diameter_mm), ...]
    diameter_dict = {}
    with open("data/annotations.csv", "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row without loading the file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            series_uid = row[0]
            annotationCenter_xyz = tuple(float(x) for x in row[1:4])
            annotationDiameter_mm = float(row[4])
            diameter_dict.setdefault(series_uid, []).append(
                (annotationCenter_xyz, annotationDiameter_mm)
            )

    candidateInfo_list = []
    with open("data/candidates.csv", "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue
            series_uid = row[0]

            if requireOnDisk_bool and series_uid not in presentOnDisk_set:
                continue
            isNodule_bool = bool(int(row[4]))
            candidateCenter_xyz = tuple(float(x) for x in row[1:4])

            candidateDiameter_mm = 0.0
            for annotationCenter_xyz, annotationDiameter_mm in diameter_dict.get(series_uid, []):
                tolerance_mm = annotationDiameter_mm / 4
                # A match requires that no axis is farther away than the tolerance.
                too_far = any(
                    abs(c - a) > tolerance_mm
                    for c, a in zip(candidateCenter_xyz, annotationCenter_xyz)
                )
                if not too_far:
                    candidateDiameter_mm = annotationDiameter_mm
                    break

            candidateInfo_list.append(CandidateInfoTuple(
                isNodule_bool,
                candidateDiameter_mm,
                series_uid,
                candidateCenter_xyz
            ))

    candidateInfo_list.sort(reverse=True)
    return candidateInfo_list
                

In [14]:
# Build the candidate list, then print its length followed by its first four entries.
x = getCandidateInfoList()
print(len(x), x[:4], sep="\n")

110143
[CandidateInfoTuple(isNodule_bool=True, diameter_mm=32.27003025, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886', center_xyz=(67.61451718, 85.02525992, -109.8084416)), CandidateInfoTuple(isNodule_bool=True, diameter_mm=25.23320204, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.511347030803753100045216493273', center_xyz=(63.4740118048, 73.9174523314, -213.736128767)), CandidateInfoTuple(isNodule_bool=True, diameter_mm=23.35064438, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192', center_xyz=(57.42, 81.14, -118.09)), CandidateInfoTuple(isNodule_bool=True, diameter_mm=23.35064438, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636438705059720603192', center_xyz=(56.4889724157, 85.9418105037, -115.9731945))]


In [16]:
import SimpleITK as sitk

In [19]:
class Ct:
    """Voxel data of one CT series loaded from a ``.mhd`` file.

    Attributes are set in ``__init__``:
        series_uid: the identifier passed in by the caller.
        hu_a: float32 NumPy array of the image values, limited to the
            range [-1000, 1000] (presumably Hounsfield units, going by the
            name; confirm against the data source).
    """

    def __init__(self, series_uid):
        """Load the first ``data/subset*/<series_uid>.mhd`` file found.

        Raises:
            IndexError: if no file matches the pattern.
        """
        matches = glob.glob(f"data/subset*/{series_uid}.mhd")
        image = sitk.ReadImage(matches[0])

        voxels = np.array(sitk.GetArrayFromImage(image), dtype=np.float32)
        # Clamp in place so no second full-size array is allocated.
        np.clip(voxels, -1000, 1000, out=voxels)

        self.series_uid = series_uid
        self.hu_a = voxels