# Исходные данные 

In [1]:
import os

import re
import json

import numpy as np
import pandas as pd 

import pydicom

import matplotlib.pyplot as plt

In [2]:
# Root folder of the published dataset. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
basedir = r"E:\data\For_Publication_v3"

### Протоколы КТ-исследований. Загрузим информацию об обнаруженных узлах

In [3]:
# Folder under the dataset root that holds the protocol JSON files.
protocols_dir = basedir + "\\Protocols"

In [4]:
# Full path of every entry found in the protocols folder.
protocol_names = os.listdir(protocols_dir)
protocol_files = [f'{protocols_dir}\\{name}' for name in protocol_names]
len(protocol_files)

541

In [5]:
def read_json(filename, encoding='utf-8-sig'):
    """Load a JSON document from disk and return the decoded Python object.

    Parameters
    ----------
    filename : path of the file to read.
    encoding : text encoding used to open the file. The default
        ``utf-8-sig`` codec skips a leading UTF-8 byte-order mark if present.

    Returns
    -------
    Whatever ``json`` decodes the file content to (dict, list, ...).
    """
    with open(filename, mode="r", encoding=encoding) as handle:
        return json.load(handle)

In [6]:
# Row accumulators: one tuple per entry of a protocol's 'doctors' list and
# one tuple per non-null nodule record.
protocols_common_rows = []
protocols_positions_rows = []

# Keys copied from a nodule record and from its first 'expert decision'
# entry, listed in the order they appear in the output row.
nodule_keys = ('diameter (mm)', 'x', 'y', 'z', 'z type')
decision_keys = ('comment', 'decision', 'id', 'machine learning', 'type')

for pfile in protocol_files:
    protocol_json = read_json(pfile)

    for comment_json in protocol_json['doctors']:
        ids = protocol_json['ids']
        common_row = (
            ids['study id'],
            comment_json['id'],
            comment_json['comment'],
            ids['accession number'],
        )
        protocols_common_rows.append(common_row)

    # 'nodules' may be null/empty; only the first element of each group is read.
    for nodules_json_arr in (protocol_json['nodules'] or []):
        nodules_json = nodules_json_arr[0]

        for nodule_id, nodule in nodules_json.items():
            if nodule is None:
                continue

            ids = protocol_json['ids']
            study_id = ids['study id']
            measurements = tuple(nodule[key] for key in nodule_keys)
            # Only the first expert decision of the nodule is exported.
            expert = nodule['expert decision'][0]
            decision = tuple(expert[key] for key in decision_keys)
            protocols_positions_rows.append(
                (study_id, nodule_id, *measurements, *decision, ids['accession number'])
            )

In [7]:
# Column labels for the per-doctor rows, in the order the tuples were built.
# NOTE(review): the second field is filled from each 'doctors' entry's 'id',
# yet the column is labelled 'nodule_id' - confirm the intended name.
col_names = (
    'study_id',
    'nodule_id',
    'comment',
    'accession_number',
)
protocols_common = pd.DataFrame(protocols_common_rows, columns=col_names)
protocols_common.head()

Unnamed: 0,study_id,nodule_id,comment,accession_number
0,RLS5A09001KDC6-K00008714,4,Нет,RLAD31D006-11315
1,RLS5A09001KDC6-K00008714,9,Нет,RLAD31D006-11315
2,RLS5A09001KDC6-K00008714,12,,RLAD31D006-11315
3,RLS5A09001KDC6-K00008714,0,Нет очагов,RLAD31D006-11315
4,RLS5A09001KDC6-K00008714,2,Нет,RLAD31D006-11315


In [8]:
# Column labels for the per-nodule rows, in the order the tuples were built:
# identifiers, size and position, then the fields of the first expert decision.
col_names = (
    'study_id', 'nodule_id',
    'diameter', 'x', 'y', 'z', 'z_type',
    'comment', 'decision', 'id', 'ml', 'type',
    'accession_number',
)
nodules_positions = pd.DataFrame(protocols_positions_rows, columns=col_names)
nodules_positions.head(3)

Unnamed: 0,study_id,nodule_id,diameter,x,y,z,z_type,comment,decision,id,ml,type,accession_number
0,RLS5A09001KDC6-K00008714,2,6.0,400.0,281.0,1698.3,mm,неверный размер,confirmed_partially,СВЕ,False,м,RLAD31D006-11315
1,RLS5A09001KDC6-K00008714,2,6.0,417.0,251.0,1632.7,mm,"несовпадение типа (п), неверный размер",confirmed_partially,СВЕ,True,п,RLAD31D006-11315
2,RLS5A09001KDC6-K00008714,2,4.0,422.0,312.0,1594.3,mm,неверный размер,confirmed_partially,СВЕ,True,с,RLAD31D006-11315


### Результаты КТ. Разберем файлы и соберем информацию о конкретных снимках

In [9]:
# The only DICOM attributes requested from pydicom when a slice file is read.
specific_tags_list = ['SeriesInstanceUID', 'SeriesDescription', 'ImagePositionPatient', 'PixelSpacing', 'SliceLocation']

def read_plans_tags(accession_number, study_id):
    """Read the selected header tags of every DICOM file of one study.

    Files are taken from the ``CT`` folder of
    ``Dicom/<accession_number>_<study_id>`` under ``basedir``, inside the
    first sub-directory that ``os.listdir`` reports there.

    Parameters
    ----------
    accession_number : accession number part of the study folder name.
    study_id : study id part of the study folder name.

    Returns
    -------
    list of ``(file_path, dataset)`` tuples, one per file in that directory;
    ``dataset`` is read with ``specific_tags_list`` and without pixel data.

    Raises
    ------
    IndexError if the ``CT`` folder is empty; ``os.listdir`` errors propagate
    if a folder is missing.
    """
    dcm_base_dir = f"{basedir}\\Dicom\\{accession_number}_{study_id}\\CT"
    # NOTE(review): os.listdir order is arbitrary, so if a CT folder holds
    # several sub-directories the one picked here is not guaranteed - confirm
    # there is always exactly one.
    dcm_dir = f"{dcm_base_dir}\\{os.listdir(dcm_base_dir)[0]}"
    dcm_files = [f"{dcm_dir}\\{file_name}" for file_name in os.listdir(dcm_dir)]
    # dcmread replaces the deprecated pydicom.read_file (removed in pydicom 3.0).
    return [
        (file, pydicom.dcmread(file, specific_tags=specific_tags_list, stop_before_pixels=True))
        for file in dcm_files
    ]

In [10]:
# Unique (study id, accession number) pairs found in the protocols.
study_key_columns = ['study_id', 'accession_number']
studies = protocols_common.loc[:, study_key_columns].drop_duplicates()

In [11]:
# Sanity check: accession number of the first study.
studies['accession_number'].iloc[0]

'RLAD31D006-11315'

In [12]:
%%time
plan_instance_rows = []
for index, row in [x for x in studies.iterrows()]:
    plans = read_plans_tags(row['accession_number'], row['study_id'])
    for (file, plan) in plans:
        plan_tags_dict = dict([(tag.name, tag.value) for tag in plan])         
        x, y, z = plan['ImagePositionPatient'].value
        plan_instance_rows.append((row['study_id'], plan['SeriesDescription'].value, z, plan['SliceLocation'].value, plan['SeriesInstanceUID'].value, file))

Wall time: 57min 45s


In [13]:
# Column labels matching the tuples collected for each DICOM file.
plans_col_names = (
    'study_id',
    'desc',
    'z_mm',
    'location',
    'instance_uid',
    'dcm_path',
)
plan_instances_unsorted = pd.DataFrame(plan_instance_rows, columns=plans_col_names)
# Order slices by study and then by z position.
plan_instances = plan_instances_unsorted.sort_values(by=['study_id', 'z_mm'])
plan_instances.head(3)

Unnamed: 0,study_id,desc,z_mm,location,instance_uid,dcm_path
2744,RLS5A09001KDC6-K00004273,Inspirat. Body 5.0 Inspirat.,1660.5,300.0,1.2.276.0.7230010.3.1.3.1417964692.10880.15687...,E:\data\For_Publication_v3\Dicom\RLAD31D006-12...
2813,RLS5A09001KDC6-K00004273,Inspirat. Lung 5.0 Inspirat.,1660.5,300.0,1.2.276.0.7230010.3.1.3.1417964692.10880.15687...,E:\data\For_Publication_v3\Dicom\RLAD31D006-12...
3183,RLS5A09001KDC6-K00004273,Inspirat. Body 1.0 Inspirat.,1660.5,300.0,1.2.276.0.7230010.3.1.3.1417964692.10880.15687...,E:\data\For_Publication_v3\Dicom\RLAD31D006-12...


### Сохраним данные

In [14]:
# Create the output folder first: to_csv does not create missing directories,
# so a fresh checkout would otherwise fail here.
os.makedirs('data', exist_ok=True)

# Write the three tables as CSV without the index column. File names are kept
# as they were (no extension).
protocols_common.to_csv('data/protocols_common', index=False)
nodules_positions.to_csv('data/nodules_positions', index=False)
plan_instances.to_csv("data/plan_instances", index=False)