In [None]:
# Install GDCM from conda-forge (-y skips the confirmation prompt).
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm.
!conda install gdcm -c conda-forge -y

In [None]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm

In [None]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixel data rescaled to 8-bit.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Location of the DICOM file, passed straight to pydicom.
        voi_lut: When true, run the pixel data through ``apply_voi_lut``.
        fix_monochrome: When true, invert the image if its
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        A ``np.uint8`` array with the same shape as the pixel data, stretched
        so that the minimum maps to 0 and the maximum to 255. A constant image
        comes back as all zeros.
    """
    # dcmread is the current name of the reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then stretch to the full 0..1 range.
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0; skip the division instead of producing 0/0 = NaN.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn an array into a PIL image bounded by a ``size`` x ``size`` box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        size: Edge length, in pixels, of the square target box.
        keep_ratio: If true, use ``Image.thumbnail`` so the aspect ratio is
            preserved; otherwise force the image to exactly ``size`` x ``size``.
        resample: Resampling filter forwarded to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    target = (size, size)
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(target, resample)

    # thumbnail() modifies the image in place and returns None
    picture.thumbnail(target, resample)
    return picture

## Pseudo-labeling

In [None]:
# Load a previous submission file; its predictions are the input for the
# pseudo-labeling steps below.
psedo = pd.read_csv('../input/model-617-sub/submission.csv')

In [None]:
# Preview the first rows of the loaded submission.
psedo.head()

In [None]:
# Show the space-separated tokens of the first row's PredictionString, to see
# at which positions the values of interest sit.
psedo['PredictionString'][0].split(' ')

In [None]:
# Probe a single row: pull the four values at token positions 1, 7, 13 and 19
# of its PredictionString (presumably the per-class confidence scores).
tokens = psedo['PredictionString'][1].split(' ')
# Convert to float first: np.argmax on strings compares text, so e.g. '1e-05'
# would rank above '0.9'.
prob = [float(tokens[pos]) for pos in (1, 7, 13, 19)]
print(prob)
print(np.argmax(prob))

In [None]:
# For every row whose id ends in the token 'study', record the index (0-3) of the
# largest of the four values found at token positions 1, 7, 13 and 19.
score_positions = (1, 7, 13, 19)

prob = []
for sub_id, pred_string in zip(psedo['id'], psedo['PredictionString']):
    if sub_id.split('_')[-1] != 'study':
        continue
    tokens = pred_string.split(' ')
    # Compare as numbers, not as text: string comparison mis-orders values
    # written in scientific notation or with different digit counts.
    scores = [float(tokens[pos]) for pos in score_positions]
    prob.append(np.argmax(scores))

In [None]:
# Count how many rows were assigned to each argmax index.
pd.Series(prob).value_counts()

In [None]:
# Flag each row: 1 when the last '_'-separated token of the id is 'study', else 0.
if_study = [
    1 if sub_id.split('_')[-1] == 'study' else 0
    for sub_id in psedo['id']
]

In [None]:
# Work on a copy so the loaded submission frame itself is left untouched.
psedo_copy = psedo.copy()

In [None]:
# Add the 0/1 study flag as a column and preview the result.
psedo_copy = psedo_copy.assign(if_study=if_study)
psedo_copy.head()

In [None]:
# Keep only the rows flagged as study-level. Take an explicit copy: this frame is
# modified afterwards, and modifying a filtered slice is ambiguous in pandas.
psedo_copy_study_only = psedo_copy[psedo_copy['if_study'] == 1].copy()

In [None]:
# Preview the study-level subset.
psedo_copy_study_only.head()

In [None]:
# The flag column is no longer needed after filtering. Rebind instead of dropping
# in place, and tolerate an already-missing column so this cell can be re-run.
psedo_copy_study_only = psedo_copy_study_only.drop(columns=['if_study'], errors='ignore')

In [None]:
# Attach the per-row argmax index as column 'prob'. assign() returns a new frame,
# which avoids writing a column into what may still be a filtered slice.
psedo_copy_study_only = psedo_copy_study_only.assign(prob=prob)

In [None]:
# The raw prediction text is no longer needed once 'prob' is attached. Rebind instead
# of dropping in place, and tolerate a missing column so the cell can be re-run.
psedo_copy_study_only = psedo_copy_study_only.drop(columns=['PredictionString'], errors='ignore')
psedo_copy_study_only.head()

In [None]:
# One-hot encode the argmax index stored in 'prob'. The indices are integers, so the
# rename mapping needs integer keys (string keys such as "0" never match a column).
class_by_index = {
    0: "Negative for Pneumonia",
    1: "Typical Appearance",
    2: "Indeterminate Appearance",
    3: "Atypical Appearance",
}
ohe_hot = (
    # dtype=int keeps the output as 0/1 regardless of the pandas default dummy dtype
    pd.get_dummies(psedo_copy_study_only['prob'], dtype=int)
    .rename(columns=class_by_index)
    # guarantee all four columns, in a fixed order, even if an index never occurs
    .reindex(columns=list(class_by_index.values()), fill_value=0)
)
ohe_hot.head()

In [None]:
# Bring the one-hot table into its final layout: 'id' followed by the four label columns.
label_columns = ["Negative for Pneumonia", "Typical Appearance",
                 "Indeterminate Appearance", "Atypical Appearance"]
ohe_hot = (
    ohe_hot
    # map integer dummy columns 0..3 to their names (a no-op if already named)
    .rename(columns=dict(enumerate(label_columns)))
    # select by name rather than by position: a missing label becomes an all-zero
    # column instead of raising or shifting the names, and a stale 'id' is dropped
    .reindex(columns=label_columns, fill_value=0)
    .astype(int)
    # aligned on the index shared with the study-level frame
    .assign(id=psedo_copy_study_only['id'])
)
ohe_hot = ohe_hot[['id'] + label_columns]
ohe_hot.head()

In [None]:
# Write the one-hot label table to the working directory, without the index column.
ohe_hot.to_csv('617_Pseudo_Labelling.csv', index=False)

In [None]:
# Load the competition's study-level training labels and preview them.
train = pd.read_csv('../input/siim-covid19-detection/train_study_level.csv')
train.head()

In [None]:
# Stack the one-hot label table under the training labels and save the combined
# table. The merged frame is what gets written, not the label table on its own.
result = pd.concat([train, ohe_hot], ignore_index=True)
result.to_csv('merge_617_Pseudo_Labelling.csv', index=False)

In [None]:
# Load one sample DICOM file for inspection.
path = '../input/siim-covid19-detection/train/ae3e63d94c13/288554eb6182/e00f9fe0cce5.dcm'
# dcmread is the current name of the reader; read_file is its deprecated alias.
dicom = pydicom.dcmread(path)

In [None]:
# Convert every DICOM file of the split to a 512x512 PNG named <file stem>_image.png.
split = 'train'
# makedirs creates missing parent directories, so one call for the leaf is enough
save_dir = f'/kaggle/tmp/{split}/image/'
os.makedirs(save_dir, exist_ok=True)

for dirname, _, filenames in tqdm(os.walk(f'../input/siim-covid19-detection/{split}')):
    for file in filenames:
        stem, ext = os.path.splitext(file)
        # skip anything that is not a DICOM file instead of failing in read_xray
        if ext.lower() != '.dcm':
            continue
        # set keep_ratio=True to have original aspect ratio
        xray = read_xray(os.path.join(dirname, file))
        im = resize(xray, size=512)
        # build the name from the stem so only the extension is replaced
        im.save(os.path.join(save_dir, stem + '_image.png'))



In [None]:
# Convert the split's DICOM files to 600x600 PNGs named after the directory two
# levels above each file: <that directory>_study.png.
split = 'train'
save_dir = f'/kaggle/tmp/{split}/study/'
os.makedirs(save_dir, exist_ok=True)

for dirname, _, filenames in tqdm(os.walk(f'../input/siim-covid19-detection/{split}')):
    # The output name depends only on the directory, so compute it once per directory.
    # NOTE: all files that share this grandparent directory map to the same PNG,
    # so a later file overwrites an earlier one.
    study = dirname.split('/')[-2] + '_study.png'
    for file in filenames:
        # skip anything that is not a DICOM file instead of failing in read_xray
        if not file.lower().endswith('.dcm'):
            continue
        # set keep_ratio=True to have original aspect ratio
        xray = read_xray(os.path.join(dirname, file))
        im = resize(xray, size=600)
        im.save(os.path.join(save_dir, study))


In [None]:
%%time
# Pack the two PNG output directories into gzip-compressed tarballs in the working
# directory; -C switches into the directory first so the archive holds relative paths.
!tar -zcf image.tar.gz -C "/kaggle/tmp/train/image/" .
!tar -zcf study.tar.gz -C "/kaggle/tmp/train/study/" .

In [None]:
# List the working directory to check which files have been written.
!ls

In [None]:
# Archive the test-split study PNGs.
# NOTE(review): no visible cell creates /kaggle/tmp/test/study/ (the export cells
# only use split = 'train'); this fails on a fresh run unless they are re-run with
# split = 'test' first — confirm.
!tar -zcf test_study.tar.gz -C "/kaggle/tmp/test/study/" .

In [None]:
# Archive the train-split study PNGs under an explicit name. This packs the same
# directory as study.tar.gz.
!tar -zcf train_study.tar.gz -C "/kaggle/tmp/train/study/" .