<a href="https://colab.research.google.com/github/viveksahukar/sih/blob/main/dicom_to_png.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook details how the DICOM files are converted to PNG images that are used as input to a CNN.

In [None]:
# Import required libraries
from vs165_modules import *

## Reading axial slices first...

In [None]:
# Load the axial series master list.
ax = pd.read_csv(r"ax_t1spgr+c_series_master_copy.csv")

# label is 1 when 'dirpath' contains "Positive" (case-insensitive) and 0 otherwise;
# 'dirpath' is then exposed under the name 'fpath'.
ax = (
    ax
    .assign(label=lambda frame: np.where(
        frame['dirpath'].str.contains('Positive', case=False, regex=True), 1, 0))
    .rename(columns={'dirpath': 'fpath'})
)

In [None]:
# Reading dicom files for each series, selecting a particular slice and saving them for each Patient in a new DataFrame
columns = ['PatientID', 'SOPInstanceUID', 'InstanceNumber']
# Slice selection heuristic - gives one slice that contains most important feature for SIH
AXIAL_SLICE_FRACTION = 0.7

selected_frames = []
for index, row in ax.iterrows():
    inputdir = row.fpath
    # Collect one record per DICOM file and build the per-series frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for f in os.listdir(inputdir):
        # Only header tags are needed here, so the pixel data is not loaded.
        ds = pydicom.dcmread(os.path.join(inputdir, f), stop_before_pixels=True)
        records.append(dict(zip(columns, [ds.PatientID, ds.SOPInstanceUID, ds.InstanceNumber])))
    # Passing `columns` keeps the frame well-formed even when the folder is empty.
    series_df = pd.DataFrame(records, columns=columns)
    img_count = len(series_df)
    target = math.floor(img_count * AXIAL_SLICE_FRACTION)
    series_df = series_df[series_df['InstanceNumber'] == target]
    if not series_df.empty:
        selected_frames.append(series_df)

# Row indices are kept as they were within each series (no ignore_index), as before.
series_df_all = pd.concat(selected_frames) if selected_frames else pd.DataFrame(columns=columns)

In [None]:
# Check for any mismatch in keys during merging of the above two dataframes.
# These are the 2 patient id - 1 each in positive and negative case, where the slices in the
# series folder have duplicate Instance Number and hence when floor(image_count * 0.7) is taken,
# the value is outside the range of InstanceNumber.
ax2 = ax.merge(series_df_all, how='outer', indicator='True')
# Keep only the rows that did not match on both sides, so that info() below summarises
# the mismatches rather than the whole merge (same pattern as the coronal check).
ax2 = ax2[ax2['True'] != 'both']
ax2.info()
# For now leaving these two PatientID, putting them in test set and proceed as usual

In [None]:
# Inner-join the series table with the selected slices on PatientID, then derive the
# path of each selected DICOM file as <fpath>/<SOPInstanceUID>.dcm
ax_new = (
    ax
    .merge(series_df_all, how='inner', on='PatientID')
    .assign(full_fpath=lambda frame: frame['fpath'] + '/' + frame['SOPInstanceUID'] + '.dcm')
)

In [None]:
# Persist ax_new to ax_new.csv (index column not written) so the steps up to here need not be repeated
ax_new.to_csv(r'ax_new.csv', index=False)

## Now reading coronal slices - similar method as for axial slices....

In [None]:
# Load the coronal series master list.
cor = pd.read_csv(r"cor_t1spgr+c_series_master_copy.csv")

# label is 1 when 'dirpath' contains "Positive" (case-insensitive) and 0 otherwise;
# 'dirpath' is then exposed under the name 'fpath'.
cor = (
    cor
    .assign(label=lambda frame: np.where(
        frame['dirpath'].str.contains('Positive', case=False, regex=True), 1, 0))
    .rename(columns={'dirpath': 'fpath'})
)

In [None]:
# Reading dicom files for each series, selecting particular slices and saving them for each Patient in a new DataFrame
columns = ['PatientID', 'SOPInstanceUID', 'InstanceNumber']
# Slice selection heuristic - We are choosing 2 slices for each series
CORONAL_SLICE_FRACTIONS = (0.3, 0.625)

selected_frames_cor = []
for index, row in cor.iterrows():
    inputdir = row.fpath
    # Collect one record per DICOM file and build the per-series frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for f in os.listdir(inputdir):
        # Only header tags are needed here, so the pixel data is not loaded.
        ds = pydicom.dcmread(os.path.join(inputdir, f), stop_before_pixels=True)
        records.append(dict(zip(columns, [ds.PatientID, ds.SOPInstanceUID, ds.InstanceNumber])))
    # Passing `columns` keeps the frame well-formed even when the folder is empty.
    series_df_cor = pd.DataFrame(records, columns=columns)
    img_count = len(series_df_cor)
    target1 = math.floor(img_count * CORONAL_SLICE_FRACTIONS[0])
    target2 = math.floor(img_count * CORONAL_SLICE_FRACTIONS[1])
    series_df_cor = series_df_cor[(series_df_cor['InstanceNumber'] == target1) | (series_df_cor['InstanceNumber'] == target2)]
    if not series_df_cor.empty:
        selected_frames_cor.append(series_df_cor)

# Row indices are kept as they were within each series (no ignore_index), as before.
series_df_all_cor = pd.concat(selected_frames_cor) if selected_frames_cor else pd.DataFrame(columns=columns)

In [None]:
# Outer-merge the coronal series table with the selected slices and keep only the rows
# whose indicator column 'True' is not 'both', i.e. the keys that failed to match.
cor2 = (
    cor
    .merge(series_df_all_cor, how='outer', indicator='True')
    .loc[lambda merged: merged['True'] != 'both']
)
cor2.info()
# All good....

In [None]:
# Merging the coronal series table with the selected slices, mirroring how ax_new is built
# for the axial slices. cor_new was otherwise never defined before being saved.
# NOTE(review): assumes `cor` carries a 'PatientID' column like `ax` does - confirm.
cor_new = cor.merge(series_df_all_cor, how='inner', on='PatientID')
cor_new['full_fpath'] = cor_new['fpath'] + '/' + cor_new['SOPInstanceUID'] + '.dcm'

# Saving cor_new to csv (index column not written), so all steps till here need not be repeated
cor_new.to_csv(r'cor_new.csv', index=False)

## Now loading axial dataframe that contains one slice for each series for each PatientID and preparing the train, test and validation sets

In [None]:
# Load the axial slice table from ax_new.csv
df_ax = pd.read_csv(r'ax_new.csv')

In [None]:
# Split the axial table by label: 1 -> positive cases, 0 -> negative cases
ax_pos = df_ax.loc[df_ax['label'] == 1]
ax_neg = df_ax.loc[df_ax['label'] == 0]

In [None]:
# Check that no PatientID is duplicated within the positive or within the negative cases.
# Each result is asserted immediately so neither is lost when `boolean` is reused.
boolean = ax_pos['PatientID'].is_unique
assert boolean, "duplicate PatientID found among positive axial cases"
boolean = ax_neg['PatientID'].is_unique
assert boolean, "duplicate PatientID found among negative axial cases"
# no duplicates found

#### **Number of patients for positive and negative cases in train, validation and test sets**
| Case | Train | Validation | Test | Total |
| :-- | --: | --: | --: | --: |
| Axial - Positive | 51 | 34 | 85 | 170 |
| Axial - Negative | 93 | 61 | 154 | 308 |
| Axial - Total | 144 | 95 | 239 | 478 |


In [None]:
# Positional train / validation / test split, using the row counts from the table above
ax_pos_train, ax_pos_val, ax_pos_test = ax_pos.iloc[:51], ax_pos.iloc[51:85], ax_pos.iloc[85:]

ax_neg_train, ax_neg_val, ax_neg_test = ax_neg.iloc[:93], ax_neg.iloc[93:154], ax_neg.iloc[154:]

In [None]:
# Creating data directories for storing png files
ax_pos_train_dir = 'data_ax/ax_train/1/'
ax_pos_val_dir = 'data_ax/ax_val/1/'
ax_pos_test_dir = 'data_ax/ax_test/1/'

ax_neg_train_dir = 'data_ax/ax_train/0/'
ax_neg_val_dir = 'data_ax/ax_val/0/'
ax_neg_test_dir = 'data_ax/ax_test/0/'

# Actually create the folders (no error if they already exist) so that writing the
# PNG files does not fail on a fresh checkout.
for png_dir in (ax_pos_train_dir, ax_pos_val_dir, ax_pos_test_dir,
                ax_neg_train_dir, ax_neg_val_dir, ax_neg_test_dir):
    os.makedirs(png_dir, exist_ok=True)

In [None]:
IMG_PX_SIZE = 224 # set image size to 224 x 224 - resnet standards


def _rescale_to_uint8(img):
    """Min-max scale `img` to the full 0..255 range and return it as uint8.

    A constant image (max == min) maps to all zeros instead of dividing by zero.
    """
    img = np.asarray(img, dtype=np.float64)
    lo, hi = img.min(), img.max()
    if hi == lo:
        return np.zeros(img.shape, dtype=np.uint8)
    # Divide by the value range (not by max alone) so the brightest pixel reaches 255.
    return ((img - lo) / (hi - lo) * 255.0).astype(np.uint8)


def get_png(df, folder):
    """Convert the DICOM file of every row in `df` to a PNG written into `folder`.

    Parameters
    ----------
    df : DataFrame whose rows provide `full_fpath` (path of the DICOM file),
        `PatientID` and `SOPInstanceUID`.
    folder : output directory; created if it does not exist. Each image is saved as
        `<PatientID>_<SOPInstanceUID>.png`.

    Each slice is histogram-scaled, resized to IMG_PX_SIZE x IMG_PX_SIZE, copied into
    3 identical channels and rescaled to uint8 before being written with imageio.
    """
    if folder:
        os.makedirs(folder, exist_ok=True)
    for index, row in df.iterrows():
        ds = pydicom.dcmread(row.full_fpath)
        # NOTE(review): scaled_px / freqhist_bins / hist_scaled are not core pydicom
        # attributes; presumably patched in via vs165_modules (fastai medical imaging) - confirm.
        brks = ds.scaled_px.freqhist_bins(n_bins=256)
        ds_scaled = ds.hist_scaled(brks=brks, min_px=50)
        img = cv2.resize(np.array(ds_scaled), (IMG_PX_SIZE, IMG_PX_SIZE))
        img = np.repeat(img[..., np.newaxis], 3, -1) # copying across other 2 dim to make 3d image
        rescaled = _rescale_to_uint8(img)
        png_name = str(row.PatientID) + '_' + row.SOPInstanceUID + '.png'
        imageio.imwrite(os.path.join(folder, png_name), rescaled)

In [None]:
# Convert every axial split to PNG, writing each split into its own class folder
# (train, then validation, then test; positive before negative within each).
for split_df, split_dir in [
    (ax_pos_train, ax_pos_train_dir),
    (ax_neg_train, ax_neg_train_dir),
    (ax_pos_val, ax_pos_val_dir),
    (ax_neg_val, ax_neg_val_dir),
    (ax_pos_test, ax_pos_test_dir),
    (ax_neg_test, ax_neg_test_dir),
]:
    get_png(split_df, split_dir)

In [None]:
def create_label(folder, label_filename, save_location, label=None):
    """Write a CSV with columns 'img' and 'label' listing the files found under `folder`.

    Parameters
    ----------
    folder : data directory to traverse (recursively, via os.walk).
    label_filename : name of the CSV file, without the '.csv' extension.
    save_location : path prefix the CSV is written to; the file is
        `save_location + label_filename + '.csv'` and its parent directory is
        created when missing.
    label : class label stored for every file. When omitted it is taken from the last
        component of `folder` (e.g. '.../0/' -> 0, '.../1/' -> 1), so folders of
        negative cases are no longer labelled 1.

    Raises
    ------
    ValueError : if `label` is omitted and the last component of `folder` is not an integer.
    """
    if label is None:
        class_name = os.path.basename(os.path.normpath(folder))
        try:
            label = int(class_name)
        except ValueError:
            raise ValueError(
                "cannot infer label from folder %r; pass label= explicitly" % (folder,)
            ) from None

    # Build all records first and create the frame once (DataFrame.append was removed in pandas 2.0).
    records = []
    for root, _subdirs, files in os.walk(folder):
        # Names starting with '~' are skipped; sorting makes the row order deterministic.
        records.extend(
            {'img': name, 'label': label}
            for name in sorted(files)
            if not name.startswith('~')
        )

    out_path = save_location + label_filename + '.csv'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame(records, columns=['img', 'label']).to_csv(out_path, index=False)

In [None]:
# Write one label CSV per axial split into axial_data_dir
axial_data_dir =  'data_ax_cor/data_ax/'
for png_dir, csv_name in [
    (ax_pos_train_dir, 'ax_pos_train_label'),
    (ax_pos_val_dir, 'ax_pos_val_label'),
    (ax_pos_test_dir, 'ax_pos_test_label'),
    (ax_neg_train_dir, 'ax_neg_train_label'),
    (ax_neg_val_dir, 'ax_neg_val_label'),
    (ax_neg_test_dir, 'ax_neg_test_label'),
]:
    create_label(png_dir, csv_name, axial_data_dir)

## Now repeating the same steps as above - histogram normalization and saving to PNG - for the coronal slices

In [None]:
# Load csv dataframe for coronal images, from the same path the notebook writes cor_new.csv to
df_cor = pd.read_csv(r'cor_new.csv')

In [None]:
# Split the coronal table by label: 1 -> positive cases, 0 -> negative cases
cor_pos = df_cor.loc[df_cor['label'] == 1]
cor_neg = df_cor.loc[df_cor['label'] == 0]

### Keeping Train (30%), Validation (20%), Test (50%) for both positive and negative cases

#### **Number of slices - 2 for each patient for positive and negative cases in train, validation and test sets**
| Case | Train | Validation | Test | Total |
| :-- | --: | --: | --: | --: |
| Coronal - Positive | 100 | 64 | 162 | 326 |
| Coronal - Negative | 166 | 110 | 274 | 550 |
| Coronal - Total | 266 | 174 | 436 | 876 |

In [None]:
# Positional train / validation / test split, using the row counts from the table above
cor_pos_train, cor_pos_val, cor_pos_test = cor_pos.iloc[:100], cor_pos.iloc[100:164], cor_pos.iloc[164:]

cor_neg_train, cor_neg_val, cor_neg_test = cor_neg.iloc[:166], cor_neg.iloc[166:276], cor_neg.iloc[276:]

In [None]:
# Outer-merge two splits on PatientID; the indicator column 'True' tells, per row, whether
# the PatientID occurs in the left split only, the right split only, or in both.
df = pd.merge(cor_neg_test, cor_neg_val, on='PatientID', how='outer', indicator='True')
df['True'].value_counts()

# Similarly check for all combinations

# All good.....

In [None]:
# Creating data directories for storing png files
cor_pos_train_dir = 'cor_train/1/'
cor_pos_val_dir = 'cor_val/1/'
cor_pos_test_dir = 'cor_test/1/'

cor_neg_train_dir = 'cor_train/0/'
cor_neg_val_dir = 'cor_val/0/'
cor_neg_test_dir = 'cor_test/0/'

# Actually create the folders (no error if they already exist) so that writing the
# PNG files does not fail on a fresh checkout.
for png_dir in (cor_pos_train_dir, cor_pos_val_dir, cor_pos_test_dir,
                cor_neg_train_dir, cor_neg_val_dir, cor_neg_test_dir):
    os.makedirs(png_dir, exist_ok=True)

In [None]:
# Convert every coronal split to PNG, writing each split into its own class folder
# (train, then validation, then test; positive before negative within each).
for split_df, split_dir in [
    (cor_pos_train, cor_pos_train_dir),
    (cor_neg_train, cor_neg_train_dir),
    (cor_pos_val, cor_pos_val_dir),
    (cor_neg_val, cor_neg_val_dir),
    (cor_pos_test, cor_pos_test_dir),
    (cor_neg_test, cor_neg_test_dir),
]:
    get_png(split_df, split_dir)

In [None]:
# Write one label CSV per coronal split into coronal_data_dir
coronal_data_dir =  'data_cor/'
for png_dir, csv_name in [
    (cor_pos_train_dir, 'cor_pos_train_label'),
    (cor_pos_val_dir, 'cor_pos_val_label'),
    (cor_pos_test_dir, 'cor_pos_test_label'),
    (cor_neg_train_dir, 'cor_neg_train_label'),
    (cor_neg_val_dir, 'cor_neg_val_label'),
    (cor_neg_test_dir, 'cor_neg_test_label'),
]:
    create_label(png_dir, csv_name, coronal_data_dir)

### Checking Patients that have both axial and coronal slices

In [None]:
# Keep the first row per PatientID in the axial and coronal tables
df_ax_unique = df_ax.drop_duplicates(subset='PatientID')
df_cor_unique = df_cor.drop_duplicates(subset='PatientID')

# Then split each de-duplicated table by label (1 -> positive, 0 -> negative)
df_cor_unique_pos = df_cor_unique.loc[df_cor_unique['label'] == 1]
df_cor_unique_neg = df_cor_unique.loc[df_cor_unique['label'] == 0]
df_ax_unique_pos = df_ax_unique.loc[df_ax_unique['label'] == 1]
df_ax_unique_neg = df_ax_unique.loc[df_ax_unique['label'] == 0]

In [None]:
# Count patients present in both / only axial / only coronal across the entire datasets,
# using the indicator column 'True' of an outer merge on PatientID
df = pd.merge(df_ax_unique, df_cor_unique, on='PatientID', how='outer', indicator='True')
df['True'].value_counts()
# All good.....

In [None]:
# Same count restricted to positive cases: both / only axial / only coronal,
# using the indicator column 'True' of an outer merge on PatientID
df_pos = pd.merge(df_ax_unique_pos, df_cor_unique_pos, on='PatientID', how='outer', indicator='True')
df_pos['True'].value_counts()
# All good.....

In [None]:
# Same count restricted to negative cases: both / only axial / only coronal,
# using the indicator column 'True' of an outer merge on PatientID
df_neg = pd.merge(df_ax_unique_neg, df_cor_unique_neg, on='PatientID', how='outer', indicator='True')
df_neg['True'].value_counts()
# All good.....

#### **Number of patients having slices for both axes - axial and coronal in positive and negative cases**
| Case | Both Axial & Coronal | Only Axial | Only Coronal |
| :-- | --: | --: | --: |
| Positive | 155 | 15 | 9 |
| Negative | 273 | 35 | 3 |
| Total | 428 | 50 | 12 |