In [79]:
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch
import torch.nn as nn
from torchvision.utils import make_grid
from torchvision.utils import save_image
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import random
from torchvision.io import read_image
import os
%matplotlib inline

In [80]:
PATH_IMG = '../data/train_images_processed_512/'
PATH_META = '../data/train-3.csv'

In [81]:
import pandas as pd

df = pd.read_csv(PATH_META, sep=',', header=None)


In [82]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,54707,54707,54707,54707,54707,54670,54707,54707,54707,26287,54707,29471,54707,54707
unique,3,11914,54707,3,7,64,3,3,3,4,3,5,11,3
top,1,52868,image_id,R,MLO,50,0,0,0,1,0,B,49,False
freq,29519,14,1,27439,27903,2248,53548,51737,53888,15772,53229,12651,23529,47001


In [83]:
df.columns = df.iloc[0,:]
df = df.drop(index = 0)
df[14] = df.image_id

In [84]:
missing_values_count = df.isnull().sum()


missing_values_count[0:15]

0
site_id                        0
patient_id                     0
image_id                       0
laterality                     0
view                           0
age                           37
cancer                         0
biopsy                         0
invasive                       0
BIRADS                     28420
implant                        0
density                    25236
machine_id                     0
difficult_negative_case        0
14                             0
dtype: int64

In [85]:
df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,14
1,2,10006,462822612,L,CC,61,0,0,0,,0,,29,False,462822612
2,2,10006,1459541791,L,MLO,61,0,0,0,,0,,29,False,1459541791
3,2,10006,1864590858,R,MLO,61,0,0,0,,0,,29,False,1864590858
4,2,10006,1874946579,R,CC,61,0,0,0,,0,,29,False,1874946579
5,2,10011,220375232,L,CC,55,0,0,0,0,0,,21,True,220375232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54702,1,9973,1729524723,R,MLO,43,0,0,0,1,0,C,49,False,1729524723
54703,1,9989,63473691,L,MLO,60,0,0,0,,0,C,216,False,63473691
54704,1,9989,1078943060,L,CC,60,0,0,0,,0,C,216,False,1078943060
54705,1,9989,398038886,R,MLO,60,0,0,0,0,0,C,216,True,398038886


In [86]:
# Convert object to int where we don't have missing value
list_to_num = ['site_id', 'patient_id', 'image_id', 'cancer', 'biopsy','invasive','implant','machine_id',14]
for i in list_to_num:
    df[i] = df[i].astype(int)
# Convert object to float where we have missing value
df['age'] = df['age'].astype('float')
df['BIRADS'] = df['BIRADS'].astype('float')
df['difficult_negative_case'] = df['difficult_negative_case'].astype('bool')


In [87]:
df.set_index('image_id')

Unnamed: 0_level_0,site_id,patient_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,14
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
462822612,2,10006,L,CC,61.0,0,0,0,,0,,29,True,462822612
1459541791,2,10006,L,MLO,61.0,0,0,0,,0,,29,True,1459541791
1864590858,2,10006,R,MLO,61.0,0,0,0,,0,,29,True,1864590858
1874946579,2,10006,R,CC,61.0,0,0,0,,0,,29,True,1874946579
220375232,2,10011,L,CC,55.0,0,0,0,0.0,0,,21,True,220375232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729524723,1,9973,R,MLO,43.0,0,0,0,1.0,0,C,49,True,1729524723
63473691,1,9989,L,MLO,60.0,0,0,0,,0,C,216,True,63473691
1078943060,1,9989,L,CC,60.0,0,0,0,,0,C,216,True,1078943060
398038886,1,9989,R,MLO,60.0,0,0,0,0.0,0,C,216,True,398038886


In [88]:
df.dtypes

0
site_id                      int64
patient_id                   int64
image_id                     int64
laterality                  object
view                        object
age                        float64
cancer                       int64
biopsy                       int64
invasive                     int64
BIRADS                     float64
implant                      int64
density                     object
machine_id                   int64
difficult_negative_case       bool
14                           int64
dtype: object

In [89]:
# Fill missing value in age
df['age'] = df['age'].fillna(df['age'].mean())

In [90]:
# Fill missing value in density
temp = pd.DataFrame(df['density'])

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean.fit(temp)
new_temp = imp_mean.transform(temp)
np.squeeze(new_temp, axis=1)
df['density'] = new_temp

In [91]:
# Fill missing value in BIRADS by most frequent value
df['BIRADS'] = df['BIRADS'].fillna(int(df['BIRADS'].mode()))

In [92]:
missing_values_count = df.isnull().sum()


missing_values_count[0:15]

0
site_id                    0
patient_id                 0
image_id                   0
laterality                 0
view                       0
age                        0
cancer                     0
biopsy                     0
invasive                   0
BIRADS                     0
implant                    0
density                    0
machine_id                 0
difficult_negative_case    0
14                         0
dtype: int64

image_id	laterality	view   density

In [93]:
list_col = ['laterality','view','density']
for i in list_col:
    print(df[i].value_counts())

R    27439
L    27267
Name: laterality, dtype: int64
MLO    27903
CC     26765
AT        19
LM        10
ML         8
LMO        1
Name: view, dtype: int64
B    37887
C    12175
A     3105
D     1539
Name: density, dtype: int64


In [94]:
from sklearn.preprocessing import OrdinalEncoder

list_col = ['laterality','view','density']
for i in list_col:
    education_column = df[[i]]
    encoder = OrdinalEncoder()
    df[i] = encoder.fit_transform(education_column)

In [95]:
df.describe()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,14
count,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0
mean,1.460407,32698.865262,1079386000.0,0.501572,3.040526,58.543928,0.021168,0.054272,0.014953,0.890615,0.026999,1.22206,54.618378,1079386000.0
std,0.498434,18893.861534,618326900.0,0.500002,1.999777,10.047484,0.143944,0.226556,0.121365,0.424534,0.162081,0.585266,44.7848,618326900.0
min,1.0,5.0,68491.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,68491.0
25%,1.0,16481.0,545815300.0,0.0,1.0,51.0,0.0,0.0,0.0,1.0,0.0,1.0,29.0,545815300.0
50%,1.0,32432.0,1082689000.0,1.0,5.0,59.0,0.0,0.0,0.0,1.0,0.0,1.0,49.0,1082689000.0
75%,2.0,48999.0,1613228000.0,1.0,5.0,66.0,0.0,0.0,0.0,1.0,0.0,2.0,49.0,1613228000.0
max,2.0,65534.0,2147472000.0,1.0,5.0,89.0,1.0,1.0,1.0,2.0,1.0,3.0,216.0,2147472000.0


In [96]:
list_col = ['laterality','view','density']
for i in list_col:
    print(df[i].value_counts())

1.0    27439
0.0    27267
Name: laterality, dtype: int64
5.0    27903
1.0    26765
0.0       19
2.0       10
4.0        8
3.0        1
Name: view, dtype: int64
1.0    37887
2.0    12175
0.0     3105
3.0     1539
Name: density, dtype: int64
