In [1]:
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch
import torch.nn as nn
from torchvision.utils import make_grid
from torchvision.utils import save_image
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import random
from torchvision.io import read_image
import os
import cv2 as cv
import sys
from torch.utils.data import random_split
%matplotlib inline

In [2]:
PATH_IMG = '../data/train_images_processed_512/'
PATH_META = '../data/train-3.csv'

In [3]:
import pandas as pd

df = pd.read_csv(PATH_META, sep=',', header=None)


In [4]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,54707,54707,54707,54707,54707,54670,54707,54707,54707,26287,54707,29471,54707,54707
unique,3,11914,54707,3,7,64,3,3,3,4,3,5,11,3
top,1,52868,image_id,R,MLO,50,0,0,0,1,0,B,49,False
freq,29519,14,1,27439,27903,2248,53548,51737,53888,15772,53229,12651,23529,47001


In [5]:
df.columns = df.iloc[0,:]
df = df.drop(index = 0)
df = df.reset_index(drop=True)


In [6]:
df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55,0,0,0,0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43,0,0,0,1,0,C,49,False
54702,1,9989,63473691,L,MLO,60,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60,0,0,0,0,0,C,216,True


In [7]:
missing_values_count = df.isnull().sum()


missing_values_count[0:15]

0
site_id                        0
patient_id                     0
image_id                       0
laterality                     0
view                           0
age                           37
cancer                         0
biopsy                         0
invasive                       0
BIRADS                     28420
implant                        0
density                    25236
machine_id                     0
difficult_negative_case        0
dtype: int64

In [8]:
df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55,0,0,0,0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43,0,0,0,1,0,C,49,False
54702,1,9989,63473691,L,MLO,60,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60,0,0,0,0,0,C,216,True


In [9]:
# Convert object to int where we don't have missing value
list_to_num = ['site_id', 'cancer', 'biopsy','invasive','implant','machine_id']
for i in list_to_num:
    df[i] = df[i].astype(int)
# Convert object to float where we have missing value
df['age'] = df['age'].astype('float')
df['BIRADS'] = df['BIRADS'].astype('float')


In [10]:
df.dtypes

0
site_id                      int64
patient_id                  object
image_id                    object
laterality                  object
view                        object
age                        float64
cancer                       int64
biopsy                       int64
invasive                     int64
BIRADS                     float64
implant                      int64
density                     object
machine_id                   int64
difficult_negative_case     object
dtype: object

In [11]:
# Make difficult_negative_case  all upper case 
df['difficult_negative_case'] = df['difficult_negative_case'].astype('str').str.upper()

In [12]:
# Fill missing value in age
df['age'] = df['age'].fillna(df['age'].mean())

In [13]:
# Fill missing value in density
temp = pd.DataFrame(df['density'])

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mean.fit(temp)
new_temp = imp_mean.transform(temp)
np.squeeze(new_temp, axis=1)
df['density'] = new_temp

In [14]:
# Fill missing value in BIRADS by most frequent value
df['BIRADS'] = df['BIRADS'].fillna(int(df['BIRADS'].mode()))

In [15]:
missing_values_count = df.isnull().sum()


missing_values_count[0:15]

0
site_id                    0
patient_id                 0
image_id                   0
laterality                 0
view                       0
age                        0
cancer                     0
biopsy                     0
invasive                   0
BIRADS                     0
implant                    0
density                    0
machine_id                 0
difficult_negative_case    0
dtype: int64

In [16]:
list_col = ['laterality','view','density','difficult_negative_case']
for i in list_col:
    print(df[i].value_counts())

R    27439
L    27267
Name: laterality, dtype: int64
MLO    27903
CC     26765
AT        19
LM        10
ML         8
LMO        1
Name: view, dtype: int64
B    37887
C    12175
A     3105
D     1539
Name: density, dtype: int64
FALSE    47001
TRUE      7705
Name: difficult_negative_case, dtype: int64


In [17]:
from sklearn.preprocessing import OrdinalEncoder

list_col = ['laterality','view','density','difficult_negative_case']
for i in list_col:
    education_column = df[[i]]
    encoder = OrdinalEncoder()
    df[i] = encoder.fit_transform(education_column)

In [18]:
df.describe()

Unnamed: 0,site_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
count,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0,54706.0
mean,1.460407,0.501572,3.040526,58.543928,0.021168,0.054272,0.014953,0.890615,0.026999,1.22206,54.618378,0.140844
std,0.498434,0.500002,1.999777,10.047484,0.143944,0.226556,0.121365,0.424534,0.162081,0.585266,44.7848,0.347864
min,1.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0
25%,1.0,0.0,1.0,51.0,0.0,0.0,0.0,1.0,0.0,1.0,29.0,0.0
50%,1.0,1.0,5.0,59.0,0.0,0.0,0.0,1.0,0.0,1.0,49.0,0.0
75%,2.0,1.0,5.0,66.0,0.0,0.0,0.0,1.0,0.0,2.0,49.0,0.0
max,2.0,1.0,5.0,89.0,1.0,1.0,1.0,2.0,1.0,3.0,216.0,1.0


In [19]:
df

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,0.0,1.0,61.0,0,0,0,1.0,0,1.0,29,0.0
1,2,10006,1459541791,0.0,5.0,61.0,0,0,0,1.0,0,1.0,29,0.0
2,2,10006,1864590858,1.0,5.0,61.0,0,0,0,1.0,0,1.0,29,0.0
3,2,10006,1874946579,1.0,1.0,61.0,0,0,0,1.0,0,1.0,29,0.0
4,2,10011,220375232,0.0,1.0,55.0,0,0,0,0.0,0,1.0,21,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,1.0,5.0,43.0,0,0,0,1.0,0,2.0,49,0.0
54702,1,9989,63473691,0.0,5.0,60.0,0,0,0,1.0,0,2.0,216,0.0
54703,1,9989,1078943060,0.0,1.0,60.0,0,0,0,1.0,0,2.0,216,0.0
54704,1,9989,398038886,1.0,5.0,60.0,0,0,0,0.0,0,2.0,216,1.0


In [20]:
list_col = ['laterality','view','density','difficult_negative_case']
for i in list_col:
    print(df[i].value_counts())

1.0    27439
0.0    27267
Name: laterality, dtype: int64
5.0    27903
1.0    26765
0.0       19
2.0       10
4.0        8
3.0        1
Name: view, dtype: int64
1.0    37887
2.0    12175
0.0     3105
3.0     1539
Name: density, dtype: int64
0.0    47001
1.0     7705
Name: difficult_negative_case, dtype: int64


In [21]:
# Example how cv works
img = cv.imread(cv.samples.findFile(PATH_IMG+"/5/"+"640805896.png"))
if img is None:
    print('not working')
img

array([[[155, 155, 155],
        [153, 153, 153],
        [157, 157, 157],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[150, 150, 150],
        [150, 150, 150],
        [152, 152, 152],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[150, 150, 150],
        [149, 149, 149],
        [152, 152, 152],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       ...,

       [[158, 158, 158],
        [156, 156, 156],
        [150, 150, 150],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[155, 155, 155],
        [156, 156, 156],
        [157, 157, 157],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]],

       [[158, 158, 158],
        [160, 160, 160],
        [157, 157, 157],
        ...,
        [  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0]]

In [22]:
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None, target_transform=None):
        self.img_labels = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, str(self.img_labels.loc[idx,'patient_id'])+'/'+str(self.img_labels.loc[idx,'image_id'])+'.png')
        image = read_image(img_path)
        label = self.img_labels.loc[idx, 'difficult_negative_case']
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return dict(image = torch.tensor(image),label = torch.tensor(label),metadata = torch.tensor(self.img_labels.loc[idx,'laterality':'machine_id']))


In [23]:
torch_dataset = CustomImageDataset(df, PATH_IMG)

In [24]:
torch_dataset[9]

  return dict(image = torch.tensor(image),label = torch.tensor(label),metadata = torch.tensor(self.img_labels.loc[idx,'laterality':'machine_id']))


{'image': tensor([[[164, 169, 165,  ...,   0,   0,   0],
          [165, 162, 164,  ...,   0,   0,   0],
          [165, 164, 162,  ...,   0,   0,   0],
          ...,
          [113, 104, 107,  ...,   0,   0,   0],
          [118, 109, 103,  ...,   0,   0,   0],
          [115, 111, 112,  ...,   0,   0,   0]]], dtype=torch.uint8),
 'label': tensor(0., dtype=torch.float64),
 'metadata': tensor([ 0.,  5., 75.,  0.,  0.,  0.,  1.,  0.,  1., 29.], dtype=torch.float64)}

In [25]:
print("length of the dataset is:", len(torch_dataset))

length of the dataset is: 54706


In [26]:
# Divide dataset 70/30
train_dataset, test_dataset = random_split(torch_dataset, [38294, 16412], generator=torch.Generator().manual_seed(42))
print("The length of train data is:",len(train_dataset))

print("The length of test data is:",len(test_dataset))

The length of train data is: 38294
The length of test data is: 16412


In [27]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [29]:
train_dataset[0]['image'].shape

  return dict(image = torch.tensor(image),label = torch.tensor(label),metadata = torch.tensor(self.img_labels.loc[idx,'laterality':'machine_id']))


torch.Size([1, 512, 512])

In [30]:
train_dict = next(iter(train_dataloader))

print(train_dict['image'].shape)
print(train_dict['label'])
print(train_dict['metadata'])

  return dict(image = torch.tensor(image),label = torch.tensor(label),metadata = torch.tensor(self.img_labels.loc[idx,'laterality':'machine_id']))


torch.Size([64, 1, 512, 512])
tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], dtype=torch.float64)
tensor([[  0.,   1.,  45.,   0.,   0.,   0.,   1.,   0.,   2.,  49.],
        [  1.,   5.,  60.,   0.,   0.,   0.,   1.,   0.,   1.,  29.],
        [  1.,   5.,  46.,   0.,   0.,   0.,   0.,   0.,   1.,  49.],
        [  0.,   5.,  69.,   0.,   0.,   0.,   1.,   0.,   1.,  49.],
        [  1.,   1.,  47.,   0.,   0.,   0.,   1.,   1.,   0.,  49.],
        [  0.,   5.,  59.,   0.,   0.,   0.,   1.,   0.,   1.,  29.],
        [  0.,   1.,  31.,   0.,   0.,   0.,   1.,   0.,   3.,  49.],
        [  1.,   5.,  68.,   0.,   0.,   0.,   1.,   0.,   1.,  49.],
        [  1.,   1.,  53.,   0.,   0.,   0.,   1.,   0.,   1.,  21.],
        [  1.,   1.,  63.,   0

In [None]:
from torchvision.models import resnet50