In [1]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torchvision
import torch.nn as nn
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../scripts/')
from model import get_res50
from loss import bce
import glob, os
from metrics import precision, recall
from dataset import MayoData, train_loader, val_loader, test_loader, plot_images
from PIL import Image
from tqdm import tqdm
import random

In [2]:
# Display the installed PyTorch version.
torch.__version__

'2.0.0+cu117'

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Load the training metadata table.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index; consider index_col=0
# once it is confirmed that nothing downstream relies on that column.
train_csv = pd.read_csv(filepath_or_buffer='../../mayo_data/train.csv')

In [5]:
# Preview the first two rows of the freshly loaded metadata.
train_csv.head(2)

Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE


In [6]:
# Count rows per label to see how balanced the classes are.
train_csv['label'].value_counts()

CE     547
LAA    207
Name: label, dtype: int64

In [7]:
# Derive each row's image file path from its image_id.
train_csv['updated_paths'] = [
    '../../mayo_data/train_resized/2048/' + image_id + '.jpg'
    for image_id in train_csv['image_id']
]

In [8]:
# Preview again to check the new 'updated_paths' column.
train_csv.head(2)

Unnamed: 0,image_id,center_id,patient_id,image_num,label,updated_paths
0,006388_0,11,006388,0,CE,../../mayo_data/train_resized/2048/006388_0.jpg
1,008e5c_0,11,008e5c,0,CE,../../mayo_data/train_resized/2048/008e5c_0.jpg


In [9]:
def convert_targets(data):
    """Encode a label as a binary target: 1 for 'CE', 0 for anything else."""
    return 1 if data == 'CE' else 0

In [10]:
# Binary-encode the label column: 'CE' becomes 1, every other label becomes 0.
train_csv['target'] = train_csv['label'].map(convert_targets)

In [11]:
# Preview again to check the new 'target' column.
train_csv.head(2)

Unnamed: 0,image_id,center_id,patient_id,image_num,label,updated_paths,target
0,006388_0,11,006388,0,CE,../../mayo_data/train_resized/2048/006388_0.jpg,1
1,008e5c_0,11,008e5c,0,CE,../../mayo_data/train_resized/2048/008e5c_0.jpg,1


In [12]:
def split_datasets(data, test_size = 0.1, random_state = 42, stratify_col = None):
    """Split ``data`` into train, validation and test subsets.

    The split runs in two passes that share ``test_size``: the validation
    subset is carved out of ``data`` first, then the test subset is carved
    out of what remains. The test subset is therefore ``test_size`` of the
    remainder, not of the original data.

    NOTE(review): rows are assigned to subsets independently of each other.
    If several rows belong together (for example, share a patient), they can
    end up in different subsets - confirm whether that is acceptable.

    Args:
        data: Table to split (e.g. a pandas DataFrame).
        test_size: Value forwarded as ``test_size`` to both
            ``train_test_split`` calls.
        random_state: Seed forwarded to both calls. The default of 42
            reproduces the previously hard-coded behaviour.
        stratify_col: Optional column name. When given, both passes are
            stratified on that column so each subset keeps roughly the same
            class proportions. ``None`` (default) keeps the original
            unstratified split.

    Returns:
        Tuple ``(train, val, test)``.
    """
    strata = None if stratify_col is None else data[stratify_col]
    remainder, val = train_test_split(
        data, test_size=test_size, random_state=random_state, stratify=strata
    )

    # Recompute the strata on the reduced set so they line up with its rows.
    strata = None if stratify_col is None else remainder[stratify_col]
    train, test = train_test_split(
        remainder, test_size=test_size, random_state=random_state, stratify=strata
    )
    return train, val, test

In [13]:
# Split the metadata into train / validation / test subsets using the defaults.
train, val, test = split_datasets(data=train_csv)

### Summary Stats

In [14]:
def summary_metrics_dataset(dataset: pd.DataFrame):
    """Print the size and class balance of ``dataset``.

    Prints the number of rows with a non-null ``target``, then, for every
    distinct target value (most frequent first), its count and its share of
    that total as a whole-number percentage.

    Args:
        dataset: DataFrame with a ``target`` column.
    """
    counts = dataset['target'].value_counts()
    total = int(counts.sum())  # computed once rather than on every iteration
    print(total)
    for label, count in counts.items():
        print(f'For {label} : {count}')
        # Format the share as a real percentage (e.g. 73%). Previously the
        # raw fraction was printed with a '%' suffix, which read as 0.73%.
        print(f'For {label}: {count / total:.0%}')

In [15]:
# Size and class balance of the training subset.
summary_metrics_dataset(train)

610
For 1 : 447
For 1: 0.73%
For 0 : 163
For 0: 0.27%


In [16]:
# Size and class balance of the validation subset.
summary_metrics_dataset(val)

76
For 1 : 48
For 1: 0.63%
For 0 : 28
For 0: 0.37%


In [17]:
# Size and class balance of the test subset.
summary_metrics_dataset(test)

68
For 1 : 52
For 1: 0.76%
For 0 : 16
For 0: 0.24%


### Gather datasets

In [18]:
# Build one loader per subset with the helpers imported from dataset.py.
# NOTE(review): 256 is passed as the second positional argument to every
# helper; presumably a batch size or an image size - confirm against
# dataset.py and consider naming it as a constant.
train_l, val_l, test_l = (
    train_loader(train, 256),
    val_loader(val, 256),
    test_loader(test, 256),
)