# Prepare Datasets

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from openslide import OpenSlide
from pathlib import Path
import torch
import torchvision
from torchvision import transforms
from PIL import Image

SEED = 42

In [2]:
def random_sample(x, k=16):
    B, H = x.shape
    x = x.transpose(0, 1)
    indices = torch.randperm(B)[:k]
    return x[:, indices].transpose(0, 1)

In [3]:
outcome_df = pd.read_csv('/home/ngsci/datasets/brca-psj-path/contest-phase-2/csv-train/outcomes.csv')
map_df = pd.read_csv('/home/ngsci/datasets/brca-psj-path/v2/slide-biopsy-map.csv')
# Unique labels ['IA', 'IIB', 'IIA', '0', nan, 'IIIC', 'IV', 'IIIA', 'IIIB', 'IB']
label_mapping = {'IA':1, 'IB':1, 'IIA':2, 'IIB':2, 'IIIA':3, 'IIIB':3, 'IIIC':3, 'IV':4, '0':0}
outcome_df['label'] = outcome_df['stage'].map(label_mapping)
outcome_df = outcome_df[~pd.isna(outcome_df['label'])]

In [4]:
len(outcome_df)

1000

In [5]:
# stratified_df = outcome_df.groupby('label', group_keys=False).apply(lambda x: x.sample(min(len(x), 50)))
train_N = 800
train_df = outcome_df.groupby('label', group_keys=False).apply(lambda x: x.sample(int(np.rint(train_N*len(x)/len(outcome_df))), random_state=SEED)).sample(frac=1, random_state=SEED).reset_index(drop=True)

test_N = 200
remained_df = outcome_df[~outcome_df['biopsy_id'].isin(train_df['biopsy_id'])]
test_df = remained_df.groupby('label', group_keys=False).apply(lambda x: x.sample(int(np.rint(test_N*len(x)/len(remained_df))), random_state=SEED)).sample(frac=1, random_state=SEED).reset_index(drop=True)

train_mapping = map_df[map_df['biopsy_id'].isin(train_df['biopsy_id'])]
test_mapping = map_df[map_df['biopsy_id'].isin(test_df['biopsy_id'])]
print('Train slides: %d\n'%len(train_mapping))
print('Test slides: %d\n'%len(test_mapping))

# path_prefix = '/home/ngsci/datasets/brca-psj-path/contest-phase-2/png-downsampled-train/'
# train_mapping['downsampled_path'] = path_prefix + train_mapping['slide_id'] + '.png'
# test_mapping['downsampled_path'] = path_prefix + test_mapping['slide_id'] + '.png'
path_prefix = '/home/ngsci/datasets/brca-psj-path/contest-phase-2/clam-preprocessing-train/resnet50-features/pt_files/'
train_mapping['downsampled_path'] = path_prefix + train_mapping['slide_id'] + '.pt'
test_mapping['downsampled_path'] = path_prefix + test_mapping['slide_id'] + '.pt'

Train slides: 10580

Test slides: 2545



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mapping['downsampled_path'] = path_prefix + train_mapping['slide_id'] + '.pt'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_mapping['downsampled_path'] = path_prefix + test_mapping['slide_id'] + '.pt'


In [6]:
paths = list(Path(path_prefix).iterdir())
for idx in tqdm(range(0, len(paths))):
    path = paths[idx]
    slide_id = str(path).split('/')[-1]
    feat = torch.load(path)
    x_mean = torch.mean(feat, dim=0).unsqueeze(dim=0)
    x_sampled = random_sample(feat, k=16)
    # x2 = torch.max(feat, dim=0).values
    # x3 = torch.min(feat, dim=0).values
    # x = torch.stack([x1,x2,x3])
    x = torch.vstack([x_mean, x_sampled])
    torch.save(x, f'./datasets/train/{slide_id}')

100%|██████████| 10846/10846 [2:50:09<00:00,  1.06it/s] 


In [7]:
def check_path_exists(path):
    return Path(path).is_file()

train_mask = train_mapping['downsampled_path'].apply(check_path_exists)
train_mapping = train_mapping[train_mask].reset_index()
print('Train slides filtered:', len(train_mapping))

test_mask = test_mapping['downsampled_path'].apply(check_path_exists)
test_mapping = test_mapping[test_mask].reset_index()
print('Test slides filtered:', len(test_mapping))

Train slides filtered: 8728
Test slides filtered: 2118


In [8]:
check_res = train_mapping['downsampled_path'].map(lambda x: Path(x).is_file())
assert(check_res.sum() == len(check_res))

check_res = test_mapping['downsampled_path'].map(lambda x: Path(x).is_file())
assert(check_res.sum() == len(check_res))

In [9]:
train_df.to_csv('./csv_dir/train_outcomes.csv', index=False)
test_df.to_csv('./csv_dir/test_outcomes.csv', index=False)
train_mapping.to_csv('./csv_dir/train_mapping.csv', index=False)
test_mapping.to_csv('./csv_dir/test_mapping.csv', index=False)

In [10]:
holdout_mapping = pd.read_csv('/home/ngsci/datasets/brca-psj-path/contest-phase-2/slide-manifest-holdout.csv')

# path_prefix = '/home/ngsci/datasets/brca-psj-path/contest-phase-2/png-downsampled-holdout/'
path_prefix = '/home/ngsci/datasets/brca-psj-path/contest-phase-2/clam-preprocessing-holdout/resnet50-features/pt_files/'
holdout_mapping['downsampled_path'] = path_prefix + holdout_mapping['slide_id'] + '.pt'

check_res = holdout_mapping['downsampled_path'].map(lambda x: Path(x).is_file())
assert(check_res.sum() == len(check_res))
print("Holdout slides:", len(check_res))

Holdout slides: 14466


In [11]:
paths = list(Path(path_prefix).iterdir())
for idx in tqdm(range(0, len(paths))):
    path = paths[idx]
    slide_id = str(path).split('/')[-1]
    feat = torch.load(path)
    x_mean = torch.mean(feat, dim=0).unsqueeze(dim=0)
    x_sampled = random_sample(feat, k=16)
    # x2 = torch.max(feat, dim=0).values
    # x3 = torch.min(feat, dim=0).values
    # x = torch.stack([x1,x2,x3])
    x = torch.vstack([x_mean, x_sampled])
    torch.save(x, f'./datasets/holdout/{slide_id}')

100%|██████████| 14466/14466 [3:30:02<00:00,  1.15it/s]  


In [12]:
holdout_mapping.to_csv('./csv_dir/holdout_mapping.csv', index=False)