In [17]:
!nvidia-smi

Sat Jul 20 23:13:06 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.56       Driver Version: 418.56       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   48C    P0    42W / 300W |  14157MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

# params

In [2]:
# Global random seed. It is the default argument of set_torch_seed(), which is
# called right before the validation split and before the balancing sample.
SEED = 111

# setup

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
from collections import Counter

import random 
import numpy as np
import torch
import os



def set_torch_seed(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator and PyTorch's CPU generator.  When CUDA is available the
    CUDA generators are seeded too and cuDNN is switched to deterministic,
    non-benchmarking mode.

    Args:
        seed: Seed value; defaults to the notebook-level ``SEED``.

    Returns:
        None.
    """
    # NOTE(review): hash randomization is fixed at interpreter start-up, so
    # this presumably only influences processes launched afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators, seeded in the same order as before.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    # GPU-side generators plus the cuDNN determinism switches.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# prep

In [18]:
# Shared accumulator of (image path, grade, source tag) tuples.
img2grd = []

# APTOS 2019 blindness-detection dataset location.
p = '../input/aptos2019-blindness-detection'
pp = Path(p)

train = pd.read_csv(pp / 'train.csv')
len_blnd = len(train)
test = pd.read_csv(pp / 'test.csv')
len_blnd_test = len(test)

# Column 0 of train.csv supplies the image file stem, column 1 the label.
img2grd_blnd = [
    (f'{p}/train_images/{row[0]}.png', row[1], 'blnd')
    for row in train.values
]

(len_blnd, len_blnd_test)

(3662, 1928)

In [19]:
# Append the APTOS tuples to the shared list and show the label distribution.
# NOTE: re-running this cell without re-running the one that resets img2grd
# appends the same tuples a second time.
img2grd += img2grd_blnd
display(len(img2grd))

cnt = Counter(entry[1] for entry in img2grd)
ranked = cnt.most_common()
display(ranked)

# Same ranking, expressed as a fraction of all images.
sm = sum(cnt.values())
display([(grade, n / sm) for grade, n in ranked])

3662

[(0, 1805), (2, 999), (1, 370), (4, 295), (3, 193)]

[(0, 0.4929000546149645),
 (2, 0.272801747678864),
 (1, 0.1010376843255052),
 (4, 0.08055707263790278),
 (3, 0.052703440742763515)]

In [20]:
# diabetic-retinopathy-detection dataset: add its train and test images.
# Image paths are derived from `p` so the directory is defined in one place.
p = '../input/diabetic-retinopathy-detection'
pp = Path(p)

# Training labels: column 0 -> image file stem, column 1 -> label.
train = pd.read_csv(pp/'trainLabels.csv')
img2grd_diab_train = [(f'{p}/train_images/{o[0]}.jpeg', o[1], 'diab') for o in train.values]
img2grd += img2grd_diab_train
display(len(img2grd))
display(Counter(o[1] for o in img2grd).most_common())

# Test labels (retinopathy_solution.csv); only the first two columns are used.
test = pd.read_csv(pp/'retinopathy_solution.csv')
img2grd_diab_test = [(f'{p}/test_images/{o[0]}.jpeg', o[1], 'diab') for o in test.values]
img2grd += img2grd_diab_test
display(len(img2grd))
display(Counter(o[1] for o in img2grd).most_common())


38788

[(0, 27615), (2, 6291), (1, 2813), (3, 1066), (4, 1003)]

92364

[(0, 67148), (2, 14152), (1, 6575), (3, 2280), (4, 2209)]

In [21]:
# IDRiD "B. Disease Grading" data: add its training and testing images.
# Image paths are derived from `p` so the directory is defined in one place.
p = '../input/IDRID/B. Disease Grading'
pp = Path(p)

# Training labels: column 0 -> image file stem, column 1 -> label.
train = pd.read_csv(pp/'2. Groundtruths/a. IDRiD_Disease Grading_Training Labels.csv')
img2grd_idrid_train = [(f'{p}/1. Original Images/a. Training Set/{o[0]}.jpg', o[1], 'idrid') for o in train.values]
img2grd += img2grd_idrid_train
display(len(img2grd))
display(Counter(o[1] for o in img2grd).most_common())

# Testing labels, same column layout.
test = pd.read_csv(pp/'2. Groundtruths/b. IDRiD_Disease Grading_Testing Labels.csv')
img2grd_idrid_test = [(f'{p}/1. Original Images/b. Testing Set/{o[0]}.jpg', o[1], 'idrid') for o in test.values]
img2grd += img2grd_idrid_test
display(len(img2grd))
display(Counter(o[1] for o in img2grd).most_common())

92777

[(0, 67282), (2, 14288), (1, 6595), (3, 2354), (4, 2258)]

92880

[(0, 67316), (2, 14320), (1, 6600), (3, 2373), (4, 2271)]

In [22]:
# Verify that every referenced image exists on disk.  Besides the original
# warning, report how many paths are missing and show a few of them so the
# problem can actually be located.
missing = [o[0] for o in img2grd if not Path(o[0]).exists()]
if missing:
    print('Some files are missing!!!')
    print(f'{len(missing)} of {len(img2grd)} paths not found, e.g. {missing[:5]}')

In [23]:
# One row per image: file path, label and source-dataset tag.  Passing the
# column names to the constructor also works when img2grd is empty, whereas
# renaming afterwards fails with a length mismatch in that case.
df = pd.DataFrame(img2grd, columns=['fnm', 'target', 'src'])

df.shape

(92880, 3)

In [24]:
# Number of rows contributed by each source dataset.
df.src.value_counts()

diab     88702
blnd      3662
idrid      516
Name: src, dtype: int64

In [25]:
# Hold out 10% of the APTOS ('blnd') training images as the validation set.
set_torch_seed()

# Select index *labels* (not positions) so the result stays correct even if
# df does not carry a default RangeIndex; the src tag identifies APTOS rows.
idx_blnd_train = df.index[df.src == 'blnd'].values
idx_val = np.random.choice(idx_blnd_train, int(len_blnd*0.10), replace=False)

# Single assignment: True for the drawn rows, False everywhere else.
df['is_val'] = df.index.isin(idx_val)

In [26]:
# Per-source row counts again, now that the is_val column has been added.
df.src.value_counts()

diab     88702
blnd      3662
idrid      516
Name: src, dtype: int64

In [27]:
# Label distribution over all sources combined.
df.target.value_counts()

0    67316
2    14320
1     6600
3     2373
4     2271
Name: target, dtype: int64

In [29]:
# Label distribution restricted to the APTOS ('blnd') rows.
df[df.src=='blnd'].target.value_counts()

0    1805
2     999
1     370
4     295
3     193
Name: target, dtype: int64

In [31]:
# Label -> row count, first for the APTOS ('blnd') rows only, then for all rows.
t2cnt_blnd = dict(df.loc[df['src'] == 'blnd', 'target'].value_counts())
t2cnt = dict(df['target'].value_counts())

In [32]:
# Per-label row counts of the APTOS ('blnd') subset.
t2cnt_blnd

{0: 1805, 2: 999, 1: 370, 4: 295, 3: 193}

In [38]:
# For each label, how many non-APTOS rows must be added so that the label
# reaches the size of the rarest label in the combined data.
min_cnt = min(t2cnt.values(), default=0)  # computed once, not once per key
t2cnt_gap = {k: min_cnt - v for k, v in t2cnt_blnd.items()}

# A negative gap cannot be filled by sampling; fail here with a clear message
# rather than later inside DataFrame.sample().
assert all(v >= 0 for v in t2cnt_gap.values()), f'negative gap in {t2cnt_gap}'

t2cnt_gap

{0: 466, 2: 1272, 1: 1901, 4: 1976, 3: 2078}

In [46]:
# Build the frame used downstream: all APTOS ('blnd') rows plus, per label,
# t2cnt_gap[label] rows sampled from the other sources.
set_torch_seed()  # sample() draws from NumPy's global generator

# Collect the pieces and concatenate once instead of re-copying the growing
# frame on every iteration; sampling order and row order are unchanged.
parts = [df[df.src == 'blnd']]
for t, v in t2cnt_gap.items():
    parts.append(df[(df.target == t) & (df.src != 'blnd')].sample(v))
df2use = pd.concat(parts)

In [47]:
# Peek at the first rows of df2use.
df2use.head()

Unnamed: 0,fnm,target,src,is_val
0,../input/aptos2019-blindness-detection/train_i...,2,blnd,False
1,../input/aptos2019-blindness-detection/train_i...,4,blnd,False
2,../input/aptos2019-blindness-detection/train_i...,1,blnd,False
3,../input/aptos2019-blindness-detection/train_i...,0,blnd,False
4,../input/aptos2019-blindness-detection/train_i...,0,blnd,False


In [48]:
# Label counts in df2use; equal counts confirm the balancing worked.
df2use.target.value_counts()

4    2271
3    2271
2    2271
1    2271
0    2271
Name: target, dtype: int64

In [49]:
# How many rows of df2use come from each source dataset.
df2use.src.value_counts()

diab     7516
blnd     3662
idrid     177
Name: src, dtype: int64