# start

In [2]:
# Notebook-wide settings.
PRFX = 'MelEda0624A1'
DEBUG = False

SEED = 101
DEVICE = 'cpu'
# True only when DEVICE is 'cuda'.
PIN_MEM = DEVICE == 'cuda'
# NOTE(review): FP16 is enabled while DEVICE is 'cpu' — confirm this combination is intended.
FP16 = True

In [3]:
# NOTE(review): p_cmp is assigned in a later cell of this notebook, so on a
# fresh kernel this cell runs before the variable exists — confirm cell order.
ls $p_cmp

MelEda0624A1.ipynb


# setup

In [5]:
from pathlib import Path
import pickle
import pandas as pd
import random
import os
import numpy as np
import datetime
from collections import defaultdict, Counter

def dtnow():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' string."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S')

import torch
device = torch.device(DEVICE)  # torch device object built from the DEVICE setting
import torch.nn as nn
import torch.nn.functional as F

from efficientnet_pytorch import EfficientNet

from apex import amp

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: integer seed applied to every generator.

    Side effects:
        Writes ``PYTHONHASHSEED`` into ``os.environ`` and puts cuDNN into
        deterministic, non-benchmarking mode.
    """
    random.seed(seed)
    # The running interpreter fixed its hash seed at startup; this value is
    # only picked up by processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; the call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can vary
    # between runs, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: scalar or array-like supporting unary minus and NumPy ufuncs.

    Returns:
        Values in [0, 1], matching the input's shape for array inputs.
    """
    # log(1 + exp(-x)) is computed as logaddexp(0, -x), which never
    # exponentiates a large positive number, so very negative inputs yield
    # 0.0 instead of tripping an overflow in exp(-x).
    return np.exp(-np.logaddexp(0, -x))


In [6]:
# Directory that holds the competition CSV files read by this notebook.
p_cmp = '../input/siim-isic-melanoma-classification/'
# p_out=f'../output/{PRFX}'; Path(p_out).mkdir(exist_ok=True,parents=True)

# eda

In [6]:
# Read the three competition tables, then show their (rows, columns) shapes.
train, test, sample_submission = (
    pd.read_csv(f'{p_cmp}/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)

[frame.shape for frame in (train, test, sample_submission)]

[(33126, 8), (10982, 5), (10982, 2)]

In [22]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [23]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


## target

In [57]:
# Mean of target, number of rows, and sum of target.
train['target'].mean(), len(train), train['target'].sum()

(0.01762965646320111, 33126, 584)

In [33]:
# Diagnosis frequencies within each target class.
train.groupby('target')['diagnosis'].value_counts()

target  diagnosis                         
0       unknown                               27124
        nevus                                  5193
        seborrheic keratosis                    135
        lentigo NOS                              44
        lichenoid keratosis                      37
        solar lentigo                             7
        atypical melanocytic proliferation        1
        cafe-au-lait macule                       1
1       melanoma                                584
Name: diagnosis, dtype: int64

In [59]:
# Target values among rows diagnosed as melanoma.
train.loc[train['diagnosis'] == 'melanoma', 'target'].value_counts()

1    584
Name: target, dtype: int64

In [60]:
# benign_malignant labels among rows diagnosed as melanoma.
train.loc[train['diagnosis'] == 'melanoma', 'benign_malignant'].value_counts()

malignant    584
Name: benign_malignant, dtype: int64

In [61]:
# Target values among rows labelled malignant.
train.loc[train['benign_malignant'] == 'malignant', 'target'].value_counts()

1    584
Name: target, dtype: int64

In [62]:
# benign_malignant frequencies within each target class.
train.groupby('target')['benign_malignant'].value_counts()

target  benign_malignant
0       benign              32542
1       malignant             584
Name: benign_malignant, dtype: int64

## features

In [66]:
# Sex distribution in the training table.
train['sex'].value_counts()

male      17080
female    15981
Name: sex, dtype: int64

In [65]:
# Sex distribution in the test table.
test['sex'].value_counts()

male      6255
female    4727
Name: sex, dtype: int64

In [67]:
# Approximate-age distribution in the training table.
train['age_approx'].value_counts()

45.0    4466
50.0    4270
55.0    3824
40.0    3576
60.0    3240
35.0    2850
65.0    2527
30.0    2358
70.0    1968
25.0    1544
75.0     981
20.0     655
80.0     419
85.0     149
15.0     132
90.0      80
10.0      17
0.0        2
Name: age_approx, dtype: int64

In [68]:
# Approximate-age distribution in the test table.
test['age_approx'].value_counts()

45.0    1587
40.0    1452
55.0    1265
50.0    1230
35.0    1032
60.0    1032
70.0     815
65.0     756
30.0     702
25.0     298
80.0     207
75.0     201
20.0     187
85.0     156
15.0      41
90.0      14
10.0       7
Name: age_approx, dtype: int64

In [63]:
# Anatomical-site distribution in the training table.
train['anatom_site_general_challenge'].value_counts()

torso              16845
lower extremity     8417
upper extremity     4983
head/neck           1855
palms/soles          375
oral/genital         124
Name: anatom_site_general_challenge, dtype: int64

In [64]:
# Anatomical-site distribution in the test table.
test['anatom_site_general_challenge'].value_counts()

torso              5847
lower extremity    2501
upper extremity    1573
head/neck           576
palms/soles         108
oral/genital         26
Name: anatom_site_general_challenge, dtype: int64

## patient

In [42]:
# Distinct patients, number of rows, and mean rows per patient (training table).
train['patient_id'].nunique(), len(train), train['patient_id'].value_counts().mean()

(2056, 33126, 16.111867704280154)

In [43]:
# Distinct patients, number of rows, and mean rows per patient (test table).
test['patient_id'].nunique(), len(test), test['patient_id'].value_counts().mean()

(690, 10982, 15.915942028985507)

In [45]:
# How many training patients have each rows-per-patient count, ordered by that count.
train['patient_id'].value_counts().value_counts().sort_index()

2        1
3      296
4      188
5      120
6       85
      ... 
101      1
102      5
104      1
114      1
115      4
Name: patient_id, Length: 84, dtype: int64

In [46]:
# How many test patients have each rows-per-patient count, ordered by that count.
test['patient_id'].value_counts().value_counts().sort_index()

3      57
4      64
5      50
6      43
7      38
       ..
80      1
90      1
93      1
108     1
240     1
Name: patient_id, Length: 63, dtype: int64

In [55]:
# Frequency of each per-patient mean of target, ordered by the mean value.
train.groupby('patient_id')['target'].mean().value_counts().sort_index()

0.000000    1628
0.008696       1
0.009804       1
0.009901       1
0.011765       1
            ... 
0.625000       1
0.666667       9
0.750000       2
0.800000       1
1.000000       1
Name: target, Length: 86, dtype: int64

In [51]:
# Frequency of each per-patient mean of target, most common value first.
# The column is `target`: the previous `diagnois` spelling names no column
# and would raise AttributeError.
train.groupby('patient_id')['target'].mean().value_counts()

0.000000    1628
0.333333      54
0.250000      35
0.200000      26
0.142857      20
            ... 
0.012048       1
0.023256       1
0.600000       1
0.064516       1
0.105263       1
Name: target, Length: 86, dtype: int64

### one patient

In [89]:
# Draw one patient id from the patient_id column (one entry per row) and list
# that patient's rows ordered by image name.
# NOTE(review): no seed is set in the cells shown before this draw, so the
# selected patient may differ between runs — confirm whether that is intended.
pid = np.random.choice(train['patient_id'])
train.loc[train['patient_id'] == pid].sort_values('image_name')

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
779,ISIC_0338723,IP_0639877,male,55.0,torso,unknown,benign,0
1185,ISIC_0458106,IP_0639877,male,50.0,oral/genital,unknown,benign,0
1242,ISIC_0475746,IP_0639877,male,35.0,torso,unknown,benign,0
1851,ISIC_0655316,IP_0639877,male,35.0,torso,unknown,benign,0
2848,ISIC_0951791,IP_0639877,male,45.0,lower extremity,unknown,benign,0
3596,ISIC_1185497,IP_0639877,male,50.0,upper extremity,unknown,benign,0
5048,ISIC_1617139,IP_0639877,male,50.0,torso,unknown,benign,0
5732,ISIC_1818920,IP_0639877,male,50.0,upper extremity,unknown,benign,0
6198,ISIC_1953293,IP_0639877,male,45.0,torso,unknown,benign,0
6412,ISIC_2012509,IP_0639877,male,45.0,lower extremity,unknown,benign,0


## Missing values 
Reference notebook (localhost link — it only opens on a machine running that Jupyter server): http://localhost:8888/notebooks/Google%20Drive/work/K/mel/don-t-turn-into-a-smoothie-after-the-shake-up.ipynb

In [69]:
# Fraction of missing entries per training column; show only columns with any, largest first.
missing_vals_train = train.isnull().sum().div(train.shape[0])
missing_vals_train.loc[missing_vals_train.gt(0)].sort_values(ascending=False)

anatom_site_general_challenge    0.015909
age_approx                       0.002053
sex                              0.001962
dtype: float64

In [70]:
# Fraction of missing entries per test column; show only columns with any, largest first.
missing_vals_test = test.isnull().sum().div(test.shape[0])
missing_vals_test.loc[missing_vals_test.gt(0)].sort_values(ascending=False)

anatom_site_general_challenge    0.031961
dtype: float64

# images