In [1]:
!pip install -q efficientnet
import efficientnet.tfkeras as efn

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow import keras
from kaggle_datasets import KaggleDatasets
from keras.preprocessing import image
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import math
import tensorflow.keras.backend as K
import re
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [3]:
# Preferred accelerator; the strategy-setup cell switches this to "GPU" when no TPU is found.
DEVICE = "TPU"
# Directory holding the competition's train.csv / test.csv metadata files.
PATH = "../input/siim-isic-melanoma-classification"

In [4]:
# Choose a tf.distribute strategy: use the TPU when one can be resolved AND
# initialized, otherwise fall back to the default (CPU / single GPU) strategy.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # A failed initialization must also leave the TPU path; otherwise
            # `strategy` is never defined and the REPLICAS line below crashes.
            print("failed to initialize TPU")
            print(repr(err))
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


# Shared by the input pipeline: autotuned parallelism and the replica count
# used to scale the global batch size and learning rate.
AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

connecting to TPU...
Running on TPU  grpc://10.0.0.2:8470
initializing  TPU ...
TPU initialized
REPLICAS: 8


In [5]:
# Load both metadata tables from PATH, then preview the training rows.
train_csv, test_csv = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)
train_csv.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [6]:
# Row counts of both splits and the number of distinct training patients.
for label, count in (
    ("train size: ", len(train_csv)),
    ("test size: ", len(test_csv)),
    ("number of patients: ", len(train_csv.patient_id.unique())),
):
    print(label, count)

train size:  33126
test size:  10982
number of patients:  2056


In [7]:
# Distinct values of the categorical / label columns in the training table.
label_to_series = {
    "benign_malignant: ": train_csv.benign_malignant,
    "diagnosis: ": train_csv.diagnosis,
    "anatom_site_general_challenge: ": train_csv.anatom_site_general_challenge,
    "target: ": train_csv.target,
}
for label, series in label_to_series.items():
    print(label, series.unique())

benign_malignant:  ['benign' 'malignant']
diagnosis:  ['unknown' 'nevus' 'melanoma' 'seborrheic keratosis' 'lentigo NOS'
 'lichenoid keratosis' 'solar lentigo' 'cafe-au-lait macule'
 'atypical melanocytic proliferation']
anatom_site_general_challenge:  ['head/neck' 'upper extremity' 'lower extremity' 'torso' nan 'palms/soles'
 'oral/genital']
target:  [0 1]


In [8]:
# Compare the row count labelled "benign" with the row count where target == 0.
is_benign = train_csv.benign_malignant == "benign"
is_negative = train_csv.target == 0
print(len(train_csv[is_benign]))
print(len(train_csv[is_negative]))

32542
32542


In [9]:
# Preview the first rows of the test metadata table.
test_csv.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge
0,ISIC_0052060,IP_3579794,male,70.0,
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


In [10]:
# Number of missing values in each training column.
columns = train_csv.columns
train_null_counts = train_csv.isnull().sum()
for column_name in columns:
    print(column_name, train_null_counts[column_name])

image_name 0
patient_id 0
sex 65
age_approx 68
anatom_site_general_challenge 527
diagnosis 0
benign_malignant 0
target 0


In [11]:
# Number of missing values in each test column.
columns = test_csv.columns
test_null_counts = test_csv.isnull().sum()
for column_name in columns:
    print(column_name, test_null_counts[column_name])

image_name 0
patient_id 0
sex 0
age_approx 0
anatom_site_general_challenge 351


In [12]:
# Impute missing training ages with the mean age over train + test,
# truncated to a whole number.
train_ages = train_csv.age_approx.dropna().to_numpy()
test_ages = test_csv.age_approx.to_numpy()
age = np.concatenate((train_ages, test_ages), axis=None)
age_avg = float(int(np.average(age)))
train_csv.age_approx = train_csv.age_approx.fillna(age_avg)

In [13]:
# Sex distribution in each split.
for split_label, frame in (("train:\n", train_csv), ("test:\n", test_csv)):
    print(split_label, frame.sex.value_counts())

train:
 male      17080
female    15981
Name: sex, dtype: int64
test:
 male      6255
female    4727
Name: sex, dtype: int64


In [14]:
# Fill the remaining gaps: most frequent sex for the training rows and an
# explicit "unknown" anatomical site for both splits.
most_common_sex = train_csv.sex.mode()[0]
train_csv.sex = train_csv.sex.fillna(most_common_sex)
for frame in (train_csv, test_csv):
    frame.anatom_site_general_challenge = frame.anatom_site_general_challenge.fillna("unknown")
train_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33126 entries, 0 to 33125
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     33126 non-null  object 
 1   patient_id                     33126 non-null  object 
 2   sex                            33126 non-null  object 
 3   age_approx                     33126 non-null  float64
 4   anatom_site_general_challenge  33126 non-null  object 
 5   diagnosis                      33126 non-null  object 
 6   benign_malignant               33126 non-null  object 
 7   target                         33126 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 2.0+ MB


In [15]:
# One-hot encode the categorical metadata and drop the columns that are not
# used as model features.
categorical_columns = ["sex", "anatom_site_general_challenge"]
train_csv = pd.get_dummies(train_csv, columns=categorical_columns)
train_csv = train_csv.drop(["image_name", "patient_id", "diagnosis", "benign_malignant"], axis=1)
test = pd.get_dummies(test_csv, columns=categorical_columns)
test = test.drop(["image_name", "patient_id"], axis=1)
# get_dummies only emits columns for categories present in the frame it is
# given, so force the test features into the training feature layout
# (same columns, same order; categories absent from test become all-zero).
test = test.reindex(columns=train_csv.columns.drop("target"), fill_value=0)
train_csv.head()

Unnamed: 0,age_approx,target,sex_female,sex_male,anatom_site_general_challenge_head/neck,anatom_site_general_challenge_lower extremity,anatom_site_general_challenge_oral/genital,anatom_site_general_challenge_palms/soles,anatom_site_general_challenge_torso,anatom_site_general_challenge_unknown,anatom_site_general_challenge_upper extremity
0,45.0,0,0,1,1,0,0,0,0,0,0
1,45.0,0,1,0,0,0,0,0,0,0,1
2,50.0,0,1,0,0,1,0,0,0,0,0
3,45.0,0,1,0,1,0,0,0,0,0,0
4,55.0,0,1,0,0,0,0,0,0,0,1


In [16]:
# Separate the label vector from the feature matrix.
y_train = train_csv["target"]
X_train = train_csv.drop(columns="target")

In [17]:
# Negative-to-positive class ratio, rounded to two decimals.
pos = int((train_csv.target == 1).sum())
neg = int((train_csv.target == 0).sum())
weight = round(neg / pos, 2)
weight

55.72

In [18]:
# Gradient-boosted classifier on the tabular metadata; `weight` is passed as
# scale_pos_weight.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=30,
    subsample=0.7,
    colsample_bytree=0.8,
    scale_pos_weight=weight,
    n_jobs=-1,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=30, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=55.72, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
# Positive-class probability from the metadata model, saved as its own
# submission file (rows follow test_csv order).
predictions = xgb.predict_proba(test)[:, 1]
meta_df = pd.DataFrame({
    'image_name': test_csv['image_name'],
    'target': predictions,
})
meta_df.to_csv('meta_sub.csv', header=True, index=False)

In [20]:
# Resolve the GCS location of each TFRecord dataset, then collect the
# training shards from all four and the test shards from the first.
GCS_PATH1, GCS_PATH2, GCS_PATH3, GCS_PATH4 = (
    KaggleDatasets().get_gcs_path(dataset_name)
    for dataset_name in (
        'melanoma-512x512',
        'isic2019-512x512',
        'malignant-512x512',
        'malignant-v2-512x512',
    )
)
TRAINING_FILENAMES = []
for gcs_path in (GCS_PATH1, GCS_PATH2, GCS_PATH3, GCS_PATH4):
    TRAINING_FILENAMES += tf.io.gfile.glob(gcs_path + '/train*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH1 + '/test*.tfrec')

In [21]:
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Build a 3x3 transform matrix combining rotation, shear, zoom and shift.

    `rotation` and `shear` are angles in degrees (converted to radians below).
    The zoom arguments are used as divisors on the diagonal and the shift
    arguments fill the last column of the shift matrix.
    `transform` calls this with shape-[1] float32 tensors for every argument.

    Returns the product (rotation . shear) . (zoom . shift).
    """
        
    # Degrees -> radians.
    rotation = math.pi * rotation / 180.
    shear    = math.pi * shear    / 180.

    def get_3x3_mat(lst):
        # Pack nine entries (row-major order) into a [3, 3] tensor.
        return tf.reshape(tf.concat([lst],axis=0), [3,3])
    
    c1   = tf.math.cos(rotation)
    s1   = tf.math.sin(rotation)
    one  = tf.constant([1],dtype='float32')
    zero = tf.constant([0],dtype='float32')
    
    rotation_matrix = get_3x3_mat([c1,   s1,   zero, 
                                   -s1,  c1,   zero, 
                                   zero, zero, one])    
    c2 = tf.math.cos(shear)
    s2 = tf.math.sin(shear)    
    
    shear_matrix = get_3x3_mat([one,  s2,   zero, 
                                zero, c2,   zero, 
                                zero, zero, one])        
    
    # Zoom factors enter as reciprocals.
    zoom_matrix = get_3x3_mat([one/height_zoom, zero,           zero, 
                               zero,            one/width_zoom, zero, 
                               zero,            zero,           one])    
    
    shift_matrix = get_3x3_mat([one,  zero, height_shift, 
                                zero, one,  width_shift, 
                                zero, zero, one])
    
    return K.dot(K.dot(rotation_matrix, shear_matrix), 
                 K.dot(zoom_matrix,     shift_matrix))

In [22]:
def transform(image):    
    """Apply a random rotation / shear / zoom / shift to one image.

    Random parameters are drawn from normal distributions, turned into a 3x3
    matrix by `get_mat`, and used to map every destination pixel coordinate
    back to a source pixel, which is then looked up with `tf.gather_nd`.

    The side length is hard-coded to DIM = 256 and the result is reshaped to
    [DIM, DIM, 3].
    # assumes `image` is a [256, 256, 3] tensor — TODO confirm against callers
    """

    DIM = 256
    XDIM = DIM%2  # 0 for an even DIM
    
    # Random transform parameters (each a shape-[1] float32 tensor).
    rot = 180 * tf.random.normal([1], dtype='float32')
    shr = 2.0 * tf.random.normal([1], dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / 8.0
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / 8.0
    h_shift = 8.0 * tf.random.normal([1], dtype='float32') 
    w_shift = 8.0 * tf.random.normal([1], dtype='float32') 

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image; z is the homogeneous coordinate.
    x   = tf.repeat(tf.range(DIM//2, -DIM//2,-1), DIM)
    y   = tf.tile(tf.range(-DIM//2, DIM//2), [DIM])
    z   = tf.ones([DIM*DIM], dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    # Clip so the indices computed below stay within 0..DIM-1.
    idx2 = K.clip(idx2, -DIM//2+XDIM+1, DIM//2)
    # FIND ORIGIN PIXEL VALUES           
    # Convert centred coordinates back to (row, column) array indices.
    idx3 = tf.stack([DIM//2-idx2[0,], DIM//2-1+idx2[1,]])
    d    = tf.gather_nd(image, tf.transpose(idx3))
        
    return tf.reshape(d,[DIM, DIM,3])

In [23]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled record and return (image bytes, target).

    Every feature listed in the spec must be present in the record, although
    only 'image' and 'target' are returned.
    """
    scalar_string = tf.io.FixedLenFeature([], tf.string)
    scalar_int64 = tf.io.FixedLenFeature([], tf.int64)
    feature_spec = {
        'image': scalar_string,
        'image_name': scalar_string,
        'patient_id': scalar_int64,
        'sex': scalar_int64,
        'age_approx': scalar_int64,
        'anatom_site_general_challenge': scalar_int64,
        'diagnosis': scalar_int64,
        'target': scalar_int64,
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return parsed['image'], parsed['target']

In [24]:
def read_unlabeled_tfrecord(example, return_image_name):
    """Parse one serialized unlabeled record.

    Returns (image bytes, image name) when `return_image_name` is truthy,
    otherwise (image bytes, 0).
    """
    feature_spec = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    if return_image_name:
        second_item = parsed['image_name']
    else:
        second_item = 0
    return parsed['image'], second_item

In [25]:
def prepare_image(img, augment=True):    
    """Decode a JPEG and turn it into a [224, 224, 3] float32 model input.

    The image is resized to 256x256 and scaled to [0, 1]. With `augment` it
    goes through the random geometric `transform`, a random 250x250 crop, a
    random horizontal flip and random colour jitter. Without `augment` it gets
    the matching central 250x250 crop. Both paths end with a resize to 224x224.

    Args:
        img: scalar string tensor holding the encoded JPEG bytes.
        augment: apply the random augmentations when True.

    Returns:
        float32 tensor of shape [224, 224, 3].
    """
    read_size = 256   # must equal the DIM hard-coded inside `transform`
    crop_size = 250
    net_size = 224

    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [read_size, read_size])
    img = tf.cast(img, tf.float32) / 255.0
    
    if augment:
        img = transform(img)
        img = tf.image.random_crop(img, [crop_size, crop_size, 3])
        img = tf.image.random_flip_left_right(img)
        img = tf.image.random_hue(img, 0.01)
        img = tf.image.random_saturation(img, 0.7, 1.3)
        img = tf.image.random_contrast(img, 0.8, 1.2)
        img = tf.image.random_brightness(img, 0.1)

    else:
        # The fraction is crop/read (250/256). The previous 250/250 == 1.0
        # cropped nothing, so non-augmented images were scaled differently
        # from the augmented ones.
        img = tf.image.central_crop(img, crop_size / read_size)
                                   
    img = tf.image.resize(img, [net_size, net_size])
    img = tf.reshape(img, [net_size, net_size, 3])
    return img

In [26]:
def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord file names.

    Each name is expected to contain "-<count>." (e.g. "train00-2071.tfrec");
    the first such match in every name is used.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    per_file_counts = []
    for path in filenames:
        per_file_counts.append(int(count_pattern.search(path).group(1)))
    return np.sum(per_file_counts)

In [27]:
def get_dataset(files, augment = False, shuffle = False, repeat = False, 
                labeled=True, return_image_names=True):
    """Build the batched tf.data pipeline over the given TFRecord files.

    Raw records are cached, optionally repeated and shuffled, parsed as
    labeled or unlabeled examples, decoded by `prepare_image`, batched to
    16 * REPLICAS and prefetched. Each element is (image batch, label batch)
    for labeled data, or (image batch, image-name-or-0 batch) otherwise.
    """
    def parse_unlabeled(example):
        return read_unlabeled_tfrecord(example, return_image_names)

    def decode(img, imgname_or_label):
        return prepare_image(img, augment=augment), imgname_or_label

    records = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if repeat:
        records = records.repeat()

    if shuffle:
        records = records.shuffle(1024*8)
        # Element order does not matter once shuffled, so allow
        # non-deterministic processing.
        relaxed_order = tf.data.Options()
        relaxed_order.experimental_deterministic = False
        records = records.with_options(relaxed_order)

    parser = read_labeled_tfrecord if labeled else parse_unlabeled
    parsed = records.map(parser, num_parallel_calls=AUTO)
    decoded = parsed.map(decode, num_parallel_calls=AUTO)

    return decoded.batch(16 * REPLICAS).prefetch(AUTO)

In [28]:
def lr_callback():
    """Return a LearningRateScheduler with linear warm-up and exponential decay.

    The rate ramps linearly from the start value to the peak (scaled by the
    number of replicas) over the ramp epochs, holds for the sustain epochs
    (none here), then decays geometrically towards the floor.
    """
    start_lr = 0.000005
    peak_lr = 0.000020 * strategy.num_replicas_in_sync
    floor_lr = 0.000001
    ramp_epochs = 5
    sustain_epochs = 0
    decay_rate = 0.8

    def schedule(epoch):
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        if epoch < ramp_epochs + sustain_epochs:
            return peak_lr
        return (peak_lr - floor_lr) * decay_rate**(epoch - ramp_epochs - sustain_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(schedule, verbose=False)

In [29]:
def build_model():
    """Build one Keras model with eight EfficientNet branches (B0..B7).

    All branches read the same 224x224x3 input. Each branch is an
    ImageNet-initialised backbone followed by pooling, two dropout/dense
    stages and a single sigmoid unit. The model has eight outputs, ordered
    B0 to B7. Prints the model summary before returning.
    """
    image_shape = (224, 224, 3)
    inputs = tf.keras.Input(shape=image_shape)
    shared_input = tf.keras.layers.Lambda(lambda x:x)(inputs)

    def add_branch(variant):
        backbone_factory = getattr(efn, f'EfficientNetB{variant}')
        backbone = backbone_factory(include_top=False, weights='imagenet', input_shape=image_shape)
        features = backbone(shared_input)
        features = keras.layers.GlobalAveragePooling2D()(features)
        features = keras.layers.Dropout(0.2)(features)
        features = keras.layers.Dense(512)(features)
        features = keras.layers.Dropout(0.2)(features)
        features = keras.layers.Dense(128)(features)
        return tf.keras.layers.Dense(1, activation='sigmoid')(features)

    outputs = [add_branch(variant) for variant in range(8)]

    model = tf.keras.Model(inputs, outputs)
    model.summary()
    return model

In [30]:
# Training stream: augmented, shuffled and repeated. The single label is
# replicated once per model output (eight EfficientNet branches).
ds_train = get_dataset(TRAINING_FILENAMES, augment=True, shuffle=True, repeat=True)
ds_train = ds_train.map(lambda img, label: (img, tuple([label] * 8)))

# steps_per_epoch is documented as an integer; round the batch count up so an
# epoch still covers every record and no float is passed to Keras.
steps_train = math.ceil(count_data_items(TRAINING_FILENAMES) / (16 * REPLICAS))

with strategy.scope():
    model = build_model()

    # One loss object per output.
    losses = [tf.keras.losses.BinaryCrossentropy(label_smoothing = 0.05) for i in range(8)]

    model.compile(optimizer='adam', loss=losses,
                   metrics=[tf.keras.metrics.AUC(name='auc')])
    
history = model.fit(ds_train, verbose=1, steps_per_epoch=steps_train, 
                    epochs=12, callbacks=[lr_callback()])

Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b0_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b1_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b2_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b3_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b4_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b5_weights_tf_dim_ord

In [31]:
# Test-time-augmentation stream: unlabeled, augmented and repeated so that
# predict() can make several passes; not shuffled, so every pass yields the
# records in the same order.
ds_test = get_dataset(TEST_FILENAMES,labeled=False,return_image_names=False,augment=True,
            repeat=True,shuffle=False)

In [32]:
# Predict 25 augmented passes over the test set. `steps` must be a whole
# number of batches; rounding up guarantees at least cnt_test * 25
# predictions, which the slicing of `probs` afterwards relies on.
cnt_test = count_data_items(TEST_FILENAMES)
steps = math.ceil(cnt_test * 25 / (16 * REPLICAS))
probs = model.predict(ds_test, verbose=1, steps=steps)



In [33]:
# Average the 25 TTA passes: stack the per-model outputs, drop the surplus
# predictions from the final batch, cut the sample axis into 25 equal passes
# and take their mean.
probs = np.stack(probs)[:, :cnt_test * 25]
tta_passes = np.split(probs, 25, axis=1)
probs = np.mean(np.stack(tta_passes, axis=1), axis=1)

In [34]:
# One non-augmented, non-repeated pass over the test records to collect the
# image names in the order the pipeline yields them.
ds = get_dataset(
    TEST_FILENAMES,
    augment=False,
    repeat=False,
    labeled=False,
    return_image_names=True,
)

decoded_names = []
for _image, name_tensor in ds.unbatch():
    decoded_names.append(name_tensor.numpy().decode("utf-8"))
image_names = np.array(decoded_names)

In [35]:
# One submission file per model output.
# NOTE(review): the file suffix is i+2 while output i comes from
# EfficientNetB{i} — confirm the offset is intended.
for i in range(8):
    submission = pd.DataFrame({
        'image_name': image_names,
        'target': probs[i,:,0],
    }).sort_values('image_name')
    submission.to_csv(f'individual_model_{i+2}.csv', index=False)

In [36]:
# Unweighted mean of model outputs 1..7.
blended_target = np.mean([probs[k,:,0] for k in range(1, 8)], axis=0)
submission = pd.DataFrame({
    'image_name': image_names,
    'target': blended_target,
}).sort_values('image_name')
submission.to_csv('blended_models_7.csv', index=False)

In [37]:
# Unweighted mean of model outputs 2..7.
blended_target = np.mean([probs[k,:,0] for k in range(2, 8)], axis=0)
submission = pd.DataFrame({
    'image_name': image_names,
    'target': blended_target,
}).sort_values('image_name')
submission.to_csv('blended_models_6.csv', index=False)

In [38]:
# Unweighted mean of model outputs 3..7.
blended_target = np.mean([probs[k,:,0] for k in range(3, 8)], axis=0)
submission = pd.DataFrame({
    'image_name': image_names,
    'target': blended_target,
}).sort_values('image_name')
submission.to_csv('blended_models_5.csv', index=False)

In [39]:
# Unweighted mean of model outputs 4..7.
blended_target = np.mean([probs[k,:,0] for k in range(4, 8)], axis=0)
submission = pd.DataFrame({
    'image_name': image_names,
    'target': blended_target,
}).sort_values('image_name')
submission.to_csv('blended_models_4.csv', index=False)

In [40]:
# Unweighted mean over all eight model outputs.
mean_target = np.mean(probs[:,:,0], axis=0)
submission = pd.DataFrame({
    'image_name': image_names,
    'target': mean_target,
}).sort_values('image_name')
submission.to_csv('blended_models_mean.csv', index=False)

In [41]:
# Blend the all-model mean with the metadata model at 3%, 6%, 9% and 12%.
#
# `submission` keeps the index labels of the TFRecord read order (sort_values
# does not reset them) while `meta_df` is indexed by test.csv row, so adding
# the two columns directly would pair predictions of different images.
# Look the metadata prediction up by image name instead.
meta_target = submission['image_name'].map(meta_df.set_index('image_name')['target'])

for i in range(4):
    # Separate name so the global `weight` (XGBoost class ratio) is not clobbered.
    meta_weight = 0.03*(i+1)
    sub = submission.copy()
    sub['target'] = submission['target'] * (1-meta_weight) + meta_target * meta_weight

    sub.to_csv(f'blended_with_meta_{i}.csv', index=False)

In [42]:
# Weighted blend of model outputs 3..7 plus 3% of the metadata model.
#
# `probs` and `image_names` follow the TFRecord read order, whereas `meta_df`
# follows test.csv row order; nothing guarantees the two agree, so fetch the
# metadata prediction for each image by name rather than adding positionally.
meta_target = meta_df.set_index('image_name')['target'].reindex(image_names).to_numpy()

submission = pd.DataFrame(dict(
    image_name = image_names,
    target     = 0.12*probs[3,:,0] + 0.16*probs[4,:,0] + 0.2*probs[5,:,0] + 0.22*probs[6,:,0] + 0.27*probs[7,:,0] + 0.03*meta_target))

submission = submission.sort_values('image_name') 
submission.to_csv('models_blended_1.csv', index=False)

In [43]:
# Weighted blend of model outputs 4..7 plus 4% of the metadata model.
#
# `probs` and `image_names` follow the TFRecord read order, whereas `meta_df`
# follows test.csv row order; nothing guarantees the two agree, so fetch the
# metadata prediction for each image by name rather than adding positionally.
meta_target = meta_df.set_index('image_name')['target'].reindex(image_names).to_numpy()

submission = pd.DataFrame(dict(
    image_name = image_names,
    target     = 0.16*probs[4,:,0] + 0.22*probs[5,:,0] + 0.26*probs[6,:,0] + 0.32*probs[7,:,0] + 0.04*meta_target))

submission = submission.sort_values('image_name') 
submission.to_csv('models_blended_2.csv', index=False)

In [44]:
# Weighted blend of model outputs 5..7 plus 5% of the metadata model.
#
# `probs` and `image_names` follow the TFRecord read order, whereas `meta_df`
# follows test.csv row order; nothing guarantees the two agree, so fetch the
# metadata prediction for each image by name rather than adding positionally.
meta_target = meta_df.set_index('image_name')['target'].reindex(image_names).to_numpy()

submission = pd.DataFrame(dict(
    image_name = image_names,
    target     = 0.25*probs[5,:,0] + 0.4*probs[6,:,0] + 0.3*probs[7,:,0] + 0.05*meta_target))

submission = submission.sort_values('image_name') 
submission.to_csv('models_blended_3.csv', index=False)