In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
from sklearn import preprocessing

# Read the training metadata (which carries a kfold column) and the test
# metadata from disk, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train_folds.csv", "../input/test.csv")
)

# Report both frame sizes, then preview the first training rows.
print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head(5)

train shape is (33126, 9), test shape is (10982, 5)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4


In [2]:
# Remove the patient id and the two diagnosis-label columns from the training
# frame; the test frame is left as loaded.
df_train = df_train.drop(columns=["patient_id", "diagnosis", "benign_malignant"])

# Re-check the sizes after the drop and preview the result.
print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head(5)

train shape is (33126, 6), test shape is (10982, 5)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold
0,ISIC_2637011,male,45.0,head/neck,0,2
1,ISIC_0015719,female,45.0,upper extremity,0,0
2,ISIC_0052212,female,50.0,lower extremity,0,0
3,ISIC_0068279,female,45.0,head/neck,0,4
4,ISIC_0074268,female,55.0,upper extremity,0,4


In [3]:
# Load the external training metadata.
df_train_external = pd.read_csv("../input/external_train.csv")

# Derive the kfold label from the tfrecord number: even values are tagged
# 2018, everything else 2019.
df_train_external["kfold"] = df_train_external["tfrecord"].map(
    lambda record: 2019 if record % 2 != 0 else 2018
)

# Keep only the columns shared with the main training frame.
df_train_external = df_train_external.drop(
    columns=[
        "patient_id",
        "diagnosis",
        "tfrecord",
        "benign_malignant",
        "width",
        "height",
    ]
)

print(f"external train shape is {df_train_external.shape}")
df_train_external.head(5)

external train shape is (25331, 6)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold
0,ISIC_0000000,female,55.0,anterior torso,0,2018
1,ISIC_0000001,female,30.0,anterior torso,0,2018
2,ISIC_0000002,female,60.0,upper extremity,1,2018
3,ISIC_0000003,male,30.0,upper extremity,0,2018
4,ISIC_0000004,male,80.0,posterior torso,1,2018


In [4]:
# Mapping from the external data's anatomical site labels to the coarser label
# set: the three torso sub-sites collapse into "torso" ...
anatom_map = {
    site: "torso"
    for site in ("anterior torso", "posterior torso", "lateral torso")
}
# ... while the remaining known sites map to themselves.
anatom_map.update({
    site: site
    for site in (
        'head/neck',
        'upper extremity',
        'lower extremity',
        'palms/soles',
        'oral/genital',
    )
})

# Series.map turns any label that is not a key of anatom_map into NaN.
df_train_external["anatom_site_general_challenge"] = (
    df_train_external["anatom_site_general_challenge"].map(anatom_map)
)

print(f"external train shape is {df_train_external.shape}")
df_train_external.head(5)

external train shape is (25331, 6)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold
0,ISIC_0000000,female,55.0,torso,0,2018
1,ISIC_0000001,female,30.0,torso,0,2018
2,ISIC_0000002,female,60.0,upper extremity,1,2018
3,ISIC_0000003,male,30.0,upper extremity,0,2018
4,ISIC_0000004,male,80.0,torso,1,2018


In [5]:
# Load the additional malignant-sample metadata.
df_train_malig = pd.read_csv("../input/external_malig/train_malig_2.csv")

# Strip the columns that the other training frames do not carry.
df_train_malig = df_train_malig.drop(
    columns=[
        "patient_id",
        "diagnosis",
        "tfrecord",
        "benign_malignant",
        "width",
        "height",
        "source",
        "ext",
    ]
)

# Apply the same site mapping as for the external training data and tag every
# row of this set with the constant kfold value 100.
df_train_malig["anatom_site_general_challenge"] = (
    df_train_malig["anatom_site_general_challenge"].map(anatom_map)
)
df_train_malig["kfold"] = 100

print(df_train_malig.shape)
df_train_malig.head(5)

(580, 6)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold
0,ISIC_0000070,male,25.0,torso,1,100
1,ISIC_0000076,male,55.0,torso,1,100
2,ISIC_0000144,,,,1,100
3,ISIC_0000158,female,30.0,upper extremity,1,100
4,ISIC_0000284,male,80.0,head/neck,1,100


In [6]:
# Stack the three training sources under one fresh 0..n-1 index.
df_train = pd.concat(
    [df_train, df_train_external, df_train_malig],
    axis=0,
    ignore_index=True,
)

# Keep every row whose kfold is not -1 and renumber the remaining rows.
df_train = df_train.loc[df_train.kfold != -1].reset_index(drop=True)

print(df_train.shape)
df_train.head(5)

(58612, 6)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold
0,ISIC_2637011,male,45.0,head/neck,0,2
1,ISIC_0015719,female,45.0,upper extremity,0,0
2,ISIC_0052212,female,50.0,lower extremity,0,0
3,ISIC_0068279,female,45.0,head/neck,0,4
4,ISIC_0074268,female,55.0,upper extremity,0,4


In [7]:
### SEX
# Replace missing sex values with an explicit "unknown" category.  The filled
# column is assigned back instead of calling fillna(inplace=True) on the
# df["sex"] selection: that form is chained assignment through an inplace
# method, which newer pandas releases warn about and which leaves the frame
# unchanged under copy-on-write.
df_train["sex"] = df_train["sex"].fillna("unknown")
df_test["sex"] = df_test["sex"].fillna("unknown")

# The categories are learned from the training frame only; the same fitted
# encoder is then applied to both frames.
sex_enc = preprocessing.OneHotEncoder()
sex_enc.fit(df_train["sex"].values.reshape([-1, 1]))

# One indicator column per learned category, named by category position.
sex_cols = [f"sex_{i}" for i in range(len(sex_enc.categories_[0]))]
# Create the indicator columns first so the block assignments below target
# columns that already exist.
for col in sex_cols:
    df_train[col] = 0
    df_test[col] = 0

df_train[sex_cols] = sex_enc.transform(df_train["sex"].values.reshape([-1, 1])).toarray().astype(int)
df_test[sex_cols] = sex_enc.transform(df_test["sex"].values.reshape([-1, 1])).toarray().astype(int)

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

train shape is (58612, 9), test shape is (10982, 8)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold,sex_0,sex_1,sex_2
0,ISIC_2637011,male,45.0,head/neck,0,2,0,1,0
1,ISIC_0015719,female,45.0,upper extremity,0,0,1,0,0
2,ISIC_0052212,female,50.0,lower extremity,0,0,1,0,0
3,ISIC_0068279,female,45.0,head/neck,0,4,1,0,0
4,ISIC_0074268,female,55.0,upper extremity,0,4,1,0,0


In [8]:
### AGE
AGE_FILL = 101
# Missing ages get the sentinel AGE_FILL.  The filled column is assigned back
# instead of calling fillna(inplace=True) on the df["age_approx"] selection:
# that form is chained assignment through an inplace method, which newer
# pandas releases warn about and which leaves the frame unchanged under
# copy-on-write.
df_train["age_approx"] = df_train["age_approx"].fillna(AGE_FILL)
df_test["age_approx"] = df_test["age_approx"].fillna(AGE_FILL)

# Equal-width bins are fit on the training ages only (sentinel included) and
# the same fitted bins are reused for the test ages.
est = preprocessing.KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
est.fit(df_train["age_approx"].values.reshape([-1, 1]))
print("Bin edge:", est.bin_edges_[0])

df_train["age_bin"] = est.transform(df_train["age_approx"].values.reshape([-1, 1])).astype(int)
df_test["age_bin"] = est.transform(df_test["age_approx"].values.reshape([-1, 1])).astype(int)

# Number of bins the fitted discretizer produces for the single age feature.
n_age_bins = int(est.n_bins_[0])
age_cols = [f"age_{i}" for i in range(n_age_bins)]
# Create the indicator columns first so the block assignments below target
# columns that already exist.
for col in age_cols:
    df_train[col] = 0
    df_test[col] = 0

# Pin the one-hot categories to every bin index so that column age_i always
# stands for bin i.  Letting the encoder infer the categories from the training
# bins would produce fewer columns than age_cols whenever a bin is empty in
# train, and transform would raise for a test row that falls into such a bin.
age_enc = preprocessing.OneHotEncoder(categories=[np.arange(n_age_bins)])
age_enc.fit(df_train["age_bin"].values.reshape([-1, 1]))

df_train[age_cols] = age_enc.transform(df_train["age_bin"].values.reshape([-1, 1])).toarray().astype(int)
df_test[age_cols] = age_enc.transform(df_test["age_bin"].values.reshape([-1, 1])).toarray().astype(int)

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

Bin edge: [  0.   10.1  20.2  30.3  40.4  50.5  60.6  70.7  80.8  90.9 101. ]
train shape is (58612, 20), test shape is (10982, 19)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold,sex_0,sex_1,sex_2,age_bin,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9
0,ISIC_2637011,male,45.0,head/neck,0,2,0,1,0,4,0,0,0,0,1,0,0,0,0,0
1,ISIC_0015719,female,45.0,upper extremity,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0
2,ISIC_0052212,female,50.0,lower extremity,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0
3,ISIC_0068279,female,45.0,head/neck,0,4,1,0,0,4,0,0,0,0,1,0,0,0,0,0
4,ISIC_0074268,female,55.0,upper extremity,0,4,1,0,0,5,0,0,0,0,0,1,0,0,0,0


In [9]:
# anatom_site_general_challenge
# Replace missing site values with an explicit "unknown" category.  The filled
# column is assigned back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment through an inplace method,
# which newer pandas releases warn about and which leaves the frame unchanged
# under copy-on-write.
df_train["anatom_site_general_challenge"] = df_train["anatom_site_general_challenge"].fillna("unknown")
df_test["anatom_site_general_challenge"] = df_test["anatom_site_general_challenge"].fillna("unknown")

# The categories are learned from the training frame only; the same fitted
# encoder is then applied to both frames.
anatom_enc = preprocessing.OneHotEncoder()
anatom_enc.fit(df_train["anatom_site_general_challenge"].values.reshape([-1, 1]))

# One indicator column per learned category, named by category position.
anatom_cols = [f"anatom_{i}" for i in range(len(anatom_enc.categories_[0]))]
# Create the indicator columns first so the block assignments below target
# columns that already exist.
for col in anatom_cols:
    df_train[col] = 0
    df_test[col] = 0

df_train[anatom_cols] = anatom_enc.transform(df_train["anatom_site_general_challenge"].values.reshape([-1, 1])).toarray().astype(int)
df_test[anatom_cols] = anatom_enc.transform(df_test["anatom_site_general_challenge"].values.reshape([-1, 1])).toarray().astype(int)

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

train shape is (58612, 27), test shape is (10982, 26)


Unnamed: 0,image_name,sex,age_approx,anatom_site_general_challenge,target,kfold,sex_0,sex_1,sex_2,age_bin,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,anatom_0,anatom_1,anatom_2,anatom_3,anatom_4,anatom_5,anatom_6
0,ISIC_2637011,male,45.0,head/neck,0,2,0,1,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,ISIC_0015719,female,45.0,upper extremity,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,ISIC_0052212,female,50.0,lower extremity,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,ISIC_0068279,female,45.0,head/neck,0,4,1,0,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,ISIC_0074268,female,55.0,upper extremity,0,4,1,0,0,5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [10]:
# Feature columns shared by both output frames: the image name followed by the
# age, sex and anatomical-site indicator columns, in that order.
use_cols = ["image_name", *age_cols, *sex_cols, *anatom_cols]

# The training output additionally carries the label and the fold id.
df_train_fe = df_train.loc[:, [*use_cols, "target", "kfold"]]
df_test_fe = df_test.loc[:, use_cols]

In [11]:
# Report the sizes of the two feature frames, then preview the first five
# training rows.
print(f"train fe shape is {df_train_fe.shape}, test fe shape is {df_test_fe.shape}")
df_train_fe.head(5)

train fe shape is (58612, 23), test fe shape is (10982, 21)


Unnamed: 0,image_name,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,sex_0,sex_1,sex_2,anatom_0,anatom_1,anatom_2,anatom_3,anatom_4,anatom_5,anatom_6,target,kfold
0,ISIC_2637011,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,2
1,ISIC_0015719,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
2,ISIC_0052212,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
3,ISIC_0068279,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,4
4,ISIC_0074268,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,4


In [12]:
# Export of the two feature frames to CSV is currently disabled: both calls
# below are commented out, so running this cell writes nothing.
# df_train_fe.to_csv("../input/train_folds_fe_plus_external.csv", index=False)
# df_test_fe.to_csv("../input/test_fe.csv", index=False)