In [1]:
import numpy as np
import pandas as pd
# Show up to 500 columns when a DataFrame is rendered, so wide one-hot frames are not truncated.
pd.options.display.max_columns = 500
from sklearn import preprocessing

# Read the training table (which carries a `kfold` column) and the test table
# from the shared input directory, then report their shapes.
train_csv, test_csv = "../input/train_folds.csv", "../input/test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head(n=5)

train shape is (33126, 9), test shape is (10982, 5)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4


In [2]:
### SEX
# One-hot encode `sex` into sex_0 .. sex_{k-1}, where k is the number of
# categories found in the training set (missing values form their own category).
SEX_FILL = "unkown"  # sentinel for missing values; spelling kept so stored values do not change

# Assign the filled column back rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form mutates an
# intermediate Series, which pandas 2.x warns about and which leaves the
# frame untouched once Copy-on-Write is enabled.
df_train["sex"] = df_train["sex"].fillna(SEX_FILL)
df_test["sex"] = df_test["sex"].fillna(SEX_FILL)

# Categories are learned from train only. handle_unknown="ignore" makes a value
# that occurs only in test encode as an all-zero row instead of raising.
sex_enc = preprocessing.OneHotEncoder(handle_unknown="ignore")
sex_enc.fit(df_train["sex"].to_numpy().reshape(-1, 1))

# Column i corresponds to sex_enc.categories_[0][i] (categories are sorted).
sex_cols = [f"sex_{i}" for i in range(len(sex_enc.categories_[0]))]

for frame in (df_train, df_test):
    onehot = sex_enc.transform(frame["sex"].to_numpy().reshape(-1, 1)).toarray().astype(int)
    # Column-by-column assignment creates or overwrites each indicator,
    # so no zero-initialisation pass is needed and re-running the cell is safe.
    for idx, col in enumerate(sex_cols):
        frame[col] = onehot[:, idx]

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

train shape is (33126, 12), test shape is (10982, 8)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold,sex_0,sex_1,sex_2
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2,0,1,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,1,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0,1,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4,1,0,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4,1,0,0


In [3]:
### AGE
# Bucket `age_approx` into equal-width bins, then one-hot encode the bin index
# into age_0 .. age_{n_bins-1}.
# AGE_FILL replaces missing ages. It is filled in before the discretiser is
# fitted, so it also becomes the upper bin edge, and missing ages fall into
# the last bin (together with any real age above the second-to-last edge).
AGE_FILL = 101

# Assign the filled column back rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form mutates an
# intermediate Series, which pandas 2.x warns about and which leaves the
# frame untouched once Copy-on-Write is enabled.
df_train["age_approx"] = df_train["age_approx"].fillna(AGE_FILL)
df_test["age_approx"] = df_test["age_approx"].fillna(AGE_FILL)

est = preprocessing.KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
est.fit(df_train["age_approx"].to_numpy().reshape(-1, 1))
print("Bin edge:", est.bin_edges_[0])

# The discretiser returns an (n, 1) float array; flatten it to a 1-D int column.
for frame in (df_train, df_test):
    frame["age_bin"] = est.transform(frame["age_approx"].to_numpy().reshape(-1, 1)).astype(int).ravel()

age_cols = [f"age_{i}" for i in range(est.n_bins)]

# Pin the encoder's categories to every possible bin index instead of letting
# it infer them from the training data. Otherwise an empty training bin would
# yield fewer one-hot columns than `age_cols`, and a bin seen only in test
# would make `transform` raise.
age_enc = preprocessing.OneHotEncoder(categories=[np.arange(est.n_bins)])
age_enc.fit(df_train["age_bin"].to_numpy().reshape(-1, 1))

for frame in (df_train, df_test):
    onehot = age_enc.transform(frame["age_bin"].to_numpy().reshape(-1, 1)).toarray().astype(int)
    # Column-by-column assignment creates or overwrites each indicator,
    # so no zero-initialisation pass is needed and re-running the cell is safe.
    for idx, col in enumerate(age_cols):
        frame[col] = onehot[:, idx]

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

Bin edge: [  0.   10.1  20.2  30.3  40.4  50.5  60.6  70.7  80.8  90.9 101. ]
train shape is (33126, 23), test shape is (10982, 19)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold,sex_0,sex_1,sex_2,age_bin,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2,0,1,0,4,0,0,0,0,1,0,0,0,0,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4,1,0,0,4,0,0,0,0,1,0,0,0,0,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4,1,0,0,5,0,0,0,0,0,1,0,0,0,0


In [4]:
# anatom_site_general_challenge
# One-hot encode the anatomical site into anatom_0 .. anatom_{k-1}, where k is
# the number of categories found in the training set (missing values form
# their own category).
ANATOM_FILL = "unkown"  # sentinel for missing values; spelling kept so stored values do not change

# Assign the filled column back rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form mutates an
# intermediate Series, which pandas 2.x warns about and which leaves the
# frame untouched once Copy-on-Write is enabled.
df_train["anatom_site_general_challenge"] = df_train["anatom_site_general_challenge"].fillna(ANATOM_FILL)
df_test["anatom_site_general_challenge"] = df_test["anatom_site_general_challenge"].fillna(ANATOM_FILL)

# Categories are learned from train only. handle_unknown="ignore" makes a site
# that occurs only in test encode as an all-zero row instead of raising.
anatom_enc = preprocessing.OneHotEncoder(handle_unknown="ignore")
anatom_enc.fit(df_train["anatom_site_general_challenge"].to_numpy().reshape(-1, 1))

# Column i corresponds to anatom_enc.categories_[0][i] (categories are sorted).
anatom_cols = [f"anatom_{i}" for i in range(len(anatom_enc.categories_[0]))]

for frame in (df_train, df_test):
    onehot = (
        anatom_enc.transform(frame["anatom_site_general_challenge"].to_numpy().reshape(-1, 1))
        .toarray()
        .astype(int)
    )
    # Column-by-column assignment creates or overwrites each indicator,
    # so no zero-initialisation pass is needed and re-running the cell is safe.
    for idx, col in enumerate(anatom_cols):
        frame[col] = onehot[:, idx]

print(f"train shape is {df_train.shape}, test shape is {df_test.shape}")
df_train.head()

train shape is (33126, 30), test shape is (10982, 26)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold,sex_0,sex_1,sex_2,age_bin,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,anatom_0,anatom_1,anatom_2,anatom_3,anatom_4,anatom_5,anatom_6
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2,0,1,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0,1,0,0,4,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4,1,0,0,4,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4,1,0,0,5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [5]:
# Feature set: the image id followed by the age, sex and anatomical-site
# indicator columns, in that order.
use_cols = ["image_name", *age_cols, *sex_cols, *anatom_cols]

# Train additionally keeps the label and the fold assignment.
train_fe_cols = [*use_cols, "target", "kfold"]
df_test_fe = df_test[use_cols]
df_train_fe = df_train[train_fe_cols]

In [6]:
# Sanity check: report the shapes of the engineered frames and preview the train one.
print(f"train fe shape is {df_train_fe.shape}, test fe shape is {df_test_fe.shape}")
df_train_fe.head(n=5)

train fe shape is (33126, 23), test fe shape is (10982, 21)


Unnamed: 0,image_name,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,sex_0,sex_1,sex_2,anatom_0,anatom_1,anatom_2,anatom_3,anatom_4,anatom_5,anatom_6,target,kfold
0,ISIC_2637011,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,2
1,ISIC_0015719,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
2,ISIC_0052212,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
3,ISIC_0068279,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,4
4,ISIC_0074268,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,4


In [7]:
# Export of the engineered feature tables is currently disabled. Uncomment the
# two lines below to write them (without the index) into the input directory.
# df_train_fe.to_csv("../input/train_folds_fe.csv", index=False)
# df_test_fe.to_csv("../input/test_fe.csv", index=False)