In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the train and test metadata tables (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# Report (rows, columns) for both tables, then preview the first training rows.
print(df_train.shape, df_test.shape)
df_train.head()

(33126, 8) (10982, 5)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [3]:
# Number of distinct patient_id values in each split.
train_patient_count = df_train["patient_id"].nunique()
test_patient_count = df_test["patient_id"].nunique()

print(f"{train_patient_count} patients in the training")
print(f"{test_patient_count} patients in the testing")

2056 patients in the training
690 patients in the testing


In [4]:
# Create folds.
# Every row starts at the sentinel -1 ("no fold"). Each saved index array then
# stamps its rows with a 0-based fold id: file suffix N maps to kfold N - 1.
N_FOLDS = 5
df_train["kfold"] = -1

for fold_id in range(N_FOLDS):
    val_idx = np.load(f"folds_idx/val_idx_fold_{fold_id + 1}.npy")
    # NOTE(review): .loc is label-based, so the loaded values are matched
    # against df_train's index labels. Presumably they are row positions,
    # which coincide with the labels under the default RangeIndex produced by
    # read_csv — confirm against the code that wrote the .npy files.
    # If two files share an index, the later fold id wins.
    df_train.loc[val_idx, "kfold"] = fold_id

In [5]:
# Sanity check after fold assignment: frame dimensions, the distinct fold ids
# present in the kfold column, and a preview of the first five rows.
print(df_train.shape)
print(df_train["kfold"].unique())
df_train.head(5)

(33126, 9)
[2 0 4 1 3]


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4


In [6]:
# Load the duplicate-pair list and collect the ISIC_id_paired image names of
# the pairs whose partition is "train".
duplicate = pd.read_csv("../input/2020_Challenge_duplicates.csv")
duplicate_image_names = duplicate.loc[
    duplicate["partition"] == "train", "ISIC_id_paired"
].values

# Reset those images back to the -1 sentinel in the kfold column.
df_train.loc[df_train["image_name"].isin(duplicate_image_names), "kfold"] = -1

In [7]:
# Sanity check after flagging duplicates: frame dimensions, the distinct
# kfold values now present, and a preview of the first five rows.
print(df_train.shape)
print(df_train["kfold"].unique())
df_train.head(5)

(33126, 9)
[ 2  0  4  1  3 -1]


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4


In [8]:
# Distinct patient_id count for each of the folds 0-4.
# Rows whose kfold is -1 are not part of any reported fold.
for fold_id in range(5):
    fold_rows = df_train[df_train["kfold"] == fold_id]
    print(f"folder {fold_id}: unique patient # is", fold_rows["patient_id"].nunique())

folder 0: unique patient # is 410
folder 1: unique patient # is 413
folder 2: unique patient # is 414
folder 3: unique patient # is 411
folder 4: unique patient # is 408


In [9]:
# Mean of `target` within each of the folds 0-4, shown as a percentage rounded
# to two decimals.
# NOTE(review): the printed label says "false rate", but the number is
# target.mean() * 100 — i.e. the share of rows with target == 1 if target is
# a 0/1 flag. Confirm the intended wording before relying on the label.
for fold_id in range(5):
    fold_target = df_train.loc[df_train["kfold"] == fold_id, "target"]
    print(f"folder {fold_id}: false rate is", round(fold_target.mean() * 100, 2), "%")

folder 0: false rate is 1.77 %
folder 1: false rate is 1.78 %
folder 2: false rate is 1.77 %
folder 3: false rate is 1.77 %
folder 4: false rate is 1.8 %


In [10]:
# Keep only the rows that carry a real fold id; everything still at the -1
# sentinel is dropped from df_train.
has_fold = df_train["kfold"] != -1
df_train = df_train.loc[has_fold]

In [11]:
# Confirm the frame dimensions after filtering and preview the first five rows.
print(df_train.shape)
df_train.head(5)

(32701, 9)


Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,kfold
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,2
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,4
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,4


In [12]:
# Uncomment to write the fold-annotated training table to disk (without the index column):
# df_train.to_csv("../input/train_folds.csv", index=False)