In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Move the working directory one level up so that the relative "Data//..."
# paths used by the following cells resolve.
# NOTE(review): not idempotent — re-running this cell climbs one more level.
# Presumably the notebook lives one folder below the project root; confirm.
os.chdir('../')

In [3]:
# Load the raw CSVs; every path is relative to the current working directory.
################### 2020 ########################
df_2020_train = pd.read_csv("Data//Raw CSV's//2020//train.csv")
df_2020_test = pd.read_csv("Data//Raw CSV's//2020//test.csv")

# Listing of duplicated images in the 2020 challenge data
df_2020_duplicates = pd.read_csv("Data//Raw CSV's//2020//Duplicates//2020_Challenge_duplicates.csv")

################### 2019 ######################
# GT = Ground Truth
df_2019_train_GT = pd.read_csv("Data//Raw CSV's//2019//ISIC_2019_Training_GroundTruth.csv")

# MD = MetaData
df_2019_train_MD = pd.read_csv("Data//Raw CSV's//2019//Patient MetaData//ISIC_2019_Training_Metadata.csv")
df_2019_test_MD = pd.read_csv("Data//Raw CSV's//2019//Patient MetaData//ISIC_2019_Test_Metadata.csv")

################### 2018 ######################
df_2018_valid_GT = pd.read_csv("Data//Raw CSV's//2018//Validation//ISIC2018_Task3_Validation_GroundTruth.csv")

# Transform 2020 Dataset

In [4]:
# Drop the 'benign_malignant' and 'target' columns from df_2020_train
df_2020_train = df_2020_train.drop(columns=['benign_malignant', 'target'])

In [5]:
# Map the free-text 2020 diagnoses onto the short label codes
# (UNK / NV / MEL / BKL); several source values share one code.
diagnosis_codes_2020 = {
    'unknown': 'UNK',
    'nevus': 'NV',
    'melanoma': 'MEL',
    'seborrheic keratosis': 'BKL',
    'lentigo NOS': 'BKL',
    'lichenoid keratosis': 'BKL',
    'solar lentigo': 'BKL',
    'cafe-au-lait macule': 'UNK',
    'atypical melanocytic proliferation': 'UNK',
}

# The nested dict restricts the replacement to the 'diagnosis' column.
df_2020_train = df_2020_train.replace({'diagnosis': diagnosis_codes_2020})

In [6]:
# Rename the 2020 columns; the identical mapping is applied to train and test.
column_renames_2020 = {
    'anatom_site_general_challenge': 'anatom_site_general',
    'image_name': 'image',
}

df_2020_train = df_2020_train.rename(columns=column_renames_2020)
df_2020_test = df_2020_test.rename(columns=column_renames_2020)

In [7]:
# Distinct values of the diagnosis column in the 2020 training table
df_2020_train.diagnosis.unique()

array(['UNK', 'NV', 'MEL', 'BKL'], dtype=object)

In [8]:
# Preview the first rows of the 2020 training table
df_2020_train.head()

Unnamed: 0,image,patient_id,sex,age_approx,anatom_site_general,diagnosis
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,UNK
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,UNK
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,NV
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,UNK
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,UNK


In [9]:
# Fill gaps in the 2020 training metadata: the two categorical columns get
# 'unknown', the approximate age gets the fixed value 50.
df_2020_train = df_2020_train.fillna({
    'anatom_site_general': 'unknown',
    'sex': 'unknown',
    'age_approx': 50,
})

# For the test table only the anatomical site is filled.
df_2020_test = df_2020_test.fillna({'anatom_site_general': 'unknown'})

In [10]:
# Count the remaining missing values per column in the 2020 test table
df_2020_test.isna().sum()

image                  0
patient_id             0
sex                    0
age_approx             0
anatom_site_general    0
dtype: int64

In [11]:
# Count the remaining missing values per column in the 2020 training table
df_2020_train.isna().sum()

image                  0
patient_id             0
sex                    0
age_approx             0
anatom_site_general    0
diagnosis              0
dtype: int64

# Remove Duplicates

In [12]:
# Split the duplicate listing by partition and renumber each subset from zero.
is_train_partition = df_2020_duplicates['partition'] == 'train'
is_test_partition = df_2020_duplicates['partition'] == 'test'

df_2020_duplicates_train = df_2020_duplicates[is_train_partition].reset_index(drop=True)
df_2020_duplicates_test = df_2020_duplicates[is_test_partition].reset_index(drop=True)

In [13]:
# (rows, columns) of the 2020 training table before the duplicates are removed
df_2020_train.shape

(33126, 6)

In [14]:
# (rows, columns) of the 2020 test table before the duplicates are removed
df_2020_test.shape

(10982, 5)

In [15]:
# Image ids flagged as duplicates, gathered into one plain list per partition
remove_train = [isic_id for isic_id in df_2020_duplicates_train['ISIC_id'].values]
remove_test = [isic_id for isic_id in df_2020_duplicates_test['ISIC_id'].values]

In [16]:
# Keep only the rows whose image id does not appear in the duplicate lists
train_is_duplicate = df_2020_train['image'].isin(remove_train)
test_is_duplicate = df_2020_test['image'].isin(remove_test)

df_2020_train = df_2020_train[~train_is_duplicate]
df_2020_test = df_2020_test[~test_is_duplicate]

In [17]:
# Shapes after duplicate removal — the test table is printed first, then train
print(df_2020_test.shape)
print(df_2020_train.shape)

(10875, 5)
(32701, 6)


In [18]:
# Preview the first rows of the de-duplicated 2020 test table
df_2020_test.head()

Unnamed: 0,image,patient_id,sex,age_approx,anatom_site_general
0,ISIC_0052060,IP_3579794,male,70.0,unknown
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity
2,ISIC_0058510,IP_7960270,female,55.0,torso
3,ISIC_0073313,IP_6375035,female,50.0,torso
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity


# Transform 2019 Dataset
 
*Note*: The 2019 dataset is a strict superset of the 2018 dataset, which means that all of the 2018 data is included in the 2019 dataset. [Reference](https://forum.isic-archive.com/t/isic-2018-vs-isic-2019-dataset/1102)


In [19]:
# Preview the 2019 ground truth in its original wide layout
df_2019_train_GT.head()

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Convert Wide to Long Format and keep only the assigned label of each image.
#
# The whole transformation is a single chain that returns new frames at every
# step. The previous version called drop(..., inplace=True) on a
# boolean-filtered slice, which can trigger pandas' SettingWithCopyWarning.
df_2019_train_GT_transformed = (
    df_2019_train_GT
    # one row per (image, diagnosis column) pair, flag stored in 'value'
    .melt(id_vars='image', var_name='diagnosis', value_name='value')
    .sort_values('image')
    .reset_index(drop=True)
    # Select rows when diagnosis == 1
    .loc[lambda long_gt: long_gt['value'] == 1]
    # the flag column is constant after the filter, so it is dropped
    .drop(columns='value')
    .reset_index(drop=True)
)

In [21]:
# Preview the long-format 2019 labels (one row per image)
df_2019_train_GT_transformed.head()

Unnamed: 0,image,diagnosis
0,ISIC_0000000,NV
1,ISIC_0000001,NV
2,ISIC_0000002,MEL
3,ISIC_0000003,NV
4,ISIC_0000004,MEL


In [22]:
# Join the 2019 labels with the training metadata on 'image' (default inner
# join), then discard the 'lesion_id' column.
df_2019_train_GT_combined = (
    df_2019_train_GT_transformed
    .merge(df_2019_train_MD, on='image')
    .drop(columns='lesion_id')
)

In [23]:
# Preview the 2019 training table after the labels and metadata are joined
df_2019_train_GT_combined.head()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,sex
0,ISIC_0000000,NV,55.0,anterior torso,female
1,ISIC_0000001,NV,30.0,anterior torso,female
2,ISIC_0000002,MEL,60.0,upper extremity,female
3,ISIC_0000003,NV,30.0,upper extremity,male
4,ISIC_0000004,MEL,80.0,posterior torso,male


In [24]:
# Fill gaps in the 2019 metadata. Categorical columns get 'unknown'; the
# approximate age gets a fixed value (55 for train, 60 for test).
df_2019_train_GT_combined = df_2019_train_GT_combined.fillna({
    'anatom_site_general': 'unknown',
    'sex': 'unknown',
    'age_approx': 55,
})

df_2019_test_MD = df_2019_test_MD.fillna({
    'anatom_site_general': 'unknown',
    'sex': 'unknown',
    'age_approx': 60,
})

In [25]:
# Count the remaining missing values per column in the 2019 training table
df_2019_train_GT_combined.isna().sum()

image                  0
diagnosis              0
age_approx             0
anatom_site_general    0
sex                    0
dtype: int64

In [26]:
# Count the remaining missing values per column in the 2019 test metadata
df_2019_test_MD.isna().sum()

image                  0
age_approx             0
anatom_site_general    0
sex                    0
dtype: int64

In [27]:
# (rows, columns) of the combined 2019 training table
df_2019_train_GT_combined.shape

(25331, 5)

In [28]:
# (rows, columns) of the 2019 test metadata
df_2019_test_MD.shape

(8238, 4)

# Transform 2018 Dataset

In [29]:
# Display the 2018 validation ground truth (whole frame; pandas abbreviates the middle rows)
df_2018_valid_GT

Unnamed: 0,image,MEL,NV,BCC,AKIEC,BKL,DF,VASC
0,ISIC_0034321,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0034322,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0034323,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,ISIC_0034324,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0034325,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
188,ISIC_0034519,0.0,1.0,0.0,0.0,0.0,0.0,0.0
189,ISIC_0034520,0.0,1.0,0.0,0.0,0.0,0.0,0.0
190,ISIC_0034521,0.0,0.0,0.0,0.0,1.0,0.0,0.0
191,ISIC_0034522,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Convert Wide to Long Format and keep only the assigned label of each image.
#
# The whole transformation is a single chain that returns new frames at every
# step. The previous version called drop(..., inplace=True) on a
# boolean-filtered slice, which can trigger pandas' SettingWithCopyWarning.
df_2018_valid_GT_transformed = (
    df_2018_valid_GT
    # one row per (image, diagnosis column) pair, flag stored in 'value'
    .melt(id_vars='image', var_name='diagnosis', value_name='value')
    .sort_values('image')
    .reset_index(drop=True)
    # Select rows when diagnosis == 1
    .loc[lambda long_gt: long_gt['value'] == 1]
    # the flag column is constant after the filter, so it is dropped
    .drop(columns='value')
    .reset_index(drop=True)
)

In [31]:
# Fold the 2018 classes BCC, AKIEC, DF and VASC into 'UNK'; other labels are kept.
to_unknown_2018 = {
    'BCC': 'UNK',
    'AKIEC': 'UNK',
    'DF': 'UNK',
    'VASC': 'UNK',
}

# The nested dict restricts the replacement to the 'diagnosis' column.
df_2018_valid_GT_transformed = df_2018_valid_GT_transformed.replace({'diagnosis': to_unknown_2018})

In [32]:
# Preview the 2018 validation labels after the class folding
df_2018_valid_GT_transformed.head()

Unnamed: 0,image,diagnosis
0,ISIC_0034321,NV
1,ISIC_0034322,NV
2,ISIC_0034323,UNK
3,ISIC_0034324,NV
4,ISIC_0034325,NV


# Combining Dataframe

In [33]:
# Look at the 2019 training columns before combining with 2020
df_2019_train_GT_combined.head()

Unnamed: 0,image,diagnosis,age_approx,anatom_site_general,sex
0,ISIC_0000000,NV,55.0,anterior torso,female
1,ISIC_0000001,NV,30.0,anterior torso,female
2,ISIC_0000002,MEL,60.0,upper extremity,female
3,ISIC_0000003,NV,30.0,upper extremity,male
4,ISIC_0000004,MEL,80.0,posterior torso,male


In [34]:
# Look at the 2020 training columns before combining with 2019
df_2020_train.head()

Unnamed: 0,image,patient_id,sex,age_approx,anatom_site_general,diagnosis
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,UNK
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,UNK
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,NV
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,UNK
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,UNK


In [35]:
# Pick the five columns used for the combined table, in a fixed order
# (this leaves out 'patient_id').
df_2020_select = df_2020_train.loc[:, ['image', 'sex', 'age_approx', 'anatom_site_general', 'diagnosis']]

In [36]:
# Reorder the 2019 columns so they line up with the 2020 selection
df_2019_select = df_2019_train_GT_combined.loc[:, ['image', 'sex', 'age_approx', 'anatom_site_general', 'diagnosis']]

In [37]:
# PD = Patient Demographic
# Stack the 2020 rows on top of the 2019 rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_2020_2019_PD = pd.concat([df_2020_select, df_2019_select], ignore_index=True)

In [38]:
# Preview the combined 2020 + 2019 table with patient demographics
df_2020_2019_PD.head()

Unnamed: 0,image,sex,age_approx,anatom_site_general,diagnosis
0,ISIC_2637011,male,45.0,head/neck,UNK
1,ISIC_0015719,female,45.0,upper extremity,UNK
2,ISIC_0052212,female,50.0,lower extremity,NV
3,ISIC_0068279,female,45.0,head/neck,UNK
4,ISIC_0074268,female,55.0,upper extremity,UNK


In [39]:
# Label-only views (image id + diagnosis) of both training tables
df_2020_select_2 = df_2020_train.loc[:, ['image', 'diagnosis']]
df_2019_select_2 = df_2019_train_GT_combined.loc[:, ['image', 'diagnosis']]

In [40]:
# Stack the 2020 and 2019 label-only tables with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_2020_2019_9_label = pd.concat([df_2020_select_2, df_2019_select_2], ignore_index=True)

In [41]:
# Build the 4-label table from the 9-label one by folding BCC, AK, SCC, VASC
# and DF into 'UNK'. The 9-label frame itself is not modified.
to_unknown_4_label = {
    'BCC': 'UNK',
    'AK': 'UNK',
    'SCC': 'UNK',
    'VASC': 'UNK',
    'DF': 'UNK',
}

# The nested dict restricts the replacement to the 'diagnosis' column.
df_2020_2019_4_label = df_2020_2019_9_label.replace({'diagnosis': to_unknown_4_label})

*Note*: We only use the 2020 test set because cropped images are not available for the 2019 dataset.

In [42]:
# Two views of the 2020 test set: with patient demographics and image id only
df_2020_test_PD = df_2020_test.loc[:, ['image', 'sex', 'age_approx', 'anatom_site_general']]
df_2020_test_no_PD = df_2020_test.loc[:, ['image']]
#df_2019_test_select = df_2019_test_MD[['image', 'sex', 'age_approx', 'anatom_site_general']]

In [43]:
# PD = Patient Demographic
# Disabled on purpose: the 2019 test set is not merged into the test output
# (see the note above the previous cell).
# NOTE(review): df_2020_test_select and df_2019_test_select are not defined in
# any active cell, so this line cannot simply be uncommented.
#df_2020_2019_PD_test = df_2020_test_select.append(df_2019_test_select, ignore_index=True)

In [44]:
# Preview the 2020 test view that includes patient demographics
df_2020_test_PD.head()

Unnamed: 0,image,sex,age_approx,anatom_site_general
0,ISIC_0052060,male,70.0,unknown
1,ISIC_0052349,male,40.0,lower extremity
2,ISIC_0058510,female,55.0,torso
3,ISIC_0073313,female,50.0,torso
4,ISIC_0073502,female,45.0,lower extremity


In [45]:
# Preview the 2020 test view that holds only the image id
df_2020_test_no_PD.head()

Unnamed: 0,image
0,ISIC_0052060
1,ISIC_0052349
2,ISIC_0058510
3,ISIC_0073313
4,ISIC_0073502


In [46]:
# Both test views are column selections of the same frame, so their row counts should match
print(df_2020_test_PD.shape)
print(df_2020_test_no_PD.shape)

(10875, 4)
(10875, 1)


# Remove rows from the CSVs whose images are corrupted

In [47]:
# Image ids whose files are corrupted. Add further ids here to exclude them
# from every training table at once.
CORRUPTED_IMAGES = ['ISIC_0066580']


def drop_corrupted_images(frame, corrupted):
    """Return a copy of `frame` without the rows of corrupted images.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with an 'image' column holding the image ids.
    corrupted : list-like
        Image ids to exclude.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index; `frame` is not modified.
    """
    return frame[~frame['image'].isin(corrupted)].reset_index(drop=True)


# Only the three training tables are filtered here; the test and validation
# tables are left unchanged by this cell.

############ Train ##########
# Train table with patient demographics
df_2020_2019_PD = drop_corrupted_images(df_2020_2019_PD, CORRUPTED_IMAGES)

# Train table without patient demographics, 9 labels
df_2020_2019_9_label = drop_corrupted_images(df_2020_2019_9_label, CORRUPTED_IMAGES)

# Train table without patient demographics, 4 labels
df_2020_2019_4_label = drop_corrupted_images(df_2020_2019_4_label, CORRUPTED_IMAGES)

# Save to CSV

In [48]:
# (rows, columns) of the 9-label training table after the corrupted image is removed
df_2020_2019_9_label.shape

(58031, 2)

In [49]:
save_path = "Data//Processed CSV's//"

# Create the output folder up front; DataFrame.to_csv does not create missing
# directories and would otherwise fail on the first write.
os.makedirs(save_path, exist_ok=True)

# (file name, frame) pairs, written in this order. The combined 2020 + 2019
# test export stays disabled because df_2020_2019_PD_test is never built.
csv_outputs = [
    ####### Test ########
    # Test with Patient Demographic
    ('test_2020_withPateintDetail.csv', df_2020_test_PD),
    # Test no Patient Demographic
    ('test_2020_no_PateintDetail.csv', df_2020_test_no_PD),
    ############ Train ##########
    # Train with patient Demographic
    ('train_2020_and_2019_withPateintDetail_9_labels.csv', df_2020_2019_PD),
    # Train with no patient Demographic and 9 labels
    ('train_2020_and_2019_with_9_Labels.csv', df_2020_2019_9_label),
    # Train with no patient Demographic and 4 labels
    ('train_2020_and_2019_with_4_Labels.csv', df_2020_2019_4_label),
    # Validation set
    ('valid_2018_noPateintDetail.csv', df_2018_valid_GT_transformed),
]

for file_name, frame in csv_outputs:
    # Header row is written; the DataFrame index is not.
    frame.to_csv(save_path + file_name, header=True, index=False)