In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data

In [62]:
# define data path
data_path = 'ISIC2024_data/'

In [63]:
path_train_meta = data_path + 'train-metadata.csv'
path_test_meta = data_path + 'test-metadata.csv'

In [64]:
# load the csv as pandas dataframe
df_train_meta = pd.read_csv(path_train_meta)
df_test_meta = pd.read_csv(path_test_meta)

  df_train_meta = pd.read_csv(path_train_meta)


In [65]:
# get shared columns
for feature in set(df_train_meta.columns).intersection(set(df_test_meta.columns)):
    print(feature)

tbp_lv_deltaA
isic_id
tbp_lv_z
age_approx
tbp_tile_type
tbp_lv_y
image_type
attribution
anatom_site_general
tbp_lv_areaMM2
tbp_lv_color_std_mean
tbp_lv_H
tbp_lv_deltaLB
tbp_lv_area_perim_ratio
clin_size_long_diam_mm
tbp_lv_symm_2axis
tbp_lv_L
tbp_lv_eccentricity
tbp_lv_deltaLBnorm
tbp_lv_Bext
tbp_lv_location_simple
tbp_lv_minorAxisMM
tbp_lv_Aext
tbp_lv_perimeterMM
tbp_lv_Cext
tbp_lv_x
tbp_lv_norm_color
tbp_lv_deltaB
sex
tbp_lv_deltaL
tbp_lv_B
tbp_lv_stdL
tbp_lv_C
tbp_lv_symm_2axis_angle
copyright_license
tbp_lv_Lext
tbp_lv_nevi_confidence
tbp_lv_stdLExt
tbp_lv_A
tbp_lv_norm_border
tbp_lv_Hext
tbp_lv_location
tbp_lv_radial_color_std_max
patient_id


We can see that `tbp_lv_deltaA = tbp_lv_A-tbp_lv_Aext`

In [66]:
def feature_engineering(df_data):
    # the input is a dataFrame that has shared columns of train and test data
    ## Color
    df_data['tbp_lv_deltaH'] = df_data['tbp_lv_H'] - df_data['tbp_lv_Hext']
    df_data['tbp_lv_deltaC'] = df_data['tbp_lv_C'] - df_data['tbp_lv_Cext']
    df_data['tbp_lv_deltastdL'] = df_data['tbp_lv_stdL'] - df_data['tbp_lv_stdLExt']
    
    ## Diameter
    df_data['average_diameter'] = (df_data['clin_size_long_diam_mm'] + df_data['tbp_lv_minorAxisMM'])/2
    
    
    ## Others
    # 3D distance of the lesion
    df_data['3D_distance'] = np.sqrt(df_data['tbp_lv_x']**2 + df_data['tbp_lv_y']**2 + df_data['tbp_lv_z']**2)
    # size contrast
    df_data['size_contrast'] = df_data['clin_size_long_diam_mm'] / df_data['tbp_lv_minorAxisMM']
    
    columns_new = [
        # Basic info
        'isic_id',
        'sex',
        'anatom_site_general',
        'tbp_lv_location_simple',
        
        'age_approx',
        
        # Shape
        'tbp_lv_perimeterMM',
        'tbp_lv_areaMM2',
        'tbp_lv_eccentricity',
        
        # Color space
        'tbp_lv_A','tbp_lv_deltaA',
        'tbp_lv_B','tbp_lv_deltaB',
        'tbp_lv_L','tbp_lv_deltaL',
        'tbp_lv_H','tbp_lv_deltaH',
        'tbp_lv_C','tbp_lv_deltaC',
        'tbp_lv_deltastdL',
        
        'tbp_lv_color_std_mean',# color irregularity
        'tbp_lv_radial_color_std_max', # color asymmetry
        'tbp_lv_deltaLBnorm',
        
        # Asymmetry
        'tbp_lv_symm_2axis',
        'tbp_lv_symm_2axis_angle',
        
        # Border irregularity
        'tbp_lv_area_perim_ratio',
        
        # Diameter
        'average_diameter',
        
        # Others
        '3D_distance',
        'size_contrast',
        'tbp_lv_nevi_confidence',
        
        
    ]
    
    df_data_new = df_data[columns_new]
    return df_data_new

Use the training data where only there is a valid lesion

In [67]:
common_columns = list(set(df_train_meta.columns).intersection(set(df_test_meta.columns)))
categorical_columns = ['sex','anatom_site_general','tbp_lv_location_simple']

In [68]:
df_train_valid_lesion = df_train_meta[df_train_meta['lesion_id'].notna()][common_columns]

In [69]:
df_train_valid_lesion.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22058 entries, 1 to 401056
Data columns (total 44 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   tbp_lv_deltaA                22058 non-null  float64
 1   isic_id                      22058 non-null  object 
 2   tbp_lv_z                     22058 non-null  float64
 3   age_approx                   21894 non-null  float64
 4   tbp_tile_type                22058 non-null  object 
 5   tbp_lv_y                     22058 non-null  float64
 6   image_type                   22058 non-null  object 
 7   attribution                  22058 non-null  object 
 8   anatom_site_general          21917 non-null  object 
 9   tbp_lv_areaMM2               22058 non-null  float64
 10  tbp_lv_color_std_mean        22058 non-null  float64
 11  tbp_lv_H                     22058 non-null  float64
 12  tbp_lv_deltaLB               22058 non-null  float64
 13  tbp_lv_area_perim_ra

In [70]:
df_train_engineered = feature_engineering(df_train_valid_lesion)

In [71]:
df_train_engineered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22058 entries, 1 to 401056
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   isic_id                      22058 non-null  object 
 1   sex                          21467 non-null  object 
 2   anatom_site_general          21917 non-null  object 
 3   tbp_lv_location_simple       22058 non-null  object 
 4   age_approx                   21894 non-null  float64
 5   tbp_lv_perimeterMM           22058 non-null  float64
 6   tbp_lv_areaMM2               22058 non-null  float64
 7   tbp_lv_eccentricity          22058 non-null  float64
 8   tbp_lv_A                     22058 non-null  float64
 9   tbp_lv_deltaA                22058 non-null  float64
 10  tbp_lv_B                     22058 non-null  float64
 11  tbp_lv_deltaB                22058 non-null  float64
 12  tbp_lv_L                     22058 non-null  float64
 13  tbp_lv_deltaL       

In [72]:
def fill_missing_value(engineered):
    # input a already data engineered dataframe
    # fill sex missing value
    engineered['sex'] = engineered['sex'].fillna(engineered['sex'].mode()[0])
    # fill anatom_site_general missing value
    engineered['anatom_site_general'] = engineered['anatom_site_general'].fillna(engineered['anatom_site_general'].mode()[0])
    # fill age_approx missing value with mean
    engineered['age_approx'] = engineered['age_approx'].fillna(engineered['age_approx'].mean())
    
    return engineered

In [73]:
df_train_engineered_filled = fill_missing_value(df_train_engineered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered['sex'] = engineered['sex'].fillna(engineered['sex'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered['anatom_site_general'] = engineered['anatom_site_general'].fillna(engineered['anatom_site_general'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineere

In [74]:
df_train_engineered_filled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22058 entries, 1 to 401056
Data columns (total 29 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   isic_id                      22058 non-null  object 
 1   sex                          22058 non-null  object 
 2   anatom_site_general          22058 non-null  object 
 3   tbp_lv_location_simple       22058 non-null  object 
 4   age_approx                   22058 non-null  float64
 5   tbp_lv_perimeterMM           22058 non-null  float64
 6   tbp_lv_areaMM2               22058 non-null  float64
 7   tbp_lv_eccentricity          22058 non-null  float64
 8   tbp_lv_A                     22058 non-null  float64
 9   tbp_lv_deltaA                22058 non-null  float64
 10  tbp_lv_B                     22058 non-null  float64
 11  tbp_lv_deltaB                22058 non-null  float64
 12  tbp_lv_L                     22058 non-null  float64
 13  tbp_lv_deltaL       

In [75]:
# feature engineer the test set
df_test_engineered = feature_engineering(df_test_meta)

In [76]:
df_test_engineered

Unnamed: 0,isic_id,sex,anatom_site_general,tbp_lv_location_simple,age_approx,tbp_lv_perimeterMM,tbp_lv_areaMM2,tbp_lv_eccentricity,tbp_lv_A,tbp_lv_deltaA,...,tbp_lv_color_std_mean,tbp_lv_radial_color_std_max,tbp_lv_deltaLBnorm,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_area_perim_ratio,average_diameter,3D_distance,size_contrast,tbp_lv_nevi_confidence
0,ISIC_0015657,male,posterior torso,Torso Back,45.0,9.387248,3.846876,0.664465,22.80433,2.797056,...,0.461149,0.304827,6.843057,0.479339,20,22.90701,2.443822,1523.426592,1.234204,0.01698104
1,ISIC_0015729,female,lower extremity,Left Leg,35.0,6.340311,2.120473,0.926698,16.64867,6.990705,...,0.0,0.0,6.083388,0.42623,25,18.957821,1.776333,639.662302,2.440286,0.2107364
2,ISIC_0015740,male,posterior torso,Torso Back,65.0,8.130868,3.39651,0.894776,24.25384,4.316465,...,0.251236,0.230742,5.446997,0.366071,110,19.4644,2.340393,1307.012048,2.077873,8.052259e-13


In [94]:
df_train_engineered_filled_drop_isic = df_train_engineered_filled.drop(columns=['isic_id'])
df_test_engineered_drop_isic = df_test_engineered.drop(columns=['isic_id'])

## One hot encoding

In [78]:
from sklearn.preprocessing import OneHotEncoder

In [79]:
concat_train_submission = pd.concat([df_train_engineered_filled_drop_isic, df_test_engineered_drop_isic], axis=0)

In [80]:
concat_train_submission.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22061 entries, 1 to 2
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sex                          22061 non-null  object 
 1   anatom_site_general          22061 non-null  object 
 2   tbp_lv_location_simple       22061 non-null  object 
 3   age_approx                   22061 non-null  float64
 4   tbp_lv_perimeterMM           22061 non-null  float64
 5   tbp_lv_areaMM2               22061 non-null  float64
 6   tbp_lv_eccentricity          22061 non-null  float64
 7   tbp_lv_A                     22061 non-null  float64
 8   tbp_lv_deltaA                22061 non-null  float64
 9   tbp_lv_B                     22061 non-null  float64
 10  tbp_lv_deltaB                22061 non-null  float64
 11  tbp_lv_L                     22061 non-null  float64
 12  tbp_lv_deltaL                22061 non-null  float64
 13  tbp_lv_H                 

In [82]:
concat_train_submission[categorical_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 22061 entries, 1 to 2
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   sex                     22061 non-null  object
 1   anatom_site_general     22061 non-null  object
 2   tbp_lv_location_simple  22061 non-null  object
dtypes: object(3)
memory usage: 689.4+ KB


In [83]:
# one hot encoding
encoder = OneHotEncoder(sparse=False)
encoder.fit(concat_train_submission[categorical_columns])

train_encoded = encoder.transform(df_train_engineered_filled_drop_isic[categorical_columns])
test_encoded = encoder.transform(df_test_engineered_drop_isic[categorical_columns])

train_encoded = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(categorical_columns))
test_encoded = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(categorical_columns))

train_encoded.index = df_train_engineered_filled_drop_isic.index
test_encoded.index = df_test_engineered_drop_isic.index




In [84]:
train_encoded

Unnamed: 0,sex_female,sex_male,anatom_site_general_anterior torso,anatom_site_general_head/neck,anatom_site_general_lower extremity,anatom_site_general_posterior torso,anatom_site_general_upper extremity,tbp_lv_location_simple_Head & Neck,tbp_lv_location_simple_Left Arm,tbp_lv_location_simple_Left Leg,tbp_lv_location_simple_Right Arm,tbp_lv_location_simple_Right Leg,tbp_lv_location_simple_Torso Back,tbp_lv_location_simple_Torso Front,tbp_lv_location_simple_Unknown
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
76,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
93,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
107,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400997,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
401019,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
401028,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
401054,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [85]:
test_encoded

Unnamed: 0,sex_female,sex_male,anatom_site_general_anterior torso,anatom_site_general_head/neck,anatom_site_general_lower extremity,anatom_site_general_posterior torso,anatom_site_general_upper extremity,tbp_lv_location_simple_Head & Neck,tbp_lv_location_simple_Left Arm,tbp_lv_location_simple_Left Leg,tbp_lv_location_simple_Right Arm,tbp_lv_location_simple_Right Leg,tbp_lv_location_simple_Torso Back,tbp_lv_location_simple_Torso Front,tbp_lv_location_simple_Unknown
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [95]:
df_train_non_categorical = df_train_engineered_filled_drop_isic.drop(columns=categorical_columns).copy()
df_test_non_categorical = df_test_engineered_drop_isic.drop(columns=categorical_columns).copy()

In [96]:
df_train_non_categorical

Unnamed: 0,age_approx,tbp_lv_perimeterMM,tbp_lv_areaMM2,tbp_lv_eccentricity,tbp_lv_A,tbp_lv_deltaA,tbp_lv_B,tbp_lv_deltaB,tbp_lv_L,tbp_lv_deltaL,...,tbp_lv_color_std_mean,tbp_lv_radial_color_std_max,tbp_lv_deltaLBnorm,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_area_perim_ratio,average_diameter,3D_distance,size_contrast,tbp_lv_nevi_confidence
1,60.0,3.354148,0.919497,0.639885,31.712570,6.347830,26.331000,1.781713,48.861520,-6.500838,...,0.000000,0.000000,4.987244,0.285714,55,12.235290,0.960959,1576.723962,1.338333,1.334303e-07
49,40.0,26.919133,31.975980,0.900678,24.237019,9.945246,28.042227,1.076805,46.763747,-25.334512,...,3.947276,3.215515,15.996656,0.144565,105,22.662001,7.257971,1073.046060,1.946836,9.964820e+01
76,50.0,22.729299,28.617001,0.338695,19.600317,2.792177,29.830666,-1.369085,56.975435,-11.353676,...,1.522630,1.807261,7.462467,0.160248,10,18.052941,6.183920,1377.119802,1.068948,2.394399e+01
93,40.0,9.340242,5.948583,0.771271,18.007100,7.260954,24.833930,2.673692,35.715100,-13.454040,...,1.255051,1.006729,11.708370,0.190184,140,14.665700,2.718808,624.503235,1.555731,9.946209e+01
107,40.0,27.874004,46.162507,0.641768,21.315824,11.781624,30.914309,1.940967,46.968562,-14.629774,...,3.170209,3.187016,10.669963,0.117188,175,16.830977,7.949130,1036.419568,1.321503,9.902058e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400997,40.0,12.580907,10.602364,0.368088,16.600171,8.612198,24.960904,2.458680,33.987772,-14.868723,...,2.215048,1.429700,13.053903,0.173633,15,14.928672,3.709470,404.510962,1.108288,9.979950e+01
401019,50.0,17.244140,19.703510,0.602607,12.549400,2.753092,19.351570,-0.862427,47.683380,-21.763030,...,5.393046,5.922404,14.266840,0.087591,25,15.091750,4.974337,1115.339450,1.241361,1.932614e+01
401028,50.0,21.046520,23.531620,0.690787,17.290400,7.059696,31.807810,2.683314,46.497300,-9.201123,...,1.714663,1.654323,7.489153,0.272476,140,18.823870,5.773835,594.330682,1.269736,6.467569e+01
401054,70.0,20.210836,22.893601,0.744495,22.574335,7.629668,27.663259,0.896124,36.333547,-17.008450,...,2.579861,2.328066,13.498163,0.288920,100,17.842449,5.838999,1197.740551,1.394015,9.936233e+01


## Standardization for non categorcal columns

In [97]:
from sklearn.preprocessing import StandardScaler

# train test split
from sklearn.model_selection import train_test_split

In [98]:
# get the train target
df_train_meta_target = df_train_meta[df_train_meta['lesion_id'].notna()]['target']
df_train_meta_target

1         0
49        0
76        0
93        0
107       0
         ..
400997    0
401019    0
401028    0
401054    0
401056    0
Name: target, Length: 22058, dtype: int64

In [99]:
df_train_engineered_filled_drop_isic

Unnamed: 0,sex,anatom_site_general,tbp_lv_location_simple,age_approx,tbp_lv_perimeterMM,tbp_lv_areaMM2,tbp_lv_eccentricity,tbp_lv_A,tbp_lv_deltaA,tbp_lv_B,...,tbp_lv_color_std_mean,tbp_lv_radial_color_std_max,tbp_lv_deltaLBnorm,tbp_lv_symm_2axis,tbp_lv_symm_2axis_angle,tbp_lv_area_perim_ratio,average_diameter,3D_distance,size_contrast,tbp_lv_nevi_confidence
1,male,head/neck,Head & Neck,60.0,3.354148,0.919497,0.639885,31.712570,6.347830,26.331000,...,0.000000,0.000000,4.987244,0.285714,55,12.235290,0.960959,1576.723962,1.338333,1.334303e-07
49,female,posterior torso,Torso Back,40.0,26.919133,31.975980,0.900678,24.237019,9.945246,28.042227,...,3.947276,3.215515,15.996656,0.144565,105,22.662001,7.257971,1073.046060,1.946836,9.964820e+01
76,male,upper extremity,Left Arm,50.0,22.729299,28.617001,0.338695,19.600317,2.792177,29.830666,...,1.522630,1.807261,7.462467,0.160248,10,18.052941,6.183920,1377.119802,1.068948,2.394399e+01
93,female,lower extremity,Right Leg,40.0,9.340242,5.948583,0.771271,18.007100,7.260954,24.833930,...,1.255051,1.006729,11.708370,0.190184,140,14.665700,2.718808,624.503235,1.555731,9.946209e+01
107,female,posterior torso,Torso Back,40.0,27.874004,46.162507,0.641768,21.315824,11.781624,30.914309,...,3.170209,3.187016,10.669963,0.117188,175,16.830977,7.949130,1036.419568,1.321503,9.902058e+01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400997,male,lower extremity,Left Leg,40.0,12.580907,10.602364,0.368088,16.600171,8.612198,24.960904,...,2.215048,1.429700,13.053903,0.173633,15,14.928672,3.709470,404.510962,1.108288,9.979950e+01
401019,female,anterior torso,Torso Front,50.0,17.244140,19.703510,0.602607,12.549400,2.753092,19.351570,...,5.393046,5.922404,14.266840,0.087591,25,15.091750,4.974337,1115.339450,1.241361,1.932614e+01
401028,female,lower extremity,Left Leg,50.0,21.046520,23.531620,0.690787,17.290400,7.059696,31.807810,...,1.714663,1.654323,7.489153,0.272476,140,18.823870,5.773835,594.330682,1.269736,6.467569e+01
401054,male,anterior torso,Torso Front,70.0,20.210836,22.893601,0.744495,22.574335,7.629668,27.663259,...,2.579861,2.328066,13.498163,0.288920,100,17.842449,5.838999,1197.740551,1.394015,9.936233e+01


In [100]:
# train test split from train data
X_train, X_val, y_train, y_val = train_test_split(df_train_engineered_filled_drop_isic, df_train_meta_target, test_size=0.2, random_state=42)

In [101]:
X_train_non_categorical = X_train.drop(columns=categorical_columns).copy()
X_val_non_categorical = X_val.drop(columns=categorical_columns).copy()

In [104]:
# standardization
scaler = StandardScaler()
scaler.fit(X_train_non_categorical)

X_train_non_categorical_scaled = scaler.transform(X_train_non_categorical)
X_val_non_categorical_scaled = scaler.transform(X_val_non_categorical)

X_train_non_categorical_scaled = pd.DataFrame(X_train_non_categorical_scaled, columns=X_train_non_categorical.columns)
X_val_non_categorical_scaled = pd.DataFrame(X_val_non_categorical_scaled, columns=X_val_non_categorical.columns)

X_train_non_categorical_scaled.index = X_train_non_categorical.index
X_val_non_categorical_scaled.index = X_val_non_categorical.index

In [105]:
# concat the one hot encoded and scaled non categorical columns
X_train_final = pd.concat([X_train_non_categorical_scaled, train_encoded.loc[X_train_non_categorical.index]], axis=1)
X_val_final = pd.concat([X_val_non_categorical_scaled, train_encoded.loc[X_val_non_categorical.index]], axis=1)

evaluation function

In [106]:
from sklearn.metrics import roc_auc_score
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    v_gt = abs(np.asarray(solution.values)-1)
    v_pred = np.array([1.0 - x for x in submission.values])
    max_fpr = abs(1-min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

## Model training

In [108]:
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

In [243]:
lgb_params = {
    'objective': 'binary',
    "random_state": 42,
    "n_estimators": 500,
    'learning_rate': 0.01,
    'bagging_freq': 1,
    'pos_bagging_fraction': 0.75,
    'neg_bagging_fraction': 0.05,
    'feature_fraction': 0.8,
    'lambda_l1': 0.25,
    'lambda_l2': 0.25,
    "verbosity": -1,
    # "extra_trees": True
}

In [244]:
lgb_model = lgb.LGBMClassifier(**lgb_params)

In [245]:
lgb_model.fit(X_train_final, y_train)

In [246]:
# predict on validation set
y_val_pred = lgb_model.predict_proba(X_val_final)[:,1]

# the score
score = comp_score(y_val, pd.DataFrame(y_val_pred, index=y_val.index, columns=['target']), '')
score

0.11379965212452518

https://stackoverflow.com/questions/45815708/what-are-different-options-for-objective-functions-available-in-xgboost-xgbclass 

In [222]:
xgb_params = {
    'objective': 'binary:logistic',
    'random_state': 42,
    'n_estimators': 500,
    'learning_rate': 0.04,
    'reg_alpha':1,
    'reg_lambda':1,
    'max_depth': 15,
    'booster':'gbtree',
    'eval_metric':'auc',
}

In [223]:
xgb_model = XGBClassifier(**xgb_params)
# xgb_model = XGBClassifier()

In [224]:
xgb_model.fit(X_train_final, y_train)

In [225]:
# predict on validation set
y_val_pred_xgb = xgb_model.predict_proba(X_val_final)[:,1]

In [226]:
# the score
score_xgb = comp_score(y_val, pd.DataFrame(y_val_pred_xgb, index=y_val.index, columns=['target']), '')
score_xgb

0.10799166992060391

## Submission

In [228]:
# fit the standard scaler on the test data non categorical columns
test_non_categorical_scaled = scaler.transform(df_test_non_categorical)
test_non_categorical_scaled = pd.DataFrame(test_non_categorical_scaled, columns=df_test_non_categorical.columns)
test_non_categorical_scaled.index = df_test_non_categorical.index

# concate with the one hot encoded test data
test_final = pd.concat([test_non_categorical_scaled, test_encoded], axis=1)

# predict on the test data
y_test_pred = lgb_model.predict_proba(test_final)[:,1]


In [229]:
# show the data
df_test_meta['target'] = y_test_pred

df_test_meta[['isic_id','target']]

Unnamed: 0,isic_id,target
0,ISIC_0015657,0.265434
1,ISIC_0015729,0.011572
2,ISIC_0015740,0.328049


In [230]:
# save the submission
df_test_meta[['isic_id','target']].to_csv('submission.csv', index=False)