## Note

Please unzip the attached `data.zip` file before running this notebook.

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.ensemble import IsolationForest

### Variable Nomenclature

> `X * m = y`
> 
> `X` is a matrix of Features
> `m` is the model
> `y` is the matrix of Target variable, diabetes_mellitus.

`X` is Labelled data  
`u` is Unlabelled data  
`y` is the Target variable. Also called `dm`.    
`Xy` is Labelled data and Target variable  
`Xu` is Labelled and Unlabelled data. Also called `df`.  

# Preprocessing

### Reading data and dropping unuseable columns
`df` dataframe has Features from Labelled and Unlabelled, `dm` has the Target variable, `diabetes_mellitus`.

In [4]:
# Load the labelled (training) and unlabelled (to-be-predicted) samples.
Xy = pd.read_csv('inputs/TrainingWiDS2021.csv')
u = pd.read_csv('inputs/UnlabeledWiDS2021.csv')

# Split the labelled frame into the target series and the feature frame.
dm = Xy['diabetes_mellitus']
X = Xy.drop(columns='diabetes_mellitus')

# Row counts are kept so the combined frame can be split again later:
# the first `nsamples_labelled` rows of `df` are the labelled ones.
nsamples_labelled = len(X)
nsamples_unlabelled = len(u)

# Stack labelled rows on top of unlabelled rows so that both receive the same
# preprocessing, then discard the columns that cannot be used as features.
df = pd.concat([X, u], ignore_index=True).drop(columns=[
    'Unnamed: 0',         # Excel row number
    'readmission_status', # Is always 0
    'hospital_id',        # No common IDs between Labelled and Unlabelled
    'icu_id',             # No common IDs between Labelled and Unlabelled
])

print('Loaded samples from CSV.')

Loaded samples from CSV.


### Drop columns with too much missing data

In [3]:
# Age 0 is probably missing data
df.loc[df['age'] == 0, 'age'] = np.nan

# Number of missing values per column, most-missing first.
pq = df.isna().sum().sort_values(ascending=False)
nan_stats = list(zip(pq.keys(), pq.values))
del pq

# Columns with MORE than this many missing values are dropped; every other
# column (including one with exactly this many) is kept. The two lists below
# partition the columns, so their lengths always add up to the column count.
max_nans_per_column = 30000
cols_dropped = [name for name, n_missing in nan_stats if n_missing > max_nans_per_column]
cols_allowed = [name for name, n_missing in nan_stats if n_missing <= max_nans_per_column]
df = df.drop(columns=cols_dropped)

print(f'Dropped {len(cols_dropped)} columns with more than {max_nans_per_column} missing values. {len(cols_allowed)} remaining.')

Dropped 78 columns with more than 30000 missing values. 98 remaining.


### Encode Category columns and fix inconsistent Min/Max columns
`Strings` cannot be used for prediction, so let's replace them with a number for each category.  
Some Min/Max columns have minimum value greater than maximum. Swap those values.

In [4]:
# Replace every string-typed (object dtype) column with integer category codes.
object_columns = df.columns[df.dtypes == object]
for column in object_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Repair inverted Min/Max pairs: wherever `<stem>_min` exceeds `<stem>_max`,
# exchange the two values on that row.
columns_min_max = [name[:-4] for name in df.columns if name.endswith('_min')]
for stem in columns_min_max:
    min_name, max_name = f'{stem}_min', f'{stem}_max'
    inverted = df[min_name] > df[max_name]  # boolean row mask for this pair
    # `.values` strips the labels so the assignment is positional; without it
    # pandas would align on column names and undo the swap.
    swapped = df.loc[inverted, [max_name, min_name]].values
    df.loc[inverted, [min_name, max_name]] = swapped

print('Processed Age and Min/Max columns.')

Processed Age and Min/Max columns.


### Impute missing values

This imputation takes a lot of time, up to an hour.  
You can limit the number of imputation iterations (the imputer's `max_iter` parameter),
or simply keep `load_from_cache = True` to load the already-imputed dataset and save time.

In [5]:
load_from_cache = True

if load_from_cache:
    print('(loading from cache)')
    df = pd.read_csv('cache/imputation.csv') # loading already imputed samples to save time.
else:
    # encounter_id and diabetes_mellitus must not influence the imputation, so
    # they are set aside and re-attached afterwards. Only the columns actually
    # present are set aside: the target is already removed from `df` when the
    # data is loaded, so looking it up unconditionally would raise a KeyError.
    id_cols = [c for c in ['encounter_id'] if c in df.columns]
    target_cols = [c for c in ['diabetes_mellitus'] if c in df.columns]
    ids = df[id_cols]
    targets = df[target_cols]
    features = df.drop(columns=id_cols + target_cols)

    imp = IterativeImputer(random_state=0, verbose=2)
    df_imp = imp.fit_transform(features)

    # fit_transform returns a bare array; rebuild the frame using the names of
    # the columns that were imputed (not all of df.columns, which also lists
    # the set-aside columns and would not match the array's width).
    imputed = pd.DataFrame(df_imp, columns=features.columns, index=features.index)

    # Re-attach in the original layout: id first, features, then target (if any).
    # NOTE(review): the cached frame shown in this notebook's output starts with
    # `age`, not `encounter_id`, so the shipped cache may have been produced
    # without re-adding the id - confirm which layout is intended.
    df = pd.concat([ids, imputed, targets], axis=1)

    df.to_csv('cache/imputation.csv', index=False)

print('Imputed missing values.')
display(df)

(loading from cache)
Imputed missing values.


Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,d1_sodium_min,d1_wbc_max,d1_wbc_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
0,68.0,22.732803,0.0,2.0,1.0,180.3,4.0,1.0,0.0,2.0,...,134.000000,14.100000,14.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,77.0,27.421875,0.0,2.0,0.0,160.0,4.0,1.0,0.0,5.0,...,145.000000,23.300000,12.700000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25.0,31.952749,0.0,2.0,0.0,172.7,3.0,0.0,0.0,5.0,...,137.654238,10.862412,10.682011,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,81.0,22.635548,1.0,2.0,0.0,165.1,8.0,2.0,0.0,2.0,...,136.776912,9.000000,8.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.0,29.577357,0.0,2.0,1.0,188.0,15.0,0.0,0.0,5.0,...,137.668405,10.662703,10.230670,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140386,36.0,37.500000,0.0,2.0,0.0,170.1,4.0,1.0,0.0,5.0,...,137.000000,11.700000,7.200000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140387,61.0,32.100000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,139.000000,11.200000,11.200000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140388,74.0,22.700000,0.0,2.0,0.0,165.1,14.0,1.0,0.0,5.0,...,141.000000,5.500000,5.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140389,90.0,19.900000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,139.000000,6.400000,6.400000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Removing outliers
We should not remove outliers from Unlabelled, so let's split the dataset.  

In [6]:
# Only the labelled rows (the first `nsamples_labelled` rows of `df`) are
# screened for outliers; unlabelled rows are never removed.
X = df.iloc[:nsamples_labelled]

# IsolationForest.fit_predict marks outliers with -1.
forest = IsolationForest(random_state=0, n_jobs=-2)
isOutlier = forest.fit_predict(X) == -1
indexOutlier = np.flatnonzero(isOutlier)

# Remove the same rows from the features and the target, then renumber both
# so that row i of `df` still corresponds to row i of `dm`.
df = df.drop(index=indexOutlier).reset_index(drop=True)
dm = dm.drop(index=indexOutlier).reset_index(drop=True)

# The labelled block shrank, so the split point has to follow.
nsamples_labelled = len(dm)

print(f'Removed {isOutlier.sum()} outliers from Labelled.')
display(df)
display(dm)

Removed 4255 outliers from Labelled.


Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,d1_sodium_min,d1_wbc_max,d1_wbc_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
0,68.0,22.732803,0.0,2.0,1.0,180.3,4.0,1.0,0.0,2.0,...,134.000000,14.100000,14.100000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,77.0,27.421875,0.0,2.0,0.0,160.0,4.0,1.0,0.0,5.0,...,145.000000,23.300000,12.700000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25.0,31.952749,0.0,2.0,0.0,172.7,3.0,0.0,0.0,5.0,...,137.654238,10.862412,10.682011,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,81.0,22.635548,1.0,2.0,0.0,165.1,8.0,2.0,0.0,2.0,...,136.776912,9.000000,8.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.0,29.577357,0.0,2.0,1.0,188.0,15.0,0.0,0.0,5.0,...,137.668405,10.662703,10.230670,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136131,36.0,37.500000,0.0,2.0,0.0,170.1,4.0,1.0,0.0,5.0,...,137.000000,11.700000,7.200000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136132,61.0,32.100000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,139.000000,11.200000,11.200000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136133,74.0,22.700000,0.0,2.0,0.0,165.1,14.0,1.0,0.0,5.0,...,141.000000,5.500000,5.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136134,90.0,19.900000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,139.000000,6.400000,6.400000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0         1
1         1
2         0
3         0
4         0
         ..
125897    0
125898    0
125899    1
125900    0
125901    0
Name: diabetes_mellitus, Length: 125902, dtype: int64

# Feature Engineering

### Bin features
Binning bmi, height, weight, and age.

In [7]:
features_to_bin = ['bmi', 'height', 'weight', 'age']
for feature in features_to_bin:
    # Truncate each value to an integer, then snap it to the nearest multiple of 5.
    df[f'{feature}_bin'] = [5 * round(int(value) / 5) for value in df[feature]]

### Generate features
- The range of variation of some lab features.
- Whether the daily maximum / minimum is same as hourly maximum / minimum.
- Difference of a value from its average.
- Difference of a value from the average value of similar people (grouped by binned BMI, age, height and weight), computed overall and separately for people with and without diabetes.

In [8]:
load_from_cache = True

if load_from_cache:
    print('(loading from cache)')
    df = pd.read_csv('cache/feature_engineering.csv')
else:
    # Lab names are what remains after stripping the 'd1_' / 'h1_' prefix and
    # the '_max' / '_min' suffix. They are sorted because set iteration order
    # for strings changes between kernel restarts, which would otherwise change
    # the order of the generated columns from run to run.
    labs_daily  = sorted({c[3:-4] for c in df.columns if c.startswith("d1")})
    labs_hourly = sorted({c[3:-4] for c in df.columns if c.startswith("h1")})
    # All hourly labs are in daily, but there are daily features that are not in hourly.

    # prepending generated features with __ to differentiate them.
    # this block would not be idempotent otherwise
    for col in labs_daily:
        df[f'__d1_{col}_range'] = df[f'd1_{col}_max'] - df[f'd1_{col}_min']

    for col in labs_hourly:
        df[f'__h1_{col}_range'] = df[f'h1_{col}_max'] - df[f'h1_{col}_min']

        df[f'__h1d1_{col}_max_equal'] = (df[f'h1_{col}_max'] == df[f'd1_{col}_max']).astype(np.int8) # hourly and daily max are equal
        df[f'__h1d1_{col}_min_equal'] = (df[f'h1_{col}_min'] == df[f'd1_{col}_min']).astype(np.int8) # hourly and daily min are equal


    def diff_from_bin_mean(col_name, bin_name, y_value=None):
        """Return, for every row of `df`, `col_name` minus the mean of `col_name` in that row's bin.

        The per-bin means are computed from the labelled rows only (the first
        `nsamples_labelled` rows). If `y_value` is given, only labelled rows
        whose target `dm` equals `y_value` contribute to the means.
        Rows whose bin has no contributing labelled row get NaN.
        """
        labelled = df.iloc[:nsamples_labelled]
        if y_value is not None:
            # `dm` has one entry per labelled row, in the same order.
            labelled = labelled[(dm == y_value).to_numpy()]
        bin_means = labelled.groupby(bin_name)[col_name].mean()
        # Vectorised lookup of each row's bin mean; unseen bins map to NaN.
        return df[col_name] - df[bin_name].map(bin_means)

    for col in ['bmi', 'd1_glucose_max', 'd1_glucose_min', '__d1_glucose_range']:
        print(f'Finding differences from mean for {col}')
        df[f'__{col}_diff_mean']   = df[col] - df[col].mean() # difference from overall mean
        df[f'__{col}_diff_mean_p'] = df[col] - df[:nsamples_labelled].loc[dm == 1, col].mean() # difference from overall mean for diabetes positive samples
        df[f'__{col}_diff_mean_n'] = df[col] - df[:nsamples_labelled].loc[dm == 0, col].mean() # difference from overall mean for diabetes negative samples

        for metric in ['bmi', 'age', 'height', 'weight']:
            # difference from mean of others in the same metric bin
            bin_name = f'{metric}_bin'
            df[f'__{col}_diff_{metric}_bin_mean']   = diff_from_bin_mean(col, bin_name)
            df[f'__{col}_diff_{metric}_bin_mean_p'] = diff_from_bin_mean(col, bin_name, 1)
            df[f'__{col}_diff_{metric}_bin_mean_n'] = diff_from_bin_mean(col, bin_name, 0)

    # There are some NaN values because some bins are only in the Unlabelled set.
    # The mean is calculated from Labelled, so there is a NaN.
    # Let's replace NaNs with the column's mean, and add new columns to indicate missing data

    print('Imputing values for NaNs due to non uniform bin distribution.')
    from sklearn.impute import SimpleImputer

    imp = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)

    display(df)

    # note: index is reset here, so there will be NaNs if you didn't reset index in outlier removal
    df_imp = imp.fit_transform(df)
    # The imputer returns a bare array: the original columns followed by one
    # indicator column per feature that had missing values, so the column
    # names have to be rebuilt in that same order.
    columns = list(df.columns) + [i + '_isnan' for i in df.columns[imp.indicator_.features_]]
    df = pd.DataFrame(df_imp, columns=columns)

    df.to_csv('cache/feature_engineering.csv', index=False)

display(df)

(loading from cache)


Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,__d1_glucose_min_diff_weight_bin_mean_isnan,__d1_glucose_min_diff_weight_bin_mean_p_isnan,__d1_glucose_min_diff_weight_bin_mean_n_isnan,____d1_glucose_range_diff_bmi_bin_mean_isnan,____d1_glucose_range_diff_bmi_bin_mean_p_isnan,____d1_glucose_range_diff_bmi_bin_mean_n_isnan,____d1_glucose_range_diff_age_bin_mean_p_isnan,____d1_glucose_range_diff_weight_bin_mean_isnan,____d1_glucose_range_diff_weight_bin_mean_p_isnan,____d1_glucose_range_diff_weight_bin_mean_n_isnan
0,68.0,22.732803,0.0,2.0,1.0,180.3,4.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,77.0,27.421875,0.0,2.0,0.0,160.0,4.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25.0,31.952749,0.0,2.0,0.0,172.7,3.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,81.0,22.635548,1.0,2.0,0.0,165.1,8.0,2.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19.0,29.577357,0.0,2.0,1.0,188.0,15.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136131,36.0,37.500000,0.0,2.0,0.0,170.1,4.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136132,61.0,32.100000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136133,74.0,22.700000,0.0,2.0,0.0,165.1,14.0,1.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136134,90.0,19.900000,0.0,2.0,0.0,160.0,3.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Remove correlated columns

Columns with high correlation are identified using the sula method from [auto_viml](https://github.com/AutoViML/Auto_ViML/blob/master/autoviml/Auto_ViML.py).

In [9]:
from uncorr import remove_variables_using_fast_correlation as sula

load_from_cache = True

if load_from_cache:
    print('(loading from cache)')
    # NOTE(review): the cached frame rendered in this notebook's output appears
    # to contain an 'Unnamed: 0' row-number column - confirm it is not meant to
    # reach the model as a feature.
    df = pd.read_csv('cache/uncorrelate.csv')
else:
    # Separate the `_isnan` indicator columns added by imputation; they are not
    # screened for correlation but follow the fate of their base column.
    # The suffix is matched with endswith() so that it agrees with the suffix
    # stripping done below when indicators are matched back to base columns.
    isnan_cols = [col for col in df.columns if col.endswith('_isnan')]
    df_isnans = df[isnan_cols]
    df = df.drop(columns=isnan_cols)

    # The correlation screen needs the target, so it only sees labelled rows.
    X = df[:nsamples_labelled]
    Xy = pd.concat([X, dm], axis=1)

    target = 'diabetes_mellitus'
    # Every column except the target. This must be an equality test: `x not in target`
    # would be a substring test against the target's name.
    numvars = [x for x in list(Xy) if x != target]
    good_cols = sula(Xy, numvars, 'Classification', target)

    # Keep the surviving columns plus the indicators whose base column survived.
    kept_isnans = [col for col in df_isnans.columns if col[:-len('_isnan')] in good_cols]
    df = pd.concat([
        df[good_cols],
        df_isnans[kept_isnans]
    ], axis=1)

    df.to_csv('cache/uncorrelate.csv', index=False)

display(df)

(loading from cache)


Unnamed: 0.1,Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,pre_icu_los_days,apache_2_diagnosis,arf_apache,...,d1_diasbp_min,__d1_spo2_range,d1_sysbp_min,__h1d1_mbp_max_equal,__h1_sysbp_noninvasive_range,d1_platelets_max,__h1d1_diasbp_min_equal,__bmi_diff_bmi_bin_mean_isnan,__bmi_diff_weight_bin_mean_p_isnan,__d1_glucose_min_diff_bmi_bin_mean_isnan
0,0,2.0,1.0,4.0,1.0,0.0,2.0,0.541667,113.0,0.0,...,37.0,26.0,73.0,0.0,16.000000,233.000000,0.0,0.0,0.0,0.0
1,1,2.0,0.0,4.0,1.0,0.0,5.0,0.927778,108.0,0.0,...,31.0,30.0,67.0,0.0,24.000000,557.000000,0.0,0.0,0.0,0.0
2,2,2.0,0.0,3.0,0.0,0.0,5.0,0.000694,122.0,0.0,...,48.0,7.0,105.0,0.0,24.000000,211.316676,0.0,0.0,0.0,0.0
3,3,2.0,0.0,8.0,2.0,0.0,2.0,0.000694,203.0,0.0,...,42.0,5.0,84.0,0.0,29.671463,198.000000,0.0,0.0,0.0,0.0
4,4,2.0,1.0,15.0,0.0,0.0,5.0,0.073611,119.0,0.0,...,57.0,4.0,120.0,1.0,10.000000,204.822573,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136131,136131,2.0,0.0,4.0,1.0,0.0,5.0,1.696528,305.0,0.0,...,68.0,3.0,125.0,0.0,4.000000,170.000000,0.0,0.0,0.0,0.0
136132,136132,2.0,0.0,3.0,0.0,0.0,5.0,0.033333,124.0,0.0,...,56.0,5.0,125.0,0.0,15.144588,228.000000,0.0,0.0,0.0,0.0
136133,136133,2.0,0.0,14.0,1.0,0.0,5.0,0.757639,113.0,0.0,...,49.0,7.0,97.0,0.0,10.000000,87.000000,0.0,0.0,0.0,0.0
136134,136134,2.0,0.0,3.0,0.0,0.0,5.0,0.087500,108.0,0.0,...,57.0,4.0,93.0,0.0,0.000000,297.000000,0.0,0.0,0.0,0.0


# Training

In [15]:
try:
    import lightgbm as lgb
except:
    !pip install lightgbm
    import lightgbm as lgb

In [19]:
# Split the combined frame back into labelled features, unlabelled features
# and the target.
X = df[:nsamples_labelled]
u = df[nsamples_labelled:]
y = dm

load_from_cache = True
if load_from_cache:
    # Number of boosting rounds for the final models.
    # NOTE(review): presumably the value printed by the else-branch on an
    # earlier run - confirm before changing features or hyper-parameters.
    n = 6926
else:
    # The round limit is given only through `num_boost_round` below. An
    # `n_estimators` entry in this dict would be treated as an alias and
    # override that argument, so it is not set here; 30000 is the limit that
    # was effectively in force. 'silent' is not set either: lgb.cv does not use
    # it from params and only emits a warning about it.
    params = {
            'boosting_type': 'goss',
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': 0.01,
            'subsample': 1,
            'colsample_bytree': 0.1,
            'reg_alpha': 3,
            'reg_lambda': 1,
            'scale_pos_weight': 1,
            'verbose': -1,
            'max_depth': -1
    }

    dtrain = lgb.Dataset(X, y)
    evals = lgb.cv(params,
                 dtrain,
                 nfold=5,
                 stratified=True,
                 num_boost_round=30000,
                 early_stopping_rounds=200,
                 verbose_eval=100,
                 seed = 666,
                 show_stdv=True)

    print(f'Maximum mean auc: {max(evals["auc-mean"])}') # 0.8524617333454604

    # Length of the CV history (where early stopping ended it) plus a 10% margin.
    n = int(1.1 * len(evals['auc-mean']))

print(n)

Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


[100]	cv_agg's auc: 0.788776 + 0.00237465
[200]	cv_agg's auc: 0.810903 + 0.00236507
[300]	cv_agg's auc: 0.815463 + 0.00245199
[400]	cv_agg's auc: 0.821464 + 0.00246294
[500]	cv_agg's auc: 0.82508 + 0.00250869
[600]	cv_agg's auc: 0.82936 + 0.00250942
[700]	cv_agg's auc: 0.832512 + 0.0025148
[800]	cv_agg's auc: 0.834491 + 0.00253519
[900]	cv_agg's auc: 0.836918 + 0.00251519
[1000]	cv_agg's auc: 0.838797 + 0.00251334
[1100]	cv_agg's auc: 0.840101 + 0.00255542
[1200]	cv_agg's auc: 0.841149 + 0.00255671
[1300]	cv_agg's auc: 0.841998 + 0.00258023
[1400]	cv_agg's auc: 0.842869 + 0.00254528
[1500]	cv_agg's auc: 0.843619 + 0.00261706
[1600]	cv_agg's auc: 0.844431 + 0.00266753
[1700]	cv_agg's auc: 0.844964 + 0.00262471
[1800]	cv_agg's auc: 0.845587 + 0.00265901
[1900]	cv_agg's auc: 0.846228 + 0.00266671
[2000]	cv_agg's auc: 0.846696 + 0.00265517
[2100]	cv_agg's auc: 0.847041 + 0.00263712
[2200]	cv_agg's auc: 0.847384 + 0.00268785
[2300]	cv_agg's auc: 0.847832 + 0.00271973
[2400]	cv_agg's auc: 0.

In [20]:
# Train `n_rounds` models that differ only in their random seed and combine
# their predictions for the unlabelled rows.
n_rounds = 5
predictions = np.zeros([u.shape[0], n_rounds])

for i in range(n_rounds):
    lgb_params = {
        'boosting_type': 'goss',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'subsample': 1,
        'colsample_bytree': 0.1,
        'reg_alpha': 3,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'n_estimators': n,
        'silent': -1,
        'verbose': -1,
        'max_depth': -1,
        'seed': i + 500,  # the only setting that changes between rounds
    }

    clf = lgb.LGBMClassifier(**lgb_params)

    clf.fit(
        X, y,
        # Note: the evaluation set is the training data itself, so the logged
        # AUC is a training score, not a validation score. It is listed once;
        # listing the same pair twice adds no information.
        eval_set=[(X, y)],
        verbose=100,
        early_stopping_rounds=None
    )

    # Column i holds round i's predicted probability of the positive class.
    predictions[:, i] = clf.predict_proba(u, num_iteration=clf.best_iteration_)[:, 1]

# Geometric mean across rounds. The exponent is tied to `n_rounds` so that the
# result stays a mean if the number of rounds is changed.
y_preds = np.power(np.prod(predictions, axis=1), 1 / n_rounds)

[100]	training's auc: 0.820264
[200]	training's auc: 0.832511
[300]	training's auc: 0.836421
[400]	training's auc: 0.839506
[500]	training's auc: 0.844055
[600]	training's auc: 0.847375
[700]	training's auc: 0.850848
[800]	training's auc: 0.853595
[900]	training's auc: 0.856161
[1000]	training's auc: 0.858522
[1100]	training's auc: 0.860642
[1200]	training's auc: 0.862869
[1300]	training's auc: 0.865015
[1400]	training's auc: 0.866775
[1500]	training's auc: 0.86858
[1600]	training's auc: 0.870269
[1700]	training's auc: 0.871992
[1800]	training's auc: 0.873725
[1900]	training's auc: 0.875555
[2000]	training's auc: 0.877252
[2100]	training's auc: 0.878588
[2200]	training's auc: 0.87992
[2300]	training's auc: 0.881393
[2400]	training's auc: 0.882918
[2500]	training's auc: 0.88423
[2600]	training's auc: 0.885607
[2700]	training's auc: 0.886887
[2800]	training's auc: 0.888355
[2900]	training's auc: 0.889624
[3000]	training's auc: 0.890903
[3100]	training's auc: 0.89214
[3200]	training's auc

In [33]:
# Build the submission: the encounter ids of the unlabelled file, in file
# order, next to the combined predictions.
u = (
    pd.read_csv('inputs/UnlabeledWiDS2021.csv', usecols=['encounter_id'])
      .assign(diabetes_mellitus=y_preds)
)
u.to_csv("lgbm_submission.csv", header=True, index=False)

In [35]:
# Show the submission frame (encounter_id and the predicted diabetes_mellitus value).
u

Unnamed: 0,encounter_id,diabetes_mellitus
0,144740,0.081969
1,141990,0.090541
2,142038,0.087525
3,138628,0.099815
4,141682,0.254483
...,...,...
10229,143750,0.092499
10230,143813,0.091589
10231,137126,0.141301
10232,135652,0.021146
