In [1]:
%pip install pandas scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

# Loading dataset

Load train and test dataset and merge them into one dataframe. This is done to ensure that the LabelEncoder is applied to all the data.


In [3]:
TARGET = 'metastatic_diagnosis_period'

train_df = pd.read_csv("data/train.csv")
train_df['isTrain'] = 1
test_df = pd.read_csv("data/test.csv")
test_df['isTrain'] = 0
test_df[TARGET] = np.nan

df = pd.concat([train_df, test_df], axis=0)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,Average of May-18,Average of Jun-18,Average of Jul-18,Average of Aug-18,Average of Sep-18,Average of Oct-18,Average of Nov-18,Average of Dec-18,metastatic_diagnosis_period,isTrain
0,268700,,COMMERCIAL,AR,724,South,West South Central,39,F,,...,74.77,79.96,81.69,78.3,74.56,59.98,42.98,41.18,191.0,1
1,484983,White,,IL,629,Midwest,East North Central,55,F,35.36,...,72.87,77.4,77.43,75.83,72.64,58.36,39.68,39.71,33.0,1
2,277055,,COMMERCIAL,CA,925,West,Pacific,59,F,,...,70.31,78.61,87.24,85.52,80.75,70.81,62.67,55.58,157.0,1
3,320055,Hispanic,MEDICAID,CA,900,West,Pacific,59,F,,...,63.1,67.45,75.86,75.24,71.1,68.95,65.46,59.46,146.0,1
4,190386,,COMMERCIAL,CA,934,West,Pacific,71,F,,...,60.24,64.77,69.81,70.13,68.1,65.38,60.72,54.08,286.0,1


In [4]:
df.shape

(18819, 153)

In [5]:
numeric_cols = []
categorical_cols = []
cols_to_encode = []
ignore_cols = ["patient_id", TARGET, "isTrain"]
for col in df.columns:
    if col in ignore_cols:
        continue

    unique_entries = len(df[col].unique())
    data_type = df[col].dtype
    if unique_entries <= 20 and data_type != "object":
        categorical_cols.append(col)
    elif data_type == "object":
        cols_to_encode.append(col)
    else:
        numeric_cols.append(col)

print('Number of numeric columns:', len(numeric_cols))
print('Number of categorical columns:', len(categorical_cols) + len(cols_to_encode))

Number of numeric columns: 139
Number of categorical columns: 11


Fill in missing values with the most mean value of the column.

In [6]:
missing_values = {}
for col in numeric_cols:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(np.nanmean(df[col].values))
        missing_values[col] = np.nanmean(df[col].values)

for col in categorical_cols:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(df[col].median())
        missing_values[col] = df[col].median()

In [7]:
joblib.dump(missing_values, 'data/utils/missing_values.pkl')
missing_values

{'bmi': 29.161171000516614,
 'family_size': 3.1991474433932185,
 'family_dual_income': 51.774213883278414,
 'income_household_median': 74018.07291591367,
 'income_household_under_5': 3.297570957797385,
 'income_household_5_to_10': 2.540279047517806,
 'income_household_10_to_15': 4.170900924843202,
 'income_household_15_to_20': 3.9528452216434564,
 'income_household_20_to_25': 4.089846922504518,
 'income_household_25_to_35': 8.442695333262463,
 'income_household_35_to_50': 11.604751249069842,
 'income_household_50_to_75': 16.894366960773887,
 'income_household_75_to_100': 12.659270755820133,
 'income_household_100_to_150': 15.78922079302647,
 'income_household_150_over': 16.559315934942063,
 'income_household_six_figure': 32.34875677686829,
 'home_ownership': 65.83883544169235,
 'home_value': 337917.4122839375,
 'rent_median': 1230.8486494100139,
 'rent_burden': 31.270572977569895,
 'self_employed': 13.206606250664398,
 'farmer': 1.9501233124269162,
 'poverty': 13.45155469331349,
 'limi

# Label Encoding

In [8]:
encoders = {}
for col in cols_to_encode:
    encoder = LabelEncoder().fit(df[col])
    encoders[col] = encoder
    df[col] = encoder.transform(df[col])

joblib.dump(
    encoders,
    "data/utils/encoders.pkl",
)

['data/utils/encoders.pkl']

In [9]:
for col in categorical_cols + cols_to_encode + numeric_cols:
    if df[col].isnull().sum() > 0:
        print(col, df[col].dtype, df[col].isnull().sum())

# One Hot Encoding

In [10]:
dummy_cols = []
map_dummy = {}
for col in tqdm(categorical_cols + cols_to_encode, total=len(categorical_cols) + len(cols_to_encode)):
    for value in tqdm(df[col].unique()):
        df[f"dummy_{col}_{value}"] = 0
        df.loc[df[col] == value, f"dummy_{col}_{value}"] = 1
        dummy_cols.append(f"dummy_{col}_{value}")
        map_dummy[col] = f"dummy_{col}_{value}"
joblib.dump(map_dummy, "data/utils/map_dummy.pkl")

  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
100%|██████████| 6/6 [00:00<00:00, 537.10it/s]
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
100%|██████████| 4/4 [00:00<00:00, 1440.85it/s]
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{col}_{value}"] = 0
  df[f"dummy_{

['data/utils/map_dummy.pkl']

In [11]:
train_df = df.loc[df["isTrain"] == 1].reset_index(drop=True)
test_df = df.loc[df["isTrain"] == 0].reset_index(drop=True)
train_df[TARGET]

0        191.0
1         33.0
2        157.0
3        146.0
4        286.0
         ...  
13168    106.0
13169     92.0
13170      0.0
13171    330.0
13172      0.0
Name: metastatic_diagnosis_period, Length: 13173, dtype: float64

In [12]:
train_df.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,Region,Division,patient_age,patient_gender,bmi,...,dummy_metastatic_cancer_diagnosis_code_31,dummy_metastatic_cancer_diagnosis_code_40,dummy_metastatic_cancer_diagnosis_code_21,dummy_metastatic_cancer_diagnosis_code_25,dummy_metastatic_cancer_diagnosis_code_13,dummy_metastatic_first_novel_treatment_2,dummy_metastatic_first_novel_treatment_0,dummy_metastatic_first_novel_treatment_1,dummy_metastatic_first_novel_treatment_type_1,dummy_metastatic_first_novel_treatment_type_0
0,268700,5,0,2,724,2,7,39,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
1,484983,4,3,13,629,0,0,55,0,35.36,...,0,0,0,0,0,1,0,0,1,0
2,277055,5,0,4,925,3,4,59,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
3,320055,2,1,4,900,3,4,59,0,29.161171,...,0,0,0,0,0,1,0,0,1,0
4,190386,5,0,4,934,3,4,71,0,29.161171,...,0,0,0,0,0,1,0,0,1,0


In [13]:
train_df.to_csv('data/train_preprocessed.csv', index=False)
test_df.to_csv('data/test_preprocessed.csv', index=False)