In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSVs
# read later from /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training split from the mounted Drive folder and preview the first rows.
train_csv_path = "/content/drive/MyDrive/WiDS Datathon 2024/Dataset/train.csv"
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,treatment_pd
0,994155,Asian,COMMERCIAL,CA,917,46,F,27.0,C50811,Malignant neoplasm of ovrlp sites of right fem...,...,18.858696,11.426087,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,35
1,154389,,MEDICARE ADVANTAGE,OH,451,63,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,0.255319,2.234043,1.182979,18.317021,13.546809,0.146809,31.890909,7.631915,9.631915,33
2,387343,,COMMERCIAL,TX,773,53,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,3.588679,7.915094,21.064151,14.083019,11.943396,2.549057,32.55625,16.396226,10.392453,24
3,921275,Hispanic,MEDICAID,CA,928,50,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,11.645455,10.081818,37.948485,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,455
4,803454,,COMMERCIAL,NY,112,39,F,18.0,1749,"Malignant neoplasm of breast (female), unspeci...",...,9.184211,6.089474,18.960526,10.194737,18.642105,14.173684,42.502632,6.392105,1.755263,162


### **Data preparation**

In [None]:
# Remove the row display cap so the per-column missing-value counts are not truncated.
pd.options.display.max_rows = None
data.isnull().sum()

patient_id                                   0
patient_race                             15152
payer_type                                2836
patient_state                               79
patient_zip3                                 0
patient_age                                  0
patient_gender                               0
bmi                                      18925
breast_cancer_diagnosis_code                 0
breast_cancer_diagnosis_desc                 0
breast_cancer_diagnosis_year                 0
metastatic_cancer_diagnosis_code             0
metastatic_first_treatment                   0
metastatic_first_treatment_type          11024
metastatic_first_novel_treatment         27441
metastatic_first_novel_treatment_type    27441
region                                     223
division                                   223
population                                   0
density                                      0
age_median                                   0
age_under_10 

#### Drop missing values

In [None]:
# Columns that may stay missing at this point: patient_race and payer_type are
# imputed in later cells, metastatic_first_treatment_type is turned into a binary
# flag, and the remaining ones are not part of the predictor list used for modelling.
optional_columns = {
    "patient_race",
    "payer_type",
    "bmi",
    "metastatic_first_novel_treatment",
    "metastatic_first_novel_treatment_type",
    "metastatic_first_treatment_type",
    "self_employed",
    "farmer",
}

# Every other column must be present for a row to be kept.
features = [c for c in data.columns if c not in optional_columns]

# drop=True discards the old row labels instead of inserting them as a stray
# "index" column in the cleaned frame.
data = data.dropna(subset=features).reset_index(drop=True)

#### Transform metastatic_first_treatment_type to binary values
1 for Antineoplastics, 0 for any other value (including missing values)



In [None]:
pd.set_option('display.max_columns', None)

# Binary flag: 1 when the first treatment type is "Antineoplastics", 0 otherwise.
# Missing values compare unequal, so they also become 0. A single vectorised
# comparison cast to int yields a numeric column rather than an object column.
data["metastatic_first_treatment_type_binary"] = (
    data["metastatic_first_treatment_type"].eq("Antineoplastics").astype(int)
)

data.head()

Unnamed: 0,index,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,breast_cancer_diagnosis_year,metastatic_cancer_diagnosis_code,metastatic_first_treatment,metastatic_first_treatment_type,metastatic_first_novel_treatment,metastatic_first_novel_treatment_type,region,division,population,density,age_median,age_under_10,age_10_to_19,age_20s,age_30s,age_40s,age_50s,age_60s,age_70s,age_over_80,male,female,married,divorced,never_married,widowed,family_size,family_dual_income,income_household_median,income_household_under_5,income_household_5_to_10,income_household_10_to_15,income_household_15_to_20,income_household_20_to_25,income_household_25_to_35,income_household_35_to_50,income_household_50_to_75,income_household_75_to_100,income_household_100_to_150,income_household_150_over,income_household_six_figure,income_individual_median,home_ownership,housing_units,home_value,rent_median,rent_burden,education_less_highschool,education_highschool,education_some_college,education_bachelors,education_graduate,education_college_or_above,education_stem_degree,labor_force_participation,unemployment_rate,self_employed,farmer,race_white,race_black,race_asian,race_native,race_pacific,race_other,race_multiple,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,treatment_pd,metastatic_first_treatment_type_binary
0,0,994155,Asian,COMMERCIAL,CA,917,46,F,27.0,C50811,Malignant neoplasm of ovrlp sites of right fem...,2018,C779,DOXORUBICIN HCL,,,,West,Pacific,43031,2048.578261,38.852174,11.306522,12.897826,14.121739,13.532609,13.16087,13.378261,11.473913,6.380435,3.736957,49.052174,50.947826,48.504348,10.117391,36.408696,4.969565,3.674783,59.219565,86330.3913,2.226087,1.528261,2.897826,2.747826,3.173913,6.647826,9.617391,15.965217,13.58913,19.752174,21.847826,41.6,34317.82609,61.397826,12609.26087,572606.5,1778.0,34.595652,17.491304,22.656522,29.263043,20.2,10.404348,30.604348,46.208696,63.154348,6.197826,15.708696,0.015217,38.708696,3.963043,25.565217,1.193478,0.269565,18.858696,11.426087,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,35,0
1,1,154389,,MEDICARE ADVANTAGE,OH,451,63,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,2018,C7951,DOXORUBICIN HCL,,,,Midwest,East North Central,7228,194.65625,41.247826,12.855319,12.789362,11.261702,10.489362,11.859574,15.278723,13.359574,6.434043,5.66383,52.091489,47.908511,50.67234,14.102128,27.117021,8.112766,3.119565,51.228261,65214.72093,2.329787,3.248936,4.468085,5.878723,5.323404,7.840426,12.246809,20.040426,14.051064,15.676596,8.902128,24.578723,32142.22727,72.391489,2789.958333,155901.7692,828.0,26.514286,15.829787,38.968085,27.682979,11.625532,5.887234,17.512766,38.308889,61.27234,5.793478,11.2025,3.715,96.055319,1.006383,0.321277,0.117021,0.002128,0.255319,2.234043,1.182979,18.317021,13.546809,0.146809,31.890909,7.631915,9.631915,33,0
2,2,387343,,COMMERCIAL,TX,773,53,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,2018,C773,PACLITAXEL,Antineoplastics,,,South,West South Central,24751,352.226786,41.371154,11.930189,12.986792,10.996226,11.162264,13.107547,13.022642,13.066038,9.577358,4.158491,49.354717,50.645283,52.99434,13.341509,25.09434,8.579245,3.205577,47.794231,77147.40816,3.198113,2.358491,3.422642,3.824528,3.973585,8.424528,12.401887,16.90566,12.471698,15.154717,17.849057,33.003774,37016.74,74.060377,8450.339286,213648.0222,1207.695652,28.147826,13.256604,29.3,31.066038,17.632075,8.743396,26.375472,45.466667,57.926415,5.423077,13.245455,2.3,76.873585,8.90566,2.267925,0.381132,0.060377,3.588679,7.915094,21.064151,14.083019,11.943396,2.549057,32.55625,16.396226,10.392453,24,1
3,3,921275,Hispanic,MEDICAID,CA,928,50,F,,1749,"Malignant neoplasm of breast (female), unspeci...",2015,C787,GEMCITABINE HCL,,,,West,Pacific,39122,2295.939394,38.2,11.878788,13.354545,14.230303,13.418182,13.333333,14.060606,10.248485,5.951515,3.50303,49.893939,50.106061,50.245455,9.827273,35.290909,4.651515,3.622727,61.736364,102741.6364,2.327273,1.536364,2.648485,2.178788,2.409091,5.163636,7.972727,13.936364,12.469697,19.760606,29.59697,49.357576,41287.27273,61.463636,11725.66667,677688.5152,2003.125,34.753125,14.230303,19.987879,29.79697,23.739394,12.245455,35.984848,47.918182,65.230303,5.10303,15.224242,0.027273,54.030303,2.527273,20.827273,0.587879,0.3,11.645455,10.081818,37.948485,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,455,0
4,4,803454,,COMMERCIAL,NY,112,39,F,18.0,1749,"Malignant neoplasm of breast (female), unspeci...",2015,C7989,DOXORUBICIN HCL,,,,Northeast,Middle Atlantic,71374,17326.40789,36.476316,12.986842,11.318421,14.971053,17.255263,12.631579,11.460526,9.789474,6.0,3.581579,47.668421,52.331579,39.923684,10.239474,44.642105,5.186842,3.412105,53.447368,74499.71053,4.334211,3.305263,5.863158,4.460526,4.042105,7.589474,9.897368,13.542105,10.742105,14.889474,21.318421,36.207895,39491.78947,29.931579,25922.55263,870873.1842,1678.447368,35.213158,16.2,24.334211,18.447368,24.371053,16.655263,41.026316,40.857895,64.197368,7.184211,18.145946,0.002703,44.1,28.831579,11.205263,0.515789,0.068421,9.184211,6.089474,18.960526,10.194737,18.642105,14.173684,42.502632,6.392105,1.755263,162,0


#### Fill missing "patient_race" values with the largest race group in the regional data

In [None]:
race_columns = ["race_white", "race_black", "race_asian", "race_native",
               "race_pacific", "race_other", "race_multiple"]

# For each row, idxmax(axis=1) yields the name of the race_* column with the
# largest value; that name is used wherever patient_race is missing.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions warn
# about and which does not update the frame under Copy-on-Write.
data["patient_race"] = data["patient_race"].fillna(data[race_columns].idxmax(axis=1))

In [None]:
# Distinct patient_race labels, in order of first appearance.
pd.unique(data["patient_race"])

array(['Asian', 'race_white', 'Hispanic', 'White', 'Other', 'Black',
       'race_black', 'race_asian', 'race_native'], dtype=object)

In [None]:
# Translate the race_* column names introduced by the imputation step into the
# labels already used in patient_race; any other value is left untouched.
race_label_by_column = {
    "race_white": "White",
    "race_black": "Black",
    "race_asian": "Asian",
    "race_native": "Other",
}
data["patient_race"] = data["patient_race"].map(
    lambda label: race_label_by_column.get(label, label)
)

In [None]:
# Sanity check: count the patient_race values that are still missing.
remaining_missing_race = data["patient_race"].isna().sum()
print("sum of missing values in patient_race: ", remaining_missing_race)

sum of missing values in patient_race:  0


#### Fill "payer_type" missing values with "NONE"
Assuming null/NaN means no insurance

In [None]:
# Missing payer_type is recorded as the explicit category "NONE".
# Assigned back rather than using fillna(inplace=True) on a column selection,
# which is chained assignment and does not update the frame under Copy-on-Write.
data["payer_type"] = data["payer_type"].fillna("NONE")
data["payer_type"].unique()

array(['COMMERCIAL', 'MEDICARE ADVANTAGE', 'MEDICAID', 'NONE'],
      dtype=object)

### **CatBoost Model**

In [None]:
# Columns passed to the one-hot encoder.
cat = [
    "patient_race",
    "breast_cancer_diagnosis_code",
    "payer_type",
    "metastatic_first_treatment",
    "breast_cancer_diagnosis_year",
]

# Columns passed to the standard scaler.
contin = [
    "patient_age",
    "patient_zip3",
    "density",
    "age_median",
    "family_size",
    "income_individual_median",
    "poverty",
    "education_stem_degree",
    "limited_english",
    "health_uninsured",
    "disabled",
]

# Full set of model inputs; every entry appears in exactly one of the lists above.
predictors = [
    "patient_race",
    "payer_type",
    "patient_age",
    "patient_zip3",
    "breast_cancer_diagnosis_code",
    "density",
    "age_median",
    "family_size",
    "income_individual_median",
    "poverty",
    "education_stem_degree",
    "limited_english",
    "disabled",
    "health_uninsured",
    "metastatic_first_treatment",
    "breast_cancer_diagnosis_year",
]

In [None]:
# Model inputs and target taken from the cleaned training frame.
X_train = data.loc[:, predictors]
y_train = data.loc[:, 'treatment_pd']

# Preprocessing: scale the numeric columns and one-hot encode the categorical ones.
pre = make_column_transformer(
    (StandardScaler(), contin),
    (OneHotEncoder(handle_unknown="ignore"), cat),
    remainder="passthrough",
)

catboost = CatBoostRegressor(learning_rate=0.03, iterations=700, silent=True)

# Chain preprocessing and the regressor so both are fitted and applied together.
pipe = Pipeline(steps=[("pre", pre), ("cat", catboost)])
pipe.fit(X_train, y_train)

# NOTE: the score below is computed on the same rows the model was fitted on,
# so it is an in-sample (training) error, not a held-out estimate.
predictions = pipe.predict(X_train)
mse = mean_squared_error(y_train, predictions)
rmse = sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 135.9598077931778


In [None]:
test = pd.read_csv("/content/drive/MyDrive/WiDS Datathon 2024/Dataset/test.csv")

# Apply the same imputation the training frame received, so the model sees
# test inputs prepared the same way as the rows it was fitted on.
race_label_by_column = {"race_white": "White", "race_black": "Black",
                        "race_asian": "Asian", "race_native": "Other"}

# Only rows with a missing race and at least one regional race share can be imputed.
can_impute_race = test["patient_race"].isna() & test[race_columns].notna().any(axis=1)
if can_impute_race.any():
    dominant_column = test.loc[can_impute_race, race_columns].idxmax(axis=1)
    # Column names without a training-time label are kept as-is, matching training.
    test.loc[can_impute_race, "patient_race"] = dominant_column.map(
        lambda col: race_label_by_column.get(col, col)
    )

# Missing payer_type uses the same explicit "NONE" category as in training.
test["payer_type"] = test["payer_type"].fillna("NONE")

X_test = test[predictors]
predictions = pipe.predict(X_test)

In [None]:
# Single source of truth for the submission file location, so the read-back
# check below always inspects the file that was just written.
submission_path = "patientPred.csv"

# One row per test patient: identifier plus the predicted treatment_pd.
pred_df = pd.DataFrame({"patient_id": test["patient_id"],
                        "treatment_pd": predictions})
pred_df.to_csv(submission_path, index=False)

# Read the file back and show its shape as a quick sanity check.
preds = pd.read_csv(submission_path)
preds.shape

(11845, 2)