In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb


In [2]:
# Load the training and test splits from CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")


In [3]:
# Quick look at the training data.
# A notebook cell only renders its *last* expression automatically, so the
# row preview has to be displayed explicitly or it is silently dropped.
# `display` is provided as a builtin by the IPython/Jupyter kernel.
display(train_df.head())   # first few rows of the training data
train_df.info()            # dtypes and non-null counts (prints directly)
train_df.describe()        # statistical summary (rendered as the cell result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

Unnamed: 0,ID,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,...,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time
count,28800.0,24180.0,22971.0,25530.0,23516.0,21637.0,23601.0,24603.0,26000.0,26157.0,...,28800.0,26410.0,24712.0,28323.0,27930.0,25147.0,25448.0,23736.0,28800.0,28800.0
mean,14399.5,1.764516,6.876801,5.143322,5.109202,8.61723,1.736876,5.160346,1.757808,1.715296,...,38.663162,1.709087,1.69962,1.702327,83.83208,6.903448,1.707128,8.664687,0.539306,23.237678
std,8313.988213,0.431941,1.564313,1.207757,1.214162,1.905125,0.447687,1.20324,0.435453,0.451282,...,21.147581,0.458259,0.46518,1.994443,11.02884,1.565017,0.461179,1.882746,0.498461,24.799748
min,0.0,0.0,2.0,2.0,0.0,3.0,0.0,2.0,0.0,1.0,...,0.044,0.0,0.0,0.0,40.0,2.0,0.0,4.0,0.0,0.333
25%,7199.75,2.0,6.0,4.0,4.0,7.0,1.0,4.0,2.0,1.0,...,19.539,1.0,1.0,0.0,70.0,6.0,1.0,7.0,0.0,5.61975
50%,14399.5,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,41.006,2.0,2.0,1.0,90.0,8.0,2.0,10.0,1.0,9.7965
75%,21599.25,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,55.96525,2.0,2.0,2.0,90.0,8.0,2.0,10.0,1.0,35.1
max,28799.0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,73.726,2.0,2.0,10.0,100.0,8.0,2.0,10.0,1.0,156.819


In [4]:
# Per-column count of missing entries across the whole training frame.
# (`isna` is the same operation as `isnull`.)
train_df.isna().sum()


ID                            0
dri_score                   154
psych_disturb              2062
cyto_score                 8068
diabetes                   2119
hla_match_c_high           4620
hla_high_res_8             5829
tbi_status                    0
arrhythmia                 2202
hla_low_res_6              3270
graft_type                    0
vent_hist                   259
renal_issue                1915
pulm_severe                2135
prim_disease_hct              0
hla_high_res_6             5284
cmv_status                  634
hla_high_res_10            7163
hla_match_dqb1_high        5199
tce_imm_match             11133
hla_nmdp_6                 4197
hla_match_c_low            2800
rituximab                  2148
hla_match_drb1_low         2643
hla_match_dqb1_low         4194
prod_type                     0
cyto_score_detail         11923
conditioning_intensity     4789
ethnicity                   587
year_hct                      0
obesity                    1760
mrd_hct 

In [6]:
# Labels of every column that contains at least one missing value.
missing_columns = train_df.columns[
    train_df.isna().any(axis=0)
]
print(missing_columns)


Index(['dri_score', 'psych_disturb', 'cyto_score', 'diabetes',
       'hla_match_c_high', 'hla_high_res_8', 'arrhythmia', 'hla_low_res_6',
       'vent_hist', 'renal_issue', 'pulm_severe', 'hla_high_res_6',
       'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match',
       'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low',
       'hla_match_dqb1_low', 'cyto_score_detail', 'conditioning_intensity',
       'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match',
       'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor',
       'hla_match_b_low', 'peptic_ulcer', 'hla_match_a_low', 'gvhd_proph',
       'rheum_issue', 'sex_match', 'hla_match_b_high', 'comorbidity_score',
       'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related',
       'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high',
       'pulm_moderate', 'hla_low_res_10'],
      dtype='object')


In [8]:
# Fill missing values in the numeric columns with that column's mean.
# The check uses `is_numeric_dtype` rather than `dtype != 'object'`: the
# latter treats every non-object dtype (category, datetime, pandas' dedicated
# string dtype, ...) as numeric and would send it to `.mean()`.
# Non-numeric columns are left untouched here.
for column in missing_columns:
    if pd.api.types.is_numeric_dtype(train_df[column]):
        train_df[column] = train_df[column].fillna(train_df[column].mean())


In [9]:
# Remaining null count per previously-incomplete column.
print(
    train_df[missing_columns]
    .isna()
    .sum()
)


dri_score                   154
psych_disturb              2062
cyto_score                 8068
diabetes                   2119
hla_match_c_high              0
hla_high_res_8                0
arrhythmia                 2202
hla_low_res_6                 0
vent_hist                   259
renal_issue                1915
pulm_severe                2135
hla_high_res_6                0
cmv_status                  634
hla_high_res_10               0
hla_match_dqb1_high           0
tce_imm_match             11133
hla_nmdp_6                    0
hla_match_c_low               0
rituximab                  2148
hla_match_drb1_low            0
hla_match_dqb1_low            0
cyto_score_detail         11923
conditioning_intensity     4789
ethnicity                   587
obesity                    1760
mrd_hct                   16597
in_vivo_tcd                 225
tce_match                 18996
hla_match_a_high              0
hepatic_severe             1871
donor_age                     0
prior_tu

In [10]:
# Show the dtype of each column that had missing values.
print(train_df.loc[:, missing_columns].dtypes)


dri_score                  object
psych_disturb              object
cyto_score                 object
diabetes                   object
hla_match_c_high          float64
hla_high_res_8            float64
arrhythmia                 object
hla_low_res_6             float64
vent_hist                  object
renal_issue                object
pulm_severe                object
hla_high_res_6            float64
cmv_status                 object
hla_high_res_10           float64
hla_match_dqb1_high       float64
tce_imm_match              object
hla_nmdp_6                float64
hla_match_c_low           float64
rituximab                  object
hla_match_drb1_low        float64
hla_match_dqb1_low        float64
cyto_score_detail          object
conditioning_intensity     object
ethnicity                  object
obesity                    object
mrd_hct                    object
in_vivo_tcd                object
tce_match                  object
hla_match_a_high          float64
hepatic_severe

In [11]:
# Impute every column that had missing values:
#   * numeric columns     -> column mean
#   * non-numeric columns -> most frequent value (first mode)
# The branch is chosen with `is_numeric_dtype` instead of `dtype == 'object'`
# so that text/categorical columns stored with a non-object dtype are not
# routed to `.mean()`.
for column in missing_columns:
    series = train_df[column]
    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.mean()
    else:
        modes = series.mode()
        if modes.empty:
            # Column has no observed values at all; there is nothing to
            # impute from, so leave it as-is instead of raising on modes[0].
            continue
        fill_value = modes.iloc[0]
    train_df[column] = series.fillna(fill_value)


In [12]:
# Null count per column after imputation.
print(
    train_df[missing_columns]
    .isna()
    .sum()
)


dri_score                 0
psych_disturb             0
cyto_score                0
diabetes                  0
hla_match_c_high          0
hla_high_res_8            0
arrhythmia                0
hla_low_res_6             0
vent_hist                 0
renal_issue               0
pulm_severe               0
hla_high_res_6            0
cmv_status                0
hla_high_res_10           0
hla_match_dqb1_high       0
tce_imm_match             0
hla_nmdp_6                0
hla_match_c_low           0
rituximab                 0
hla_match_drb1_low        0
hla_match_dqb1_low        0
cyto_score_detail         0
conditioning_intensity    0
ethnicity                 0
obesity                   0
mrd_hct                   0
in_vivo_tcd               0
tce_match                 0
hla_match_a_high          0
hepatic_severe            0
donor_age                 0
prior_tumor               0
hla_match_b_low           0
peptic_ulcer              0
hla_match_a_low           0
gvhd_proph          

In [13]:
# List any rows that still have a missing value in the imputed columns.
print(
    train_df.loc[
        train_df[missing_columns].isna().any(axis=1),
        missing_columns,
    ]
)


Empty DataFrame
Columns: [dri_score, psych_disturb, cyto_score, diabetes, hla_match_c_high, hla_high_res_8, arrhythmia, hla_low_res_6, vent_hist, renal_issue, pulm_severe, hla_high_res_6, cmv_status, hla_high_res_10, hla_match_dqb1_high, tce_imm_match, hla_nmdp_6, hla_match_c_low, rituximab, hla_match_drb1_low, hla_match_dqb1_low, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hla_match_a_high, hepatic_severe, donor_age, prior_tumor, hla_match_b_low, peptic_ulcer, hla_match_a_low, gvhd_proph, rheum_issue, sex_match, hla_match_b_high, comorbidity_score, karnofsky_score, hepatic_mild, tce_div_match, donor_related, melphalan_dose, hla_low_res_8, cardiac, hla_match_drb1_high, pulm_moderate, hla_low_res_10]
Index: []

[0 rows x 50 columns]


In [14]:
# Print the first rows of the imputed columns as a spot check.
print(train_df.head()[missing_columns])


                        dri_score psych_disturb    cyto_score diabetes  \
0  N/A - non-malignant indication            No          Poor       No   
1                    Intermediate            No  Intermediate       No   
2  N/A - non-malignant indication            No          Poor       No   
3                            High            No  Intermediate       No   
4                            High            No          Poor       No   

   hla_match_c_high  hla_high_res_8 arrhythmia  hla_low_res_6 vent_hist  \
0          1.764516        6.876801         No            6.0        No   
1          2.000000        8.000000         No            6.0        No   
2          2.000000        8.000000         No            6.0        No   
3          2.000000        8.000000         No            6.0        No   
4          2.000000        8.000000         No            6.0        No   

  renal_issue  ... karnofsky_score  hepatic_mild          tce_div_match  \
0          No  ...           