In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb


In [16]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


In [6]:
# Quick look at the training data.
# A notebook cell only renders the value of its *last* expression, so a bare
# `train_df.head()` followed by further statements is evaluated and thrown away.
# display() renders the preview explicitly.
display(train_df.head())  # first few rows
train_df.info()           # dtypes and non-null counts (prints directly)
train_df.describe()       # statistical summary; rendered as the cell's value


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

Unnamed: 0,ID,hla_match_c_high,hla_high_res_8,hla_low_res_6,hla_high_res_6,hla_high_res_10,hla_match_dqb1_high,hla_nmdp_6,hla_match_c_low,hla_match_drb1_low,...,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time
count,28800.0,24180.0,22971.0,25530.0,23516.0,21637.0,23601.0,24603.0,26000.0,26157.0,...,28800.0,26410.0,24712.0,28323.0,27930.0,25147.0,25448.0,23736.0,28800.0,28800.0
mean,14399.5,1.764516,6.876801,5.143322,5.109202,8.61723,1.736876,5.160346,1.757808,1.715296,...,38.663162,1.709087,1.69962,1.702327,83.83208,6.903448,1.707128,8.664687,0.539306,23.237678
std,8313.988213,0.431941,1.564313,1.207757,1.214162,1.905125,0.447687,1.20324,0.435453,0.451282,...,21.147581,0.458259,0.46518,1.994443,11.02884,1.565017,0.461179,1.882746,0.498461,24.799748
min,0.0,0.0,2.0,2.0,0.0,3.0,0.0,2.0,0.0,1.0,...,0.044,0.0,0.0,0.0,40.0,2.0,0.0,4.0,0.0,0.333
25%,7199.75,2.0,6.0,4.0,4.0,7.0,1.0,4.0,2.0,1.0,...,19.539,1.0,1.0,0.0,70.0,6.0,1.0,7.0,0.0,5.61975
50%,14399.5,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,41.006,2.0,2.0,1.0,90.0,8.0,2.0,10.0,1.0,9.7965
75%,21599.25,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,55.96525,2.0,2.0,2.0,90.0,8.0,2.0,10.0,1.0,35.1
max,28799.0,2.0,8.0,6.0,6.0,10.0,2.0,6.0,2.0,2.0,...,73.726,2.0,2.0,10.0,100.0,8.0,2.0,10.0,1.0,156.819


In [7]:
# Count the null entries in each column of the training frame
train_df.isna().sum()


ID                            0
dri_score                   154
psych_disturb              2062
cyto_score                 8068
diabetes                   2119
hla_match_c_high           4620
hla_high_res_8             5829
tbi_status                    0
arrhythmia                 2202
hla_low_res_6              3270
graft_type                    0
vent_hist                   259
renal_issue                1915
pulm_severe                2135
prim_disease_hct              0
hla_high_res_6             5284
cmv_status                  634
hla_high_res_10            7163
hla_match_dqb1_high        5199
tce_imm_match             11133
hla_nmdp_6                 4197
hla_match_c_low            2800
rituximab                  2148
hla_match_drb1_low         2643
hla_match_dqb1_low         4194
prod_type                     0
cyto_score_detail         11923
conditioning_intensity     4789
ethnicity                   587
year_hct                      0
obesity                    1760
mrd_hct 