In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the dataset
file_path = 'Datasheet_kaggle.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Drop irrelevant columns
data = data.drop(columns=['eid', 'discharged'])

# Convert admit date to datetime format and extract useful features
data['vdate'] = pd.to_datetime(data['vdate'], errors='coerce')
data['admit_month'] = data['vdate'].dt.month
data['admit_day'] = data['vdate'].dt.day
data['admit_weekday'] = data['vdate'].dt.weekday
data = data.drop(columns=['vdate'])



In [3]:
data

Unnamed: 0,rcount,gender,dialysisrenalendstage,asthma,irondef,pneum,substancedependence,psychologicaldisordermajor,depress,psychother,...,bloodureanitro,creatinine,bmi,pulse,respiration,secondarydiagnosisnonicd9,lengthofstay,admit_month,admit_day,admit_weekday
0,0,F,0,0,0,0,0,0,0,0,...,12.0,1.390722,30.432418,96,6.5,4,3,8.0,29.0,2.0
1,5+,F,0,0,0,0,0,0,0,0,...,8.0,0.943164,28.460516,61,6.5,1,7,5.0,26.0,5.0
2,1,F,0,0,0,0,0,0,0,0,...,12.0,1.065750,28.843812,64,6.5,2,3,9.0,22.0,5.0
3,0,F,0,0,0,0,0,0,0,0,...,12.0,0.906862,27.959007,76,6.5,1,1,,,
4,0,F,0,0,0,1,0,1,0,0,...,11.5,1.242854,30.258927,67,5.6,2,4,12.0,20.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3,M,0,0,0,0,0,0,0,0,...,12.0,0.650323,30.063069,80,6.5,1,6,1.0,28.0,5.0
99996,0,M,0,0,0,0,0,0,0,0,...,12.0,1.521424,28.969548,61,6.5,1,1,,,
99997,1,M,0,0,1,0,0,0,0,0,...,12.0,1.025677,26.354919,61,6.9,1,4,7.0,23.0,0.0
99998,0,M,0,0,0,0,0,0,1,0,...,16.0,1.035400,29.193462,59,5.6,1,4,12.0,19.0,2.0


In [4]:
# Encode categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    data[col] = label_encoders[col].fit_transform(data[col])

# Handle missing values
imputer = SimpleImputer(strategy='mean')  # Replace missing values with column mean
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Separate features and target variable
X = data_imputed.drop(columns=['lengthofstay'])
y = data_imputed['lengthofstay']

# Normalize numerical features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Models to evaluate
models = {
    "Ridge": Ridge(random_state=42),
    "Lasso": Lasso(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
}



In [16]:
X.shape

(100000, 26)

In [17]:
# Train and evaluate each model
results = {}
for name, model in models.items():
    print("Name:", name)
    # Cross-validation R² scores
    cv_r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    print("cv_r2_scores: ", cv_r2_scores)
    cv_mean_r2 = cv_r2_scores.mean()
    cv_std_r2 = cv_r2_scores.std()
    
    # Train the model and predict on the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluate the model on the test set
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store the results
    results[name] = {
        "Cross-Validation R² Scores": cv_r2_scores.tolist(),
        "Mean R² (CV)": cv_mean_r2,
        "Std R² (CV)": cv_std_r2,
        "MAE (Test)": mae,
        "MSE (Test)": mse,
        "R² (Test)": r2
    }
    print(results[name])

# Print the results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")


Name: Ridge
cv_r2_scores:  [0.75221853 0.7469107  0.74736621 0.74696524 0.7508431 ]
{'Cross-Validation R² Scores': [0.7522185291377053, 0.7469106950614963, 0.7473662087702453, 0.7469652351123787, 0.7508430955554111], 'Mean R² (CV)': 0.7488607527274473, 'Std R² (CV)': 0.002228619262164039, 'MAE (Test)': 0.8910729337338668, 'MSE (Test)': 1.3596255924244742, 'R² (Test)': 0.7522185291377051}
Name: Lasso
cv_r2_scores:  [0.38724523 0.38083089 0.38346459 0.3806891  0.37875796]
{'Cross-Validation R² Scores': [0.3872452271712308, 0.3808308883655763, 0.3834645856510258, 0.380689097433087, 0.3787579612735833], 'Mean R² (CV)': 0.38219755197890065, 'Std R² (CV)': 0.002934565996614834, 'MAE (Test)': 1.4781064828566512, 'MSE (Test)': 3.362305777421291, 'R² (Test)': 0.3872452271712309}
Name: Random Forest
cv_r2_scores:  [0.92678786 0.92365149 0.92453465 0.92493147 0.92360582]
{'Cross-Validation R² Scores': [0.92678786370174, 0.923651487692647, 0.9245346533733272, 0.9249314700830892, 0.923605824188653]

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



cv_r2_scores:  [0.96177549 0.96237566 0.96210499 0.96343753 0.96350495]


Parameters: { "use_label_encoder" } are not used.



{'Cross-Validation R² Scores': [0.9617754907531301, 0.9623756632206772, 0.9621049911547274, 0.9634375314791517, 0.9635049513306905], 'Mean R² (CV)': 0.9626397255876753, 'Std R² (CV)': 0.0007053625342384956, 'MAE (Test)': 0.337807980170846, 'MSE (Test)': 0.20585301038427242, 'R² (Test)': 0.9624848473148466}

Model: Ridge
Cross-Validation R² Scores: [0.7522185291377053, 0.7469106950614963, 0.7473662087702453, 0.7469652351123787, 0.7508430955554111]
Mean R² (CV): 0.7488607527274473
Std R² (CV): 0.002228619262164039
MAE (Test): 0.8910729337338668
MSE (Test): 1.3596255924244742
R² (Test): 0.7522185291377051

Model: Lasso
Cross-Validation R² Scores: [0.3872452271712308, 0.3808308883655763, 0.3834645856510258, 0.380689097433087, 0.3787579612735833]
Mean R² (CV): 0.38219755197890065
Std R² (CV): 0.002934565996614834
MAE (Test): 1.4781064828566512
MSE (Test): 3.362305777421291
R² (Test): 0.3872452271712309

Model: Random Forest
Cross-Validation R² Scores: [0.92678786370174, 0.923651487692647, 0