In [17]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, ElasticNet, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import warnings
import optuna
warnings.filterwarnings(action='ignore')
import statsmodels.api as sm

In [10]:
X_train = pd.read_csv("X_train_new.csv")
X_train = X_train.dropna(axis = 1)
y_train = pd.read_csv("Y_train_T2.csv")
y_train = y_train["los_icu"]

X_valid = pd.read_csv("X_valid_new.csv")
X_valid = X_valid.dropna(axis = 1)
y_valid = pd.read_csv("Y_valid_T2.csv")
y_valid = y_valid["los_icu"]

X_train = X_train.append(X_valid)
y_train = y_train.append(y_valid)

In [20]:
X_train = pd.read_csv("aggregate_train_2.csv")
#X_train = X_train.drop(labels = ['Unnamed: 0'], axis = 1)
X_train = X_train.dropna(axis = 1)
y_train = pd.read_csv("Y_train_T2.csv")
#y_train = y_train.drop(labels = ['Unnamed: 0'], axis = 1)
y_train = y_train["los_icu"]

X_valid = pd.read_csv("aggregate_valid_2.csv")
X_valid = X_valid.dropna(axis = 1)
#X_valid = X_valid.drop(labels = ['Unnamed: 0'], axis = 1)
y_valid = pd.read_csv("Y_valid_T2.csv")
#y_valid = y_valid.drop(labels = ['Unnamed: 0'], axis = 1)
y_valid = y_valid["los_icu"]

X_train = X_train.append(X_valid)
y_train = y_train.append(y_valid)

In [21]:
def objective(trial,data=X_train,target=y_train):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'l1_ratio': trial.suggest_loguniform('l1_ratio', 1e-4, 1),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 10.0),
    }
    model = ElasticNet(**param)  
    
    model.fit(train_x,train_y)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    
    return rmse

In [22]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-11-26 11:39:21,706][0m A new study created in memory with name: no-name-1f5a0df5-527e-4fc6-a0e2-9e3d7c6499f0[0m
[32m[I 2022-11-26 11:39:21,898][0m Trial 0 finished with value: 1.925888834103422 and parameters: {'l1_ratio': 0.3482584901090277, 'alpha': 5.281215387149903}. Best is trial 0 with value: 1.925888834103422.[0m
[32m[I 2022-11-26 11:39:47,867][0m Trial 1 finished with value: 1.7827728929309066 and parameters: {'l1_ratio': 0.004662233306528392, 'alpha': 0.012517659598354986}. Best is trial 1 with value: 1.7827728929309066.[0m
[32m[I 2022-11-26 11:40:12,125][0m Trial 2 finished with value: 1.7891013911885612 and parameters: {'l1_ratio': 0.0009698164679292429, 'alpha': 0.0006623038190420491}. Best is trial 1 with value: 1.7827728929309066.[0m
[32m[I 2022-11-26 11:40:37,463][0m Trial 3 finished with value: 1.7885168220083967 and parameters: {'l1_ratio': 0.001569494358987349, 'alpha': 0.0008946544186447415}. Best is trial 1 with value: 1.7827728929309066.[

Number of finished trials: 30
Best trial: {'l1_ratio': 0.2821949693938196, 'alpha': 0.019954749735800376}


In [23]:
model = ElasticNet(alpha=0.2821949693938196, l1_ratio=0.019954749735800376)
model.fit(X_train, y_train)
y_pred_valid = model.predict(X_valid)
y_pred_train = model.predict(X_train)
print('RMSE of train:' , mean_squared_error(y_train, y_pred_train, squared=False))
print('RMSE of valid:' , mean_squared_error(y_valid, y_pred_valid, squared=False))

RMSE of train: 1.8026250685354024
RMSE of valid: 1.795668722903245


In [24]:
X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2).fit_regularized(alpha=0.2821949693938196, L1_wt=0.019954749735800376, refit=True)
est.summary()

0,1,2,3
Dep. Variable:,los_icu,R-squared:,0.18
Model:,OLS,Adj. R-squared:,0.175
Method:,Least Squares,F-statistic:,32.39
Date:,"Sat, 26 Nov 2022",Prob (F-statistic):,0.0
Time:,11:49:17,Log-Likelihood:,-38343.0
No. Observations:,19154,AIC:,76950.0
Df Residuals:,19025,BIC:,77970.0
Df Model:,129,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8770,0.111,25.945,0.000,2.660,3.094
glucose_mean,0,0,,,0,0
glucose_mask,-0.0035,0.005,-0.766,0.443,-0.013,0.006
glucose_range,-0.0304,0.036,-0.840,0.401,-0.101,0.040
glucose_std,0.1263,0.117,1.084,0.278,-0.102,0.355
hematocrit_mean,0,0,,,0,0
hematocrit_mask,-0.0320,0.011,-2.932,0.003,-0.053,-0.011
hematocrit_range,0.0480,0.031,1.533,0.125,-0.013,0.109
hematocrit_std,0,0,,,0,0

0,1,2,3
Omnibus:,3494.092,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6090.684
Skew:,1.185,Prob(JB):,0.0
Kurtosis:,4.421,Cond. No.,
