In [None]:
!pip install xgboost
!pip install s3fs

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split,train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,mean_squared_error, mean_absolute_error, r2_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [2]:
def train_xgboost_model(X_train, X_test, y_train, y_test, param_grid=None, cv=3, n_jobs=None):
    """Tune an XGBoost regressor by grid search and score it on a held-out set.

    The data must already be split by the caller; this function does not
    split anything itself.

    Parameters
    ----------
    X_train, y_train
        Features and target used for the cross-validated grid search. The best
        configuration is refit on all of this data by ``GridSearchCV``.
    X_test, y_test
        Held-out features and target used only for the final evaluation.
    param_grid : dict, optional
        Hyperparameter grid passed to ``GridSearchCV``. When omitted, the
        original 3x3x3 grid over ``learning_rate``, ``max_depth`` and
        ``n_estimators`` is used.
    cv : int or CV splitter, optional
        Cross-validation strategy passed to ``GridSearchCV`` (default 3).
    n_jobs : int, optional
        Number of parallel jobs for the grid search. ``None`` keeps the
        scikit-learn default; ``-1`` uses all available cores.

    Returns
    -------
    dict
        ``y_test`` (as passed in), ``predictions`` on ``X_test``, the metrics
        ``mse``, ``mae``, ``r2``, ``std_dev`` (standard deviation of the
        residuals ``y_test - predictions``) and ``best_params``.
    """
    if param_grid is None:
        # Default search space: 27 candidate configurations.
        param_grid = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
            'n_estimators': [50, 100, 200],
        }

    # Base estimator; its hyperparameters are overridden by the grid search.
    model = xgb.XGBRegressor(objective="reg:squarederror")

    # Candidates are ranked by (negated) mean squared error across the folds.
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refit on the full training set.
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)

    # Evaluation metrics on the held-out data.
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    std_dev = np.std(y_test - predictions)  # Standard deviation of residuals

    return {
        "y_test": y_test,
        "predictions": predictions,
        "mse": mse,
        "mae": mae,
        "r2": r2,
        "std_dev": std_dev,
        "best_params": grid_search.best_params_
    }

In [3]:
import boto3

s3 = boto3.client('s3')

# Item id whose CSV export is loaded; it is interpolated into the S3 key below.
itemid = 51221
print(itemid)

# Location of the per-item CSV in S3.
bucket_name = 'sagemaker-studio-905418013525-nvxe84zgs6'
file_path = f"Labrado/alllabs1000adm/alllabs1000adm_{itemid}.csv"

# Stream the object body straight into pandas (no local copy is written).
obj = s3.get_object(Bucket=bucket_name, Key=file_path)
rawdata2 = pd.read_csv(obj['Body'])

# Keep only the rows that actually have a `valuenum`.
rawdata = rawdata2[rawdata2['valuenum'].notna()]

51221


In [7]:
# Echo the item id currently held in the kernel (sanity check).
itemid

51221

In [6]:
# Preview the first rows of the filtered frame to check the column layout.
rawdata.head()

Unnamed: 0.1,Unnamed: 0,hadm_id,chartyear,valuenum,charttime_diff_hours,50971,50971_dt,50983,50983_dt,50902,...,51088,51088_dt,51795,51795_dt,51068,51068_dt,51105,51105_dt,50881,50881_dt
0,11,20000019,2159,26.5,0,3.5,0.0,137.0,0.0,103.0,...,,,,,,,,,,
1,28,20000019,2159,28.1,24,3.7,0.0,136.0,0.0,100.0,...,,,,,,,,,,
2,43,20000019,2159,23.9,49,3.9,0.0,138.0,0.0,102.0,...,,,,,,,,,,
3,73,20000019,2159,24.4,52,3.9,3.0,138.0,3.0,102.0,...,,,,,,,,,,
4,91,20000024,2151,32.1,0,5.2,0.0,140.0,0.0,104.0,...,,,,,,,,,,


In [8]:
# Position of the first candidate column; from there every second column is
# taken. NOTE(review): this relies on the exact column order of the CSV
# (presumably value columns alternating with their `_dt` companions) -- confirm.
sinx = 5
candidate_cols = rawdata.columns[sinx::2]

# Absolute correlation of each candidate with the target, strongest first.
# NOTE(review): this uses every row of `rawdata`, including rows that are held
# out for testing later in the notebook -- consider restricting to training rows.
correlation = (
    rawdata[candidate_cols]
    .corrwith(rawdata['valuenum'])
    .abs()
    .sort_values(ascending=False)
)

# Keep candidates with a strictly positive |r|; NaN correlations fail the
# comparison and are therefore dropped as well.
top_features = correlation.index[correlation > 0].tolist()

# Pair each retained column with its `_dt` companion, then flatten the pairs
# into a single ordered list of feature column names.
names_with_dt = [(name, name + '_dt') for name in top_features]
feature_cols = [col for pair in names_with_dt for col in pair]

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [14]:
# Inspect the absolute correlations with `valuenum`, sorted in descending order.
correlation

51085    0.974197
51222    0.960204
51279    0.909245
51221    0.900372
51706    0.772290
           ...   
51425         NaN
51426         NaN
51505         NaN
51946         NaN
51945         NaN
Length: 375, dtype: float64

In [9]:
# Number of selected feature columns (each retained name plus its `_dt` companion).
len(feature_cols)

576

In [10]:
# Hold-out split on `chartyear`: rows up to and including 2178 are used for
# training, later rows for testing.
condition = rawdata['chartyear'] <= 2178

# Select rows and columns in one .loc call instead of chained indexing.
X_train = rawdata.loc[condition, feature_cols]
X_test = rawdata.loc[~condition, feature_cols]
y_train = rawdata.loc[condition, 'valuenum']
y_test = rawdata.loc[~condition, 'valuenum']

# Timestamps before and after the call show how long the grid search took.
print("Current time:", datetime.now())
model_results = train_xgboost_model(X_train, X_test, y_train, y_test)
print("Current time:", datetime.now())
print(model_results)

Current time: 2024-05-13 17:52:25.860570
Current time: 2024-05-13 18:56:34.087888
{'y_test': 10         38.4
11         38.2
12         39.0
53         37.8
54         27.7
           ... 
2041598    23.1
2041685    45.0
2041686    43.1
2041687    40.3
2041688    37.5
Name: valuenum, Length: 400297, dtype: float64, 'predictions': array([38.4217  , 38.19156 , 38.95583 , ..., 38.550613, 40.230312,
       37.56675 ], dtype=float32), 'mse': 0.5283001752463896, 'mae': 0.22708373333689702, 'r2': 0.9858484382755082, 'std_dev': 0.7268415126115776, 'best_params': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}}


In [11]:
# Re-print the full results dictionary returned by train_xgboost_model
# (this includes the y_test series and the predictions array, not just the metrics).
print(model_results)

{'y_test': 10         38.4
11         38.2
12         39.0
53         37.8
54         27.7
           ... 
2041598    23.1
2041685    45.0
2041686    43.1
2041687    40.3
2041688    37.5
Name: valuenum, Length: 400297, dtype: float64, 'predictions': array([38.4217  , 38.19156 , 38.95583 , ..., 38.550613, 40.230312,
       37.56675 ], dtype=float32), 'mse': 0.5283001752463896, 'mae': 0.22708373333689702, 'r2': 0.9858484382755082, 'std_dev': 0.7268415126115776, 'best_params': {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}}


In [13]:
# Compare the XGBoost predictions against a baseline that simply reuses the
# feature column named after this item id.
# NOTE(review): presumably that column holds the most recent earlier
# measurement of the same item -- confirm against the data preparation.
y_last = X_test[f"{itemid}"]
y_pred = model_results['predictions']

# Score both methods only on rows where the baseline value exists, so the
# comparison is like-for-like.
valid_pos = y_last.notna()
# y_pred is a NumPy array: index it with a plain boolean array, which makes the
# positional (rather than label-based) selection explicit.
valid_mask = valid_pos.to_numpy()

y_test_valid = y_test[valid_pos]
y_last_valid = y_last[valid_pos]
y_pred_valid = y_pred[valid_mask]

mse_last = mean_squared_error(y_test_valid, y_last_valid)
r2_last = r2_score(y_test_valid, y_last_valid)
mse_pred = mean_squared_error(y_test_valid, y_pred_valid)
r2_pred = r2_score(y_test_valid, y_pred_valid)

# (lower, upper) bounds of the normal range, keyed by analyte name or item id.
normalranges = {
  "Crt": (0.7, 1.3),
  "Plt": (150,450),
  "Hgb": (12, 18),
  "Wbc": (4, 11),
  "Ure": (8, 20),
  "50983": (136, 145),
  "Bic": (23, 28),
  "50971": (3.5, 5),
  "50902": (98, 106),
  "51221": (37,50),
}

# Three bands: below, inside and above the normal range. -1 and 10000 act as
# outer sentinels. pd.cut uses right-closed intervals by default, so a value
# exactly equal to the lower bound lands in 'Low'.
normal_low, normal_high = normalranges[f"{itemid}"]
bins = [-1, normal_low, normal_high, 10000]
# Define labels for the categories
labels = ['Low', 'Medium', 'High']
y_test_cat = pd.cut(y_test_valid, bins=bins, labels=labels).astype(str)
y_last_cat = pd.cut(y_last_valid, bins=bins, labels=labels).astype(str)
y_pred_cat = pd.cut(y_pred_valid, bins=bins, labels=labels).astype(str)

print(itemid)
print("normal range", bins)
print("test data shape",X_test.shape)
print("nearest neighbour mse ",round(mse_last,3),"and r2: ",round(r2_last,3))
print("xgboost imputation mse ",round(mse_pred,3),"and r2: ", round(r2_pred,3))

# Rows are the true band and columns the predicted band, both in `labels` order.
# (A stray trailing comma previously turned cm_last into a 1-tuple.)
cm_last = confusion_matrix(y_test_cat, y_last_cat, labels=labels)
print("confusion matrix for nearest neighbor")
print(cm_last)
cm_pred = confusion_matrix(y_test_cat, y_pred_cat, labels=labels)
print("confusion matrix for xgboost")
print(cm_pred)

51221
normal range [-1, 37, 50, 10000]
test data shape (400297, 576)
nearest neighbour mse  6.873 and r2:  0.804
xgboost imputation mse  0.475 and r2:  0.986
confusion matrix for nearest neighbor
(array([[308039,  13148,     43],
       [ 13926,  44022,    337],
       [    38,    356,    488]]),)
confusion matrix for xgboost
[[319981   1248      1]
 [  1174  57066     45]
 [     2     45    835]]
