In [2]:
!pip install xgboost
!pip install s3fs

Collecting s3fs
  Using cached s3fs-2024.3.1-py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.3.1 (from s3fs)
  Using cached fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Using cached s3fs-2024.3.1-py3-none-any.whl (29 kB)
Using cached fsspec-2024.3.1-py3-none-any.whl (171 kB)
Installing collected packages: fsspec, s3fs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.6.0
    Uninstalling fsspec-2023.6.0:
      Successfully uninstalled fsspec-2023.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-ai 2.12.0 requires faiss-cpu, which is not installed.
datasets 2.18.0 requires fsspec[http]<=2024.2.0,>=2023.1.0, but you have fsspec 2024.3.1 which is incompatible.
jupyter-scheduler 2.5.1 requires fsspec==2023.6.0, but you have fsspec 2024.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-20

In [3]:
import matplotlib.pyplot as plt
import s3fs
import pandas as pd
import numpy as np
from itertools import product
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split,train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,mean_squared_error, mean_absolute_error, r2_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [4]:
def train_xgboost_model(X_train, X_test, y_train, y_test, param_grid=None, cv=3):
    """Tune an XGBoost regressor by grid search and score it on a hold-out set.

    The caller supplies an already-split train/test partition; this function
    performs no splitting of its own. Hyper-parameters are selected with
    ``GridSearchCV`` (scored by negative mean squared error) on the training
    data only, and the resulting best estimator is evaluated on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``GridSearchCV.fit``.
    X_test, y_test
        Hold-out features and target used only for the final evaluation.
    param_grid : dict, optional
        Hyper-parameter grid handed to ``GridSearchCV``. When omitted, the
        default grid over ``learning_rate``, ``max_depth`` and
        ``n_estimators`` defined below is used.
    cv : int, optional
        Cross-validation setting forwarded to ``GridSearchCV`` (default 3).

    Returns
    -------
    dict
        ``"y_test"``      - the ``y_test`` argument, returned unchanged;
        ``"predictions"`` - best-model predictions for ``X_test``;
        ``"mse"``, ``"mae"``, ``"r2"`` - test-set error metrics;
        ``"std_dev"``     - standard deviation of the residuals
        ``y_test - predictions``;
        ``"best_params"`` - the winning hyper-parameter combination.
    """
    # Default search space: 3 x 3 x 3 = 27 candidate settings.
    if param_grid is None:
        param_grid = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
            'n_estimators': [50, 100, 200],
        }

    # Define the XGBoost model
    model = xgb.XGBRegressor(objective="reg:squarederror")

    # Use GridSearchCV to find the best hyperparameters. Every candidate is
    # fitted once per fold, so the cost grows with len(grid) * cv.
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=cv)
    grid_search.fit(X_train, y_train)

    # Best estimator found by the search (available because GridSearchCV
    # refits on the full training data by default).
    best_model = grid_search.best_estimator_

    # Make predictions on the test set using the best model
    predictions = best_model.predict(X_test)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    std_dev = np.std(y_test - predictions)  # Standard deviation of residuals

    return {
        "y_test": y_test,
        "predictions": predictions,
        "mse": mse,
        "mae": mae,
        "r2": r2,
        "std_dev": std_dev,
        "best_params": grid_search.best_params_
    }

In [None]:
# ---------------------------------------------------------------------------
# Configuration: which lab column to impute and where to split train/test.
# ---------------------------------------------------------------------------
LabName = 'Crt'
# Threshold on the 'chartyear' column: rows with chartyear <= split_index are
# used for training, the remaining rows for testing.
split_index = 2179
# NOTE(review): the bucket name is hard-coded here; consider moving it to a
# configuration constant before sharing the notebook.
AllData = pd.read_csv(f"s3://sagemaker-studio-905418013525-nvxe84zgs6/Labrado/rl8/rl8_{LabName}.csv")
condition = AllData['chartyear'] <= split_index
TrainData = AllData[condition]
TestData = AllData[~condition]
feature_cols = ['Bic', 'Bic_timedif', 'Crt', 'Crt_timedif', 'Hgb', 'Hgb_timedif', 'Plt', 'Plt_timedif', 'Pot', 'Pot_timedif', 'Sod', 'Sod_timedif', 'Ure', 'Ure_timedif', 'Wbc', 'Wbc_timedif']
target_col = 'valuenum'
X_train = TrainData[feature_cols]
y_train = TrainData[target_col]
X_test = TestData[feature_cols]
y_test = TestData[target_col]

# ---------------------------------------------------------------------------
# Train and evaluate; the timestamps bracket the (slow) grid search.
# ---------------------------------------------------------------------------
print("Current time:", datetime.now())
model_results = train_xgboost_model(X_train, X_test, y_train, y_test)
print("Current time:", datetime.now())
print(model_results)


# ---------------------------------------------------------------------------
# Baseline: use the value already stored in the LabName feature column as the
# prediction and compare it with the model on the same rows.
# ---------------------------------------------------------------------------
# y_test.index holds index *labels* of AllData, so the lookup must be
# label-based (.loc). Positional .iloc only gave the same rows while the
# frame kept its default RangeIndex.
y_last = AllData[LabName].loc[model_results['y_test'].index]
y_test = model_results['y_test']
y_pred = model_results['predictions']
# Rows where the baseline column is missing are excluded from every metric so
# that both methods are scored on the same subset.
valid_pos = y_last.notna()


mse_last = mean_squared_error(y_test[valid_pos], y_last[valid_pos])
r2_last = r2_score(y_test[valid_pos], y_last[valid_pos])
mse_pred = mean_squared_error(y_test[valid_pos], y_pred[valid_pos])
r2_pred = r2_score(y_test[valid_pos], y_pred[valid_pos])
print(LabName)
print("nearest neighbour mse ",round(mse_last,3),"and r2: ",round(r2_last,3))
print("xgboost imputation mse ",round(mse_pred,3),"and r2: ", round(r2_pred,3))



# ---------------------------------------------------------------------------
# Categorical comparison: bucket values into Low / Medium / High using the
# (lower, upper) bounds listed for each lab.
# ---------------------------------------------------------------------------
normalranges = {
  "Crt": (0.7, 1.3),
  "Plt": (150,450),
  "Hgb": (12, 18),
  "Wbc": (4, 11),
  "Ure": (8, 20),
  "Sod": (136, 145),
  "Bic": (23, 28),
  "Pot": (3.5, 5),
}

# Outer edges -1 and 10000 act as catch-all bounds. Values outside them are
# left unbinned by pd.cut, become the string 'nan' after astype(str), and are
# therefore not counted in the confusion matrices (which list only `labels`).
bins = [-1,normalranges[LabName][0],normalranges[LabName][1],10000]
# Define labels for the categories
labels = ['Low', 'Medium', 'High']
y_test_cat = pd.cut(y_test[valid_pos],bins=bins, labels=labels).astype(str)
y_last_cat = pd.cut(y_last[valid_pos],bins=bins, labels=labels).astype(str)
y_pred_cat = pd.cut(y_pred[valid_pos],bins=bins, labels=labels).astype(str)
print(LabName)
print(bins)
print("nearest neighbour mse ",round(mse_last,3),"and r2: ",round(r2_last,3))
print("xgboost imputation mse ",round(mse_pred,3),"and r2: ", round(r2_pred,3))
# No trailing comma here: a trailing comma would wrap the matrix in a 1-tuple.
cm_last = confusion_matrix(y_test_cat,y_last_cat,labels=labels)
print("confusion matrix for nearest neighbor")
print(cm_last)
cm_pred = confusion_matrix(y_test_cat,y_pred_cat,labels=labels)
print("confusion matrix for xgboost")
print(cm_pred)

Current time: 2024-05-10 21:04:34.802981


In [6]:
# Re-print the summary for the current lab. This cell relies on LabName, bins,
# the metric values, labels and the *_cat arrays computed in the previous cell.
print(LabName)
print(bins)
print("nearest neighbour mse ",round(mse_last,3),"and r2: ",round(r2_last,3))
print("xgboost imputation mse ",round(mse_pred,3),"and r2: ", round(r2_pred,3))
# No trailing comma here: a trailing comma would turn cm_last into a 1-tuple
# and print it as "(array(...),)" instead of a plain matrix.
cm_last = confusion_matrix(y_test_cat,y_last_cat,labels=labels)
print("confusion matrix for nearest neighbor")
print(cm_last)
cm_pred = confusion_matrix(y_test_cat,y_pred_cat,labels=labels)
print("confusion matrix for xgboost")
print(cm_pred)

Crt
[-1, 0.7, 1.3, 10000]
nearest neighbour mse  0.213 and r2:  0.918
xgboost imputation mse  0.151 and r2:  0.942
confusion matrix for nearest neighbor
(array([[ 87894,  13386,     75],
       [ 13187, 121787,   8296],
       [    66,   7402, 106249]]),)
confusion matrix for xgboost
[[ 73341  27944     70]
 [  5141 126140  11989]
 [    33   5214 108470]]
