In [17]:
import pandas as pd
import numpy as np
from itertools import product

from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [19]:
# Constants
NUM_PATIENTS = 100  # Adjust as needed
ITEMIDS = [50912, 51265, 51222, 51301, 51006, 50983, 50882, 50971]
DAYS = pd.date_range(start="2020-01-01", periods=30)  # Adjust as needed
HOURS = range(24)  # Assuming every hour
SEED = 42  # Pins the synthetic values so Restart & Run All reproduces them

# Generate Complete Grid: one tuple per (patient, day, hour, item) combination.
all_combinations = list(product(range(1000, 1000 + NUM_PATIENTS), DAYS, HOURS, ITEMIDS))

# Create DataFrame from Combinations with Random Values
# NOTE(review): the "stay_id" column is filled from ITEMIDS, so it really holds
# item ids rather than stay ids. The name is kept unchanged here because other
# cells may reference it -- consider renaming it once all usages are checked.
complete_data = pd.DataFrame(
    all_combinations, columns=["subject_id", "chartday", "charthour", "stay_id"]
)

# Draw from a locally seeded generator instead of the global NumPy state, so the
# data (and every metric derived from it) is identical on each fresh run.
rng = np.random.default_rng(SEED)
complete_data["valuenum"] = rng.uniform(0, 100, size=len(all_combinations))

# Pivot to Wide Format: one row per (subject_id, chartday, charthour) and one
# column per distinct value of "stay_id". Every grid cell has exactly one value,
# so aggfunc="first" simply passes that value through.
df_wide = complete_data.pivot_table(
    index=["subject_id", "chartday", "charthour"],
    columns="stay_id",
    values="valuenum",
    aggfunc="first",
).reset_index()

df_wide.head()

stay_id,subject_id,chartday,charthour,50882,50912,50971,50983,51006,51222,51265,51301
0,1000,2020-01-01,0,35.522285,10.316356,64.033938,44.592842,93.889182,57.15839,48.380744,26.849731
1,1000,2020-01-01,1,80.284648,44.395427,91.560461,67.946133,18.949678,47.203333,32.079435,51.09914
2,1000,2020-01-01,2,65.549602,87.484138,57.754636,16.381731,46.793972,18.654142,97.696428,43.43011
3,1000,2020-01-01,3,69.416609,10.493783,69.879887,14.686553,11.062942,0.585439,97.187067,67.702461
4,1000,2020-01-01,4,88.830144,15.369267,18.831549,39.930305,1.248704,11.452894,50.117789,94.884114


In [20]:
def train_xgboost_model(X, y, test_size=0.2, random_state=42):
    """Fit an XGBoost regressor on a random train/test split and score it.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; passed unchanged to ``train_test_split``.
    y : array-like or Series
        Regression target aligned with the rows of ``X``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` to size the held-out set.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    dict
        ``"y_test"`` (held-out targets), ``"predictions"`` (model output for
        the held-out features), and the scalar metrics ``"mse"``, ``"mae"``
        and ``"r2"`` computed on the held-out set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = xgb.XGBRegressor(objective="reg:squarederror")
    model.fit(X_train, y_train)

    # All metrics are computed on the held-out split only.
    predictions = model.predict(X_test)

    return {
        "y_test": y_test,
        "predictions": predictions,
        "mse": mean_squared_error(y_test, predictions),
        "mae": mean_absolute_error(y_test, predictions),
        "r2": r2_score(y_test, predictions),
    }

In [21]:
# Positional layout of df_wide (see the df_wide.head() output): the first three
# columns are the row keys, the last column is used as the regression target,
# and everything in between is a feature.
n_key_cols = 3
target_pos = df_wide.shape[1] - 1

X = df_wide.iloc[:, n_key_cols:target_pos]
y = df_wide.iloc[:, target_pos]

In [22]:
# Fit the regressor on (X, y); `results` holds the held-out targets, the
# predictions and the three summary metrics.
results = train_xgboost_model(X, y)

# Report each metric under its label; `results` stays available for further analysis.
metric_labels = (
    ("Mean Squared Error:", "mse"),
    ("Mean Absolute Error:", "mae"),
    ("R-squared:", "r2"),
)
for label, key in metric_labels:
    print(label, results[key])

Mean Squared Error: 870.82992402363
Mean Absolute Error: 25.348608739302286
R-squared: -0.04466796920863869
