In [28]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [21]:
# dataframe from Notebook.ipynb that hosts glucose levels and step count for each user
# replaces all NaN steps with 0 (no steps) and all NaN glucose with the user's rolling mean. Some steps occur before glucose is calculated, thus the user's global mean is used

raw_df = pd.read_csv("../Datasets/glucose_steps.csv")
def fix_NaN(df):
    df = df.sort_values(["user_id", "timestamp"])

    df["val1"] = df["val1"].fillna(0)

    df["glucose_running_mean"] = (df.groupby("user_id", group_keys=False)["glucose"].apply(lambda x: x.expanding().mean()))

    df["glucose"] = df["glucose"].fillna(df["glucose_running_mean"])
    global_mean = df["glucose"].mean()
    df["glucose"] = df["glucose"].fillna(global_mean)


    df.drop(columns=["glucose_running_mean"], inplace=True)

    return df

In [22]:
# encodes user_id as a category

df = fix_NaN(raw_df)
#df.info()
df["user_id"] = df["user_id"].astype("category") 
df["user_id_enc"] = df["user_id"].cat.codes 
df = df.sort_values(["user_id", "timestamp"])

In [23]:
df = df.sort_values(["user_id", "timestamp"])

# finds user baselines
df["baseline_glucose"] = df.groupby("user_id")["glucose"].transform("mean")
df["baseline_val1"] = df.groupby("user_id")["val1"].transform("mean")

# finds change from baseline
df["val1_delta"] = df["val1"] - df["baseline_val1"]

# encodes glucsoe levels of greater than 140 mg/dL as high (1) where normal is (0); this is the classification target
df["glucose_binary"] = (df["glucose"] > 140).astype(int)

  df["baseline_glucose"] = df.groupby("user_id")["glucose"].transform("mean")
  df["baseline_val1"] = df.groupby("user_id")["val1"].transform("mean")


In [30]:
train_dfs = []
test_dfs = []

for user in df["user_id"].unique():
    user_df = df[df["user_id"] == user].copy()
    cutoff = int(0.8 * len(user_df))  # partitions data into train and test (0.8, 0.2)
    user_train = user_df.iloc[:cutoff]
    user_test = user_df.iloc[cutoff:]
    train_dfs.append(user_train)
    test_dfs.append(user_test)

df_train = pd.concat(train_dfs)
df_test = pd.concat(test_dfs)

# could change val1 to val1_delta to take into account usual activity levels but I dont think thats the point. Also, could take into account baseline glucose but I also dont think thats the point
feature_cols = ["val1", "user_id_enc"]
X_train = df_train[feature_cols]
y_train = df_train["glucose_binary"]
X_test = df_test[feature_cols]
y_test = df_test["glucose_binary"]

xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

n=50
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 0.1,
}

model = xgb.train(params=params,dtrain=xgb_train,num_boost_round=n)

preds = model.predict(xgb_test)
preds = preds.astype(int)
accuracy= accuracy_score(y_test,preds)
print('Accuracy of the model is:', accuracy*100)


print(sklearn.metrics.classification_report(y_test, preds))

# IMPORTANT: this model classifies specific instances of high blood sugar based on user-specific trends. Modifications must be made to predict if a user has diabetes or does not. The dataset does not say which user is diabetic but it is probably user 14 or 15. 

Accuracy of the model is: 91.06246514221974
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      6531
           1       0.00      0.00      0.00       641

    accuracy                           0.91      7172
   macro avg       0.46      0.50      0.48      7172
weighted avg       0.83      0.91      0.87      7172



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
