In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
import math
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score

warnings.filterwarnings('ignore')

In [7]:
# Load the dataframe produced by Notebook.ipynb, which holds glucose levels and step counts for each user.
# fix_NaN replaces all NaN steps with 0 (no steps) and all NaN glucose with the user's running (expanding) mean. Some steps occur before the user's first glucose reading, where no running mean exists yet, so an overall mean is used for those rows.

# Relative path: resolved against the notebook's current working directory.
raw_df = pd.read_csv(filepath_or_buffer="group_1/Datasets/glucose_steps.csv")
def fix_NaN(df):
    """Return a sorted copy of ``df`` with missing steps and glucose imputed.

    The frame is ordered by ``user_id`` then ``timestamp`` before imputing,
    so the running mean follows each user's timeline. The input frame is
    not modified.

    Imputation rules:
      * ``val1`` (steps): NaN becomes 0.
      * ``glucose``: NaN is filled, in order of preference, with
          1. the user's running (expanding) mean of the readings seen so far,
          2. the user's mean over all of their observed readings (covers rows
             that precede the user's first reading),
          3. the mean of all observed readings in the dataset (covers users
             with no readings at all).

    All means are computed from observed readings only, before any value is
    filled in, so imputed values never feed back into a fallback mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``timestamp``, ``val1`` and ``glucose``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy with the same columns as ``df``; ``val1`` has no NaN, and
        ``glucose`` has no NaN unless the input has no glucose readings at all.
    """
    # sort_values returns a new frame, so the caller's data stays untouched.
    df = df.sort_values(["user_id", "timestamp"])

    df["val1"] = df["val1"].fillna(0)

    glucose_by_user = df.groupby("user_id")["glucose"]
    # NaN at a row whose user has no earlier reading; handled by the fallbacks.
    running_mean = glucose_by_user.transform(lambda s: s.expanding().mean())
    user_mean = glucose_by_user.transform("mean")
    dataset_mean = df["glucose"].mean()

    df["glucose"] = (
        df["glucose"]
        .fillna(running_mean)
        .fillna(user_mean)
        .fillna(dataset_mean)
    )

    return df

In [8]:
# Impute missing values, convert user_id to a pandas categorical, and keep the
# categorical's integer codes in user_id_enc.
df = (
    fix_NaN(raw_df)
    .assign(user_id=lambda frame: frame["user_id"].astype("category"))
    .assign(user_id_enc=lambda frame: frame["user_id"].cat.codes)
    .sort_values(["user_id", "timestamp"])
)

In [9]:
df = df.sort_values(["user_id", "timestamp"])

# Per-user baselines: every row carries its user's mean glucose and mean val1.
by_user = df.groupby("user_id")
df = df.assign(
    baseline_glucose=by_user["glucose"].transform("mean"),
    baseline_val1=by_user["val1"].transform("mean"),
)

# val1_delta: how far each val1 sits from that user's baseline.
# glucose_binary: classification target, 1 (high) when glucose is above
# 140 mg/dL and 0 (normal) otherwise.
df = df.assign(
    val1_delta=df["val1"] - df["baseline_val1"],
    glucose_binary=df["glucose"].gt(140).astype(int),
)

In [10]:
# Split each user's rows by position: the first 80% go to train and the
# remaining 20% go to test.
user_frames = [
    df[df["user_id"] == user].copy()
    for user in df["user_id"].unique()
]
cutoffs = [int(0.8 * len(user_df)) for user_df in user_frames]

train_dfs = [user_df.iloc[:cutoff] for user_df, cutoff in zip(user_frames, cutoffs)]
test_dfs = [user_df.iloc[cutoff:] for user_df, cutoff in zip(user_frames, cutoffs)]

df_train, df_test = pd.concat(train_dfs), pd.concat(test_dfs)

# Design note: val1_delta (activity relative to the user's usual level) or
# baseline_glucose could be used as features instead, but that is probably not
# the point here, so the raw val1 and the encoded user id are used.
feature_cols = ["val1", "user_id_enc"]
X_train, X_test = df_train[feature_cols], df_test[feature_cols]
y_train, y_test = df_train["glucose_binary"], df_test["glucose_binary"]

# Random forest with balanced class weights and a fixed seed.
clf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
clf.fit(X_train, y_train)

# Predict the high/normal glucose label for the held-out rows and report
# per-class precision, recall and F1.
y_pred = clf.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred))

# IMPORTANT: this model classifies individual instances of high blood sugar based on user-specific trends; it does not predict whether a user has diabetes, and it would have to be modified to do so. The dataset does not say which user is diabetic, but it is probably user 14 or 15.

              precision    recall  f1-score   support

           0       0.96      0.71      0.82      6531
           1       0.19      0.70      0.30       641

    accuracy                           0.71      7172
   macro avg       0.58      0.71      0.56      7172
weighted avg       0.89      0.71      0.77      7172

