In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
import math
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score

warnings.filterwarnings('ignore')


In [73]:
# dataframe from Notebook.ipynb that hosts glucose levels and step count for each user
raw_df = pd.read_csv("./Datasets/glucose_steps.csv")

In [74]:
# replaces all NaN steps with 0 (no steps) and all NaN glucose with the user's rolling mean. Some steps occur before glucose is calculated, thus the user's global mean is used
def fix_NaN(df):
    df = df.sort_values(["user_id", "timestamp"])

    df["val1"] = df["val1"].fillna(0)

    df["glucose_running_mean"] = (df.groupby("user_id", group_keys=False)["glucose"].apply(lambda x: x.expanding().mean()))

    df["glucose"] = df["glucose"].fillna(df["glucose_running_mean"])
    global_mean = df["glucose"].mean()
    df["glucose"] = df["glucose"].fillna(global_mean)


    df.drop(columns=["glucose_running_mean"], inplace=True)

    return df

In [75]:
# encodes user_id as a category, specifies X and Y values.
df = fix_NaN(raw_df)
#df.info()
df["user_id"] = df["user_id"].astype("category") 
df["user_id_enc"] = df["user_id"].cat.codes 

X = df[["val1", "user_id_enc"]]
y = df["glucose"]

In [76]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,        # 20% goes to test
    random_state=42
)

model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Fit the model
model.fit(X_train, y_train)

In [77]:
y_pred = model.predict(X_test)

# metrics, F-1 is used in the classification file
mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
r2 = sklearn.metrics.r2_score(y_test, y_pred)

print("Random Forest Regression Results:")
print("MSE:", mse)
print("RMSE:", math.sqrt(mse))
print("R^2:", r2)

Random Forest Regression Results:
MSE: 438.82065399320965
RMSE: 20.948046543609017
R^2: 0.2695402685195145
