In [19]:
import pandas as pd
import numpy as np

In [20]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
raw_csv_path = "coffee_cognition_realistic_700.csv"
df = pd.read_csv(raw_csv_path)

In [21]:
# Preview the first five rows of the raw frame as a quick sanity check.
df.head(n=5)

Unnamed: 0,Participant_ID,Age,Gender,Brewing_Method,Time_of_Day,Sleep_Hours,Sleep_Quality_Score,Stress_Level,Physical_Activity_Level,Caffeine_mg,Stroop_Reaction_Time_ms,PVT_Reaction_Time_ms,N_Back_Accuracy,Focus_Level
0,P2000,63,Male,Drip,Evening,8.252212,6.414901,3.79523,3.436774,193.739889,588.863994,180.0,0.685505,1.0
1,P2001,20,Female,French Press,Afternoon,9.5,6.017315,2.822268,2.769816,142.806219,789.798695,227.346446,0.643084,0.835429
2,P2002,46,Male,Cold Brew,Morning,7.481754,7.873164,4.35866,9.246899,194.676359,627.18773,260.7798,0.834411,0.91038
3,P2003,52,Female,Drip,Morning,6.013837,6.35397,10.0,4.209916,151.325693,690.477498,251.77071,0.759299,0.770082
4,P2004,56,Male,Espresso,Morning,6.877482,8.602858,7.757663,4.896166,80.438954,677.910485,309.438359,0.697677,0.66801


In [22]:
# Column-name groups used to organise the dataset's variables.

# Coffee-related inputs.
coffee_features = [
    "Brewing_Method",
    "Caffeine_mg",
    "Time_of_Day",
]

# Sleep, stress and activity inputs.
lifestyle_features = [
    "Sleep_Hours",
    "Sleep_Quality_Score",
    "Stress_Level",
    "Physical_Activity_Level",
]

# Participant demographics.
demographic_features = [
    "Age",
    "Gender",
]

# Outcome columns: two reaction-time measures, one accuracy score and one
# focus score.
targets = [
    "Stroop_Reaction_Time_ms",
    "PVT_Reaction_Time_ms",
    "N_Back_Accuracy",
    "Focus_Level",
]

In [23]:
# One-hot encode the categorical columns. With drop_first=True the first
# level of each column gets no indicator of its own and therefore acts as
# the implicit baseline category.
categorical_columns = ["Brewing_Method", "Time_of_Day", "Gender"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [24]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Continuous predictor columns to rescale; the outcome columns are not in
# this list and therefore keep their original units.
numeric_features = [
    "Age", "Caffeine_mg",
    "Sleep_Hours", "Sleep_Quality_Score",
    "Stress_Level", "Physical_Activity_Level",
]

# Both scalers are instantiated, but only the RobustScaler is applied below.
# NOTE(review): `scaler` is never used in this cell - confirm a later cell
# needs it, otherwise it can be removed.
robust = RobustScaler()
scaler = StandardScaler()

# Fit on every row and overwrite the raw values in df_encoded in place.
# NOTE(review): the scaler is fitted on the full frame; if a train/test split
# happens later, the test rows have already influenced the scaling
# statistics - confirm this is acceptable.
df_encoded[numeric_features] = robust.fit_transform(
    df_encoded[numeric_features]
)

In [25]:
# Interaction feature: element-wise product of caffeine dose and sleep
# quality. When the notebook is run top to bottom, both inputs have already
# been robust-scaled at this point.
df_encoded["Caffeine_x_SleepQuality"] = df_encoded["Caffeine_mg"].mul(
    df_encoded["Sleep_Quality_Score"]
)

In [26]:
# Preview the first five rows of the encoded and scaled frame.
df_encoded.head(n=5)

Unnamed: 0,Participant_ID,Age,Sleep_Hours,Sleep_Quality_Score,Stress_Level,Physical_Activity_Level,Caffeine_mg,Stroop_Reaction_Time_ms,PVT_Reaction_Time_ms,N_Back_Accuracy,Focus_Level,Brewing_Method_Drip,Brewing_Method_Espresso,Brewing_Method_French Press,Brewing_Method_Pour Over,Time_of_Day_Evening,Time_of_Day_Morning,Gender_Male,Caffeine_x_SleepQuality
0,P2000,0.956522,0.852445,-0.003767,-0.707241,-0.781066,0.924902,588.863994,180.0,0.685505,1.0,True,False,False,False,True,False,True,-0.003485
1,P2001,-0.913043,1.581432,-0.182437,-1.061402,-0.99246,0.255743,789.798695,227.346446,0.643084,0.835429,False,False,True,False,False,False,False,-0.046657
2,P2002,0.217391,0.402326,0.651556,-0.50215,1.060463,0.937205,627.18773,260.7798,0.834411,0.91038,False,False,False,False,False,True,True,0.610642
3,P2003,0.478261,-0.455266,-0.031149,1.551318,-0.536018,0.36767,690.477498,251.77071,0.759299,0.770082,True,False,False,False,False,True,False,-0.011452
4,P2004,0.652174,0.049296,0.97947,0.735099,-0.318509,-0.563629,677.910485,309.438359,0.697677,0.66801,False,True,False,False,False,True,True,-0.552058


In [27]:
# Interaction feature: element-wise product of caffeine dose and stress
# level. When the notebook is run top to bottom, both inputs have already
# been robust-scaled at this point.
df_encoded["Caffeine_x_Stress"] = df_encoded["Caffeine_mg"].mul(
    df_encoded["Stress_Level"]
)

In [28]:
# Interaction feature: caffeine dose multiplied by the evening indicator, so
# the value is the caffeine term for evening rows and 0 otherwise.
# NOTE(review): despite its name, the feature only covers the Evening level,
# and it is silently skipped when the indicator column is absent.
evening_column = "Time_of_Day_Evening"
if evening_column in df_encoded.columns:
    is_evening = df_encoded[evening_column]
    df_encoded["Caffeine_x_Time_of_Day"] = df_encoded["Caffeine_mg"] * is_evening

In [32]:
# Quadratic caffeine term, computed from the current values of Caffeine_mg
# (already robust-scaled when the notebook is run top to bottom).
df_encoded["Caffeine_sq"] = df_encoded["Caffeine_mg"] ** 2

# Display the frame's dimensions after feature engineering. The former
# mid-cell `df_encoded.head()` call was removed: only a cell's last
# expression is rendered, so its result was computed and discarded.
df_encoded.shape

(700, 22)

In [33]:
# Pairwise correlations between the caffeine term, its square and the
# caffeine x sleep-quality interaction, as a collinearity check.
caffeine_terms = ["Caffeine_mg", "Caffeine_sq", "Caffeine_x_SleepQuality"]
df_encoded[caffeine_terms].corr()

Unnamed: 0,Caffeine_mg,Caffeine_sq,Caffeine_x_SleepQuality
Caffeine_mg,1.0,0.58663,-0.104306
Caffeine_sq,0.58663,1.0,-0.137354
Caffeine_x_SleepQuality,-0.104306,-0.137354,1.0


In [34]:
# Feature matrix: every column of df_encoded except the four outcomes.
# NOTE(review): this leaves the string identifier `Participant_ID` in `x`;
# confirm it is removed before `x` is passed to an estimator.
x = df_encoded.drop(targets, axis=1)

# One target Series per outcome measure.
y_stroop, y_pvt, y_nback, y_focus = (
    df_encoded["Stroop_Reaction_Time_ms"],
    df_encoded["PVT_Reaction_Time_ms"],
    df_encoded["N_Back_Accuracy"],
    df_encoded["Focus_Level"],
)

In [35]:
# Show (rows, columns) of the feature matrix as a sanity check that only the
# target columns were dropped.
x.shape

(700, 18)

In [36]:
# Persist the full processed frame (features and outcome columns together)
# to a CSV in the working directory, without writing the row index.
processed_csv_path = "coffee_cognition_realistic_processed.csv"
df_encoded.to_csv(processed_csv_path, index=False)