### Stroop Test ML Analysis

This notebook explores how we can predict user attention states (like being focused or distracted) based on reaction time, accuracy, and metadata collected during the Stroop test.

---

In [5]:
#### 🔧 Step 1: Imports and Data Loading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the data
file_path = "data/fake_results_for_ml.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,trial,word,font_color,response,correct,reaction_time,congruent,tiredness,caffeine,focus,adhd,timestamp,session_id
0,1,Yellow,Blue,Yellow,False,1.306,False,10,Yes,1,Yes,20250710_150809,2ae1a69f_20250710_150809
1,2,Yellow,Yellow,Blue,False,1.838,True,10,Yes,1,Yes,20250710_150809,2ae1a69f_20250710_150809
2,3,Green,Blue,Red,False,1.091,False,10,Yes,1,Yes,20250710_150809,2ae1a69f_20250710_150809
3,4,Blue,Green,Blue,False,0.342,False,10,Yes,1,Yes,20250710_150809,2ae1a69f_20250710_150809
4,5,Blue,Blue,Yellow,False,2.044,True,10,Yes,1,Yes,20250710_150809,2ae1a69f_20250710_150809


In [6]:
#### 🧼 Step 2: Data Cleaning
# Encode categorical variables
df["caffeine"] = df["caffeine"].map({"Yes": 1, "No": 0})
df["adhd"] = df["adhd"].map({"Yes": 1, "No": 0, "Prefer not to say": -1})
df["congruent"] = df["congruent"].astype(int)
df["correct"] = df["correct"].astype(int)

# Group by session
session_df = df.groupby("session_id").agg({
    "reaction_time": "mean",
    "correct": "mean",
    "congruent": "mean",
    "tiredness": "first",
    "focus": "first",
    "caffeine": "first",
    "adhd": "first"
}).reset_index()

# Create attention_state label
session_df["attention_state"] = session_df.apply(
    lambda row: "distracted" if (row["correct"] < 0.7 or row["reaction_time"] > 2.5 or row["focus"] < 4) else "focused",
    axis=1
)


In [7]:
#### 🧠 Step 4: Train ML Model
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

# Features and label
X = session_df.drop(columns=["session_id", "attention_state"])
y = session_df["attention_state"]

# Train-test split (optional here)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate (optional)
print(classification_report(y_test, rf.predict(X_test)))

# Save model
os.makedirs("model", exist_ok=True)
joblib.dump(rf, "model/stroop_rf_model.pkl")
print("🎉 Model trained and saved to `model/stroop_rf_model.pkl`!")


              precision    recall  f1-score   support

  distracted       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

🎉 Model trained and saved to `model/stroop_rf_model.pkl`!
