# Cognitive Load Detection Using fNIRS & Machine Learning

This notebook processes fNIRS brain signal data to detect a subject’s cognitive load level in real time and adapt educational content accordingly using both static templates and OpenAI GPT-4.

**Key Features:**
- Feature extraction from raw fNIRS chunks
- Classification of cognitive load levels (four classes, 0 = low through 3 = high)
- Real-time simulation
- Static and user-typed GPT prompt adaptation


## Setup

In [None]:


from ipywidgets import interact

# Number of consecutive rows grouped into one analysis window.
window_size = 300  # adjust based on actual sampling rate
# NOTE(review): `df` is not assigned anywhere above this line in the visible
# file — presumably this cell relies on a DataFrame loaded by an earlier run;
# confirm before executing the notebook top to bottom.
num_samples = df.shape[0]
# Number of complete windows; rows left over at the end are not counted.
n_windows = num_samples // window_size

## Feature Extraction ===

In [None]:
def extract_features(df):
    """Summarise every signal column of one window as four scalar features.

    For each column other than ``"label"`` the window is reduced to its mean,
    its standard deviation (NumPy default, ``ddof=0``), the slope of a
    least-squares straight-line fit against the sample index, and the
    trapezoidal area under the curve with unit sample spacing.

    Args:
        df: pandas DataFrame holding one window of samples, one column per
            signal plus an optional ``"label"`` column, which is ignored.

    Returns:
        dict mapping ``"<column>_mean"``, ``"<column>_std"``,
        ``"<column>_slope"`` and ``"<column>_auc"`` to scalar values.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever this NumPy provides so the function runs on both.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    # The x axis of the line fit is identical for every column: build it once.
    sample_index = np.arange(len(df))

    feature_row = {}
    for col in df.columns:
        if col == "label":
            continue
        signal = df[col].to_numpy()
        feature_row[f"{col}_mean"] = np.mean(signal)
        feature_row[f"{col}_std"] = np.std(signal)
        feature_row[f"{col}_slope"] = np.polyfit(sample_index, signal, 1)[0]
        feature_row[f"{col}_auc"] = integrate(signal)
    return feature_row


import os
import glob
import pandas as pd
import numpy as np

data_folder = "/content/drive/MyDrive/FNIRS TUFTS/Band pass filtered"
all_files = glob.glob(os.path.join(data_folder, "*.csv"))

## Feature Extraction ===

In [None]:
def extract_features(df):
    """Summarise every signal column of one window as four scalar features.

    For each column other than ``"label"`` the window is reduced to its mean,
    its standard deviation (NumPy default, ``ddof=0``), the slope of a
    least-squares straight-line fit against the sample index, and the
    trapezoidal area under the curve with unit sample spacing.

    Args:
        df: pandas DataFrame holding one window of samples, one column per
            signal plus an optional ``"label"`` column, which is ignored.

    Returns:
        dict mapping ``"<column>_mean"``, ``"<column>_std"``,
        ``"<column>_slope"`` and ``"<column>_auc"`` to scalar values.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever this NumPy provides so the function runs on both.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    # The x axis of the line fit is identical for every column: build it once.
    sample_index = np.arange(len(df))

    feature_row = {}
    for col in df.columns:
        if col == "label":
            continue
        signal = df[col].to_numpy()
        feature_row[f"{col}_mean"] = np.mean(signal)
        feature_row[f"{col}_std"] = np.std(signal)
        feature_row[f"{col}_slope"] = np.polyfit(sample_index, signal, 1)[0]
        feature_row[f"{col}_auc"] = integrate(signal)
    return feature_row

# Rows per window; each complete window becomes one training example.
window_size = 300
feature_list = []
label_list = []

for file in all_files:
    df = pd.read_csv(file)
    num_samples = df.shape[0]
    # Floor division drops the trailing partial window, so every slice taken
    # below has exactly `window_size` rows and needs no length check.
    n_windows = num_samples // window_size

    for i in range(n_windows):
        window_df = df.iloc[i * window_size:(i + 1) * window_size]
        features = extract_features(window_df)
        # The label on the window's first row stands for the whole window.
        label = int(window_df["label"].iloc[0])
        feature_list.append(features)
        label_list.append(label)

from sklearn.model_selection import train_test_split

X = pd.DataFrame(feature_list).select_dtypes(include="number")
y = pd.Series(label_list).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

from sklearn.ensemble import RandomForestClassifier

## Model Training ===

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

## Visualization ===

In [None]:
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Cognitive Load - Confusion Matrix")
plt.show()

from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

## Visualization ===

In [None]:
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Cognitive Load - Confusion Matrix")
plt.show()

importances = clf.feature_importances_
features = X.columns

# Plot top 10 features
sorted_idx = importances.argsort()[::-1][:10]

## Visualization ===

In [None]:
plt.figure(figsize=(8, 4))
sns.barplot(x=importances[sorted_idx], y=features[sorted_idx])
plt.title("🔍 Top 10 Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5)

## Model Training ===

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(clf, X, y, cv=skf)

print("Cross-val accuracy (5-fold):", scores.mean())

from imblearn.over_sampling import RandomOverSampler

# Oversample the training split only. Resampling the full X / y would place
# copies of the held-out rows in the training data, and every later score
# computed for `clf` on X_test would be measured on rows it has already seen.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

clf.fit(X_resampled, y_resampled)

from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

print("📊 SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

sns.heatmap(confusion_matrix(y_test, y_pred_svm), annot=True, fmt="d", cmap="Blues")
plt.title("🧠 SVM - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

print("Gradient Boosting Report:")
print(classification_report(y_test, y_pred_gb))

sns.heatmap(confusion_matrix(y_test, y_pred_gb), annot=True, fmt="d", cmap="Greens")
plt.title(" Gradient Boosting - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

from sklearn.metrics import accuracy_score

models = {
    "Random Forest": clf,
    "SVM": svm_model,
    "Gradient Boosting": gb_model
}

for name, model in models.items():
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(f"{name}: Accuracy = {acc:.2f}")

## Prompt Generator ===

In [None]:
def generate_prompt(load_level, topic="Gravity"):
    """Build the tutoring prompt that matches a predicted cognitive-load level.

    Levels 3, 2 and 1 each select their own wording, from the most simplified
    (3) to moderately detailed (1). Any other value, including 0, selects the
    most detailed wording.

    Args:
        load_level: predicted load class; compared with ``==`` against 3, 2
            and 1, in that order.
        topic: subject inserted into the prompt text.

    Returns:
        The prompt string for the given level and topic.
    """
    wording_by_level = (
        # High cognitive load -> simplify as far as possible.
        (3, "Explain {topic} using simple language, short sentences, and bullet points. Avoid technical terms."),
        (2, "Explain {topic} with clear examples and simple vocabulary. Use paragraph form but keep it light."),
        (1, "Give a moderately detailed explanation of {topic} suitable for a middle school student."),
    )
    for level, wording in wording_by_level:
        if load_level == level:
            return wording.format(topic=topic)

    # Level 0 (low load) and anything unrecognised -> challenge the learner.
    fallback = "Explain {topic} in a detailed way using high school-level vocabulary and relevant scientific terminology."
    return fallback.format(topic=topic)

# Assuming 'predicted_class' is the output from your classifier
predicted_class = clf.predict(X_test)[0]

## Prompt Generator ===

In [None]:
prompt = generate_prompt(predicted_class, topic="Gravity")
print(" Predicted Load:", predicted_class)
print(" ChatGPT Prompt:", prompt)


import openai

# Read the key from the environment rather than keeping it in the notebook,
# where it would be saved and shared together with the code.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

## GPT-4 API Call ===

In [None]:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are a helpful science tutor."},
        {"role": "user", "content": prompt}
    ]
)

print("💬 ChatGPT Response:\n", response.choices[0].message.content)

text_bank = {
    "gravity": {
        0: "Gravity is a fundamental force of nature described by general relativity, responsible for the curvature of spacetime around massive objects.",
        1: "Gravity is a natural force that pulls objects toward one another. It's what keeps the planets in orbit around the sun.",
        2: "Gravity pulls things down, like how an apple falls from a tree.",
        3: "Gravity makes things fall to the ground."
    },
    "photosynthesis": {
        0: "Photosynthesis is the biochemical process by which green plants, algae, and some bacteria convert light energy into chemical energy, producing glucose and oxygen from carbon dioxide and water.",
        1: "Photosynthesis is how plants use sunlight to make their own food. They take in carbon dioxide and water to create oxygen and sugar.",
        2: "Plants use sunlight to make food and give us oxygen.",
        3: "Plants eat sunlight and help us breathe."
    },
    "climate change": {
        0: "Climate change refers to long-term alterations in temperature and weather patterns, primarily caused by anthropogenic greenhouse gas emissions affecting the Earth’s energy balance.",
        1: "Climate change is when Earth's weather becomes different over time, mainly due to human activities like burning fossil fuels.",
        2: "The Earth is getting warmer because of pollution from cars and factories.",
        3: "The Earth is getting hotter."
    },
    "nervous system": {
        0: "The nervous system comprises the central and peripheral systems, transmitting electrical and chemical signals between the brain, spinal cord, and body to regulate physiological processes.",
        1: "The nervous system is how your brain and body communicate through nerves and signals.",
        2: "Your brain sends messages to your body to move and feel things.",
        3: "Your brain tells your body what to do."
    }
}

display_adapted_output(clf, X_test, "nervous system")

# The populated `text_bank` is defined in the cell above and is deliberately
# not re-assigned here. An abbreviated copy written with `{...}` values is
# still valid Python: `{...}` is a set containing Ellipsis, so running such a
# cell replaces each topic's level-to-text mapping with an unindexable set.

def display_adapted_output(model, X_sample, topic):
    """Print the predicted load level with its matching prompt and static text.

    The model predicts on all of ``X_sample``; only the first prediction is
    used to choose the prompt and the text.

    Args:
        model: fitted estimator exposing ``predict``.
        X_sample: feature rows accepted by ``model.predict``.
        topic: topic name passed to ``generate_prompt`` and
            ``get_adapted_text``.
    """
    predicted_class = model.predict(X_sample)[0]

    prompt = generate_prompt(predicted_class, topic)
    # NOTE(review): get_adapted_text is not defined in the visible part of
    # this file — confirm it is provided elsewhere before running.
    text = get_adapted_text(predicted_class, topic)

    print(f" Predicted Load Level: {predicted_class}")
    print(f" ChatGPT Prompt:\n{prompt}")
    print(f"\n Adapted Static Text:\n{text}")

# Usage
display_adapted_output(clf, X_test, "gravity")

@interact(level=[0, 1, 2, 3], topic=["photosynthesis", "gravity", "climate change","nervous system"])
def test_text(level, topic):
    """Widget callback: echo the chosen topic and level, then show the text.

    Args:
        level: load level selected in the widget.
        topic: topic selected in the widget.
    """
    banner = (
        f"Selected Topic: {topic}",
        f"Load Level: {level}",
        "Adapted Science Text:\n",
    )
    for line in banner:
        print(line)
    adapted = get_adapted_text(level, topic)
    print(adapted)

#Level Generalization

all_files = glob.glob(os.path.join(data_folder, "*.csv"))

subject_data = {}  # key: subject ID or filename, value: dataframe

for file in all_files:
    subject_id = os.path.basename(file).split(".")[0]  # get filename without extension
    df = pd.read_csv(file)
    subject_data[subject_id] = df

#Leave-One-Subject-Out Split
from sklearn.model_selection import LeaveOneOut

subjects = list(subject_data.keys())
loo = LeaveOneOut()

## Feature Extraction ===

In [None]:
def extract_features_and_labels(df, chunk_size=50):
    """Split a recording into fixed-size chunks and summarise each one.

    Every complete chunk of ``chunk_size`` consecutive rows yields one feature
    row (mean, standard deviation, minimum and maximum of each non-label
    column, computed with the pandas Series methods) and one label (the most
    frequent value of the chunk's ``"label"`` column). A trailing chunk with
    fewer than ``chunk_size`` rows is dropped.

    Args:
        df: DataFrame of samples that includes a ``"label"`` column.
        chunk_size: number of rows per chunk.

    Returns:
        Tuple ``(features, labels)``: a DataFrame with one row per complete
        chunk and columns ``<col>_mean``, ``<col>_std``, ``<col>_min``,
        ``<col>_max``, and a list with one label per chunk.
    """
    # The signal columns are the same for every chunk, so select them once.
    signal_cols = [col for col in df.columns if col != "label"]

    features = []
    labels = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        if len(chunk) < chunk_size:
            continue  # skip the incomplete trailing chunk

        feature_dict = {}
        for col in signal_cols:
            series = chunk[col]
            feature_dict[f"{col}_mean"] = series.mean()
            feature_dict[f"{col}_std"] = series.std()
            feature_dict[f"{col}_min"] = series.min()
            feature_dict[f"{col}_max"] = series.max()
        features.append(feature_dict)

        # One label per chunk: the most frequent label among its rows.
        labels.append(chunk["label"].mode()[0])

    return pd.DataFrame(features), labels

#LOSO loop

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

results = []

# Window each recording on its own, once. Windowing the concatenated training
# frame can produce a chunk that spans the end of one subject's recording and
# the start of the next, and it repeats the same extraction in every fold.
features_by_subject = {
    subject: extract_features_and_labels(frame)
    for subject, frame in subject_data.items()
}

for train_idx, test_idx in loo.split(subjects):
    train_subjects = [subjects[i] for i in train_idx]
    test_subject = subjects[test_idx[0]]

    # Raw frames of the current fold; later cells reuse the values left over
    # from the final iteration.
    train_df = pd.concat([subject_data[s] for s in train_subjects])
    test_df = subject_data[test_subject]

    # Assemble the fold from the per-subject features computed above.
    X_train = pd.concat(
        [features_by_subject[s][0] for s in train_subjects], ignore_index=True
    )
    y_train = [
        chunk_label
        for s in train_subjects
        for chunk_label in features_by_subject[s][1]
    ]
    X_test, y_test = features_by_subject[test_subject]

    # Train model (seeded like the other cells so folds are reproducible)
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out subject
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    results.append({
        "test_subject": test_subject,
        "accuracy": acc,
        "report": classification_report(y_test, y_pred, output_dict=True)
    })
    print(f"Tested on {test_subject} | Accuracy: {acc:.2f}")

import pandas as pd

results_df = pd.DataFrame(results)
print("Average Accuracy:", results_df["accuracy"].mean())

## Feature Extraction ===

In [None]:
def extract_features_and_labels(df, chunk_size=50):
    """Cut a recording into fixed-size chunks; return their features and labels.

    Each complete chunk of ``chunk_size`` consecutive rows is passed to
    ``extract_features`` and labelled with the most frequent value of its
    ``"label"`` column. A trailing chunk with fewer rows is dropped.

    Args:
        df: DataFrame of samples that includes a ``"label"`` column.
        chunk_size: number of rows per chunk (50 by default).

    Returns:
        Tuple ``(features, labels)``: a DataFrame with one row per complete
        chunk, holding whatever ``extract_features`` returns for it, and a
        list with one label per chunk.
    """
    features = []
    labels = []

    for start in range(0, len(df), chunk_size):
        # Slice once and reuse the result for the length check, the features
        # and the label.
        chunk = df.iloc[start:start + chunk_size]
        if len(chunk) != chunk_size:
            continue

        features.append(extract_features(chunk))
        labels.append(chunk["label"].mode()[0])

    return pd.DataFrame(features), labels

## Real-Time Pipeline ===

import os, glob
import pandas as pd

data_folder = "/content/drive/MyDrive/FNIRS TUFTS/Band pass filtered"
all_files = glob.glob(os.path.join(data_folder, "*.csv"))

subject_data = {}
for file in all_files:
    subject_id = os.path.basename(file).split(".")[0]
    df = pd.read_csv(file)
    subject_data[subject_id] = df

X_train, y_train = extract_features_and_labels(train_df)

from sklearn.ensemble import RandomForestClassifier

## Model Training ===

In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

import time

chunk_size = 50  # based on your sampling rate
test_chunks = [test_df.iloc[i:i+chunk_size] for i in range(0, len(test_df), chunk_size)]

# The topic is the same for every chunk, so it is set once before the loop.
topic = "gravity"

for chunk in test_chunks:
    # The last chunk may be shorter than chunk_size; skip it.
    if len(chunk) < chunk_size:
        continue

    # Classify this chunk from a single-row feature frame.
    features = extract_features(chunk)
    X = pd.DataFrame([features])
    predicted_class = clf.predict(X)[0]

    # Text adaptation
    prompt = generate_prompt(predicted_class, topic)
    # NOTE(review): get_adapted_text is not defined in the visible part of
    # this file — confirm it is provided elsewhere before running.
    static_text = get_adapted_text(predicted_class, topic)

    print(f" Predicted Load Level: {predicted_class}")
    print(f" GPT Prompt: {prompt}")
    print(f" Adapted Text:\n{static_text}")

    time.sleep(0.1)  # simulate live delay

## Feature Extraction ===

In [None]:
def extract_features_and_labels(df, chunk_size=50):
    """Cut a recording into fixed-size chunks; return their features and labels.

    Each complete chunk of ``chunk_size`` consecutive rows is passed to
    ``extract_features`` and labelled with the most frequent value of its
    ``"label"`` column. A trailing chunk with fewer rows is dropped.

    Args:
        df: DataFrame of samples that includes a ``"label"`` column.
        chunk_size: number of rows per chunk (50 by default).

    Returns:
        Tuple ``(features, labels)``: a DataFrame with one row per complete
        chunk, holding whatever ``extract_features`` returns for it, and a
        list with one label per chunk.
    """
    features, labels = [], []

    for start in range(0, len(df), chunk_size):
        # Slice once and reuse the result for the length check, the features
        # and the label.
        chunk = df.iloc[start:start + chunk_size]
        if len(chunk) != chunk_size:
            continue

        features.append(extract_features(chunk))
        labels.append(chunk["label"].mode()[0])

    return pd.DataFrame(features), labels

predictions = []
timepoints = []
true_labels = []  # optional, if your test data has labels

for i, chunk in enumerate(test_chunks):
    if len(chunk) < chunk_size:
        continue

    features = extract_features(chunk)
    X = pd.DataFrame([features])
    predicted_class = clf.predict(X)[0]

    predictions.append(predicted_class)
    timepoints.append(i)  # or use actual time if available

    if "label" in chunk.columns:
        true_labels.append(chunk["label"].mode()[0])

    # Your current adaptation block
    ...

import matplotlib.pyplot as plt

## Visualization ===

In [None]:
# Figure creation and all plot calls stay in one cell so they draw on the
# same figure.
plt.figure(figsize=(12, 4))
plt.plot(timepoints, predictions, label="Predicted Load", marker='o')

# Overlay the ground truth when labels were collected for the test chunks.
if true_labels:
    plt.plot(timepoints, true_labels, label="True Load", linestyle='--', alpha=0.6)

plt.xlabel("Time Window")
plt.ylabel("Cognitive Load Level")
plt.title(f"🧠 Cognitive Load Over Time – {test_subject}")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()