Path of dataset

In [None]:
# Path to the SWELL per-minute behavioral-features Excel workbook.
# NOTE(review): despite the `_df` suffix this name holds a path string here, not a
# DataFrame; the column-extraction cell rebinds `swell_df` to the loaded frame.
swell_df = '/content/drive/MyDrive/0_SWELL/Behavioral-features - per minute.xlsx'


Column extraction from the dataset

In [None]:
import pandas as pd

# Step 1: read the per-minute feature workbook into a DataFrame.
file_path = "/content/drive/MyDrive/0_SWELL/Behavioral-features - per minute.xlsx"
swell_df = pd.read_excel(file_path)

# Step 2: the column groups of interest (identifiers, features, regression targets).
client_info = ['PP', 'Blok', 'Condition', 'timestamp']
physio_features = ['HR', 'RMSSD', 'SCL', 'Svalence', 'Sneutral']  # five feature columns
regression_labels = ['Stress', 'MentalEffort']

# Step 3: keep the requested columns that are actually present in the sheet,
# in the requested order. Columns missing from the sheet are skipped silently.
requested_cols = client_info + physio_features + regression_labels
sheet_cols = set(swell_df.columns)
available_cols = [name for name in requested_cols if name in sheet_cols]

# Step 4: restrict to those columns and discard every row with a missing value.
final_df = swell_df.loc[:, available_cols].dropna()

# Step 5: report the resulting shape and preview the first rows.
print(f"Final DataFrame shape: {final_df.shape}")
display(final_df.head())

# Persist the reduced table so later work can reuse it without the workbook.
final_df.to_csv("/content/drive/MyDrive/0_SWELL/final_features.csv", index=False)


Final DataFrame shape: (1192, 11)


Unnamed: 0,PP,Blok,Condition,timestamp,HR,RMSSD,SCL,Svalence,Sneutral,Stress,MentalEffort
7,PP1,1,N,20120918T132700000,74.0,0.037915,67.145289,0.017913,0.889371,4.1,5.1
8,PP1,1,N,20120918T132800000,65.0,0.066091,67.248375,0.045847,0.879852,4.1,5.1
9,PP1,1,N,20120918T132900000,67.0,0.045529,65.998403,0.019678,0.91307,4.1,5.1
10,PP1,1,N,20120918T133000000,70.0,0.042963,66.27938,0.041101,0.802018,4.1,5.1
11,PP1,1,N,20120918T133100000,69.0,0.05259,67.702527,0.05202,0.865314,4.1,5.1


Trial preprocessing


In [None]:
import pandas as pd
import numpy as np

# Trial preprocessing: aggregate the per-minute SWELL rows into fixed-length time
# windows per participant, z-score the window features per participant, and save
# the result as CSV.

# Step 1: load the SWELL per-minute workbook.
file_path = "/content/drive/MyDrive/0_SWELL/Behavioral-features - per minute.xlsx"
df = pd.read_excel(file_path)

# Step 2: keep identifier, feature and label columns only.
client_info = ['PP', 'Blok', 'Condition', 'timestamp']
physio_features = ['HR', 'RMSSD', 'SCL', 'Svalence', 'Sneutral']  # five feature columns
regression_labels = ['Stress', 'MentalEffort']

df = df[client_info + physio_features + regression_labels].copy()

# Step 3: parse timestamps written like 20120918T132700000. Values that do not match
# the format become NaT (errors='coerce') and those rows are dropped.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%dT%H%M%S%f', errors='coerce')
df = (
    df.dropna(subset=['timestamp'])
      .sort_values(by=['PP', 'timestamp'])
      .reset_index(drop=True)
)

# Step 4: window length in seconds (300 s = 5 minutes).
window_size = 300  # seconds

# Window index per participant: whole windows elapsed since that participant's
# earliest timestamp.
df['window_id'] = df.groupby('PP')['timestamp'].transform(
    lambda x: ((x - x.min()).dt.total_seconds() // window_size).astype(int)
)

# Step 5: named aggregations - mean and std for every feature, mean for every label.
agg_dict = {}
for f in physio_features:
    agg_dict[f'{f}_mean'] = (f, 'mean')
    agg_dict[f'{f}_std'] = (f, 'std')
for lbl in regression_labels:
    agg_dict[lbl] = (lbl, 'mean')

# Blok and Condition are group keys too, so a time window whose rows span two
# blocks/conditions yields one output row per (Blok, Condition) combination.
windowed_df = (
    df.groupby(['PP', 'Blok', 'Condition', 'window_id'])
      .agg(**agg_dict)
      .reset_index()
)

# Step 6: rename columns to the target schema.
windowed_df = windowed_df.rename(columns={
    'PP': 'person',
    'Blok': 'session',
    'Stress': 'physicalFatigueScore',
    'MentalEffort': 'mentalFatigueScore'
})

# Step 7: window start = participant's first timestamp + window_id * window length.
# Computed column-wise (map + timedelta arithmetic) rather than with a row-wise
# DataFrame.apply, which is slower and awkward on an empty frame.
person_start_times = df.groupby('PP')['timestamp'].min().to_dict()
windowed_df['window_start'] = (
    windowed_df['person'].map(person_start_times)
    + pd.to_timedelta(windowed_df['window_id'] * window_size, unit='s')
)

# Step 8: z-score every *_mean / *_std column within each participant. The small
# epsilon keeps the division finite when a participant's values have zero spread.
# NOTE(review): missing values are not imputed in this cell, so NaNs are carried
# into the saved file.
mean_std_cols = [c for c in windowed_df.columns if '_mean' in c or '_std' in c]
for c in mean_std_cols:
    windowed_df[c] = windowed_df.groupby('person')[c].transform(
        lambda x: (x - x.mean()) / (x.std() + 1e-8)
    )

# Step 9: final column order (names absent from the frame are skipped).
final_cols = [
    'HR_mean', 'HR_std',
    'RMSSD_mean', 'RMSSD_std',
    'SCL_mean', 'SCL_std',
    'Svalence_mean', 'Svalence_std',
    'Sneutral_mean', 'Sneutral_std',
    'physicalFatigueScore', 'mentalFatigueScore',
    'window_start', 'person', 'session', 'Condition'
]
windowed_df = windowed_df[[c for c in final_cols if c in windowed_df.columns]]

# Step 10: save and report.
out_path = "/content/drive/MyDrive/0_SWELL/final_SWELL_like_main_dataset.csv"
windowed_df.to_csv(out_path, index=False)

print(f"‚úÖ Final dataset shape: {windowed_df.shape}")
print(f"‚úÖ Saved to: {out_path}")
display(windowed_df.head(10))


‚úÖ Final dataset shape: (747, 16)
‚úÖ Saved to: /content/drive/MyDrive/0_SWELL/final_SWELL_like_main_dataset.csv


Unnamed: 0,HR_mean,HR_std,RMSSD_mean,RMSSD_std,SCL_mean,SCL_std,Svalence_mean,Svalence_std,Sneutral_mean,Sneutral_std,physicalFatigueScore,mentalFatigueScore,window_start,person,session,Condition
0,1.554084,0.954296,-0.533203,-0.492759,0.19876,-0.421939,0.045837,-0.537653,0.078081,-0.447448,4.1,5.1,2012-09-18 13:26:00,PP1,1,N
1,1.119712,-0.417004,-0.553597,-0.645356,0.227152,-0.696671,0.228401,-0.070308,0.527549,-0.688766,4.1,5.1,2012-09-18 13:31:00,PP1,1,N
2,0.612946,-0.515881,-0.468567,-0.819217,0.127384,0.593101,-0.122596,-0.241674,1.255767,-0.777495,4.1,5.1,2012-09-18 13:36:00,PP1,1,N
3,1.264503,-1.010442,-0.760729,-0.699981,-0.510607,-0.993773,-0.344095,-0.516158,0.661296,-0.522002,4.1,5.1,2012-09-18 13:41:00,PP1,1,N
4,0.974922,-0.072931,-0.07816,1.385861,-0.740251,-0.555314,-0.973629,0.110691,0.830384,-1.197584,4.1,5.1,2012-09-18 13:46:00,PP1,1,N
5,1.91606,-0.2363,-0.040877,0.542928,-0.808217,-0.577468,-1.098963,-0.794195,1.183204,-0.664483,4.1,5.1,2012-09-18 13:51:00,PP1,1,N
6,0.974922,-0.393267,-0.0712,0.899333,-1.363271,-0.458949,0.867523,4.435025,-0.141507,1.279959,4.1,5.1,2012-09-18 13:56:00,PP1,1,N
7,0.685341,0.86458,-0.694874,-0.295207,-1.444497,-0.456043,-0.659019,-0.412684,1.320968,-0.254253,4.1,5.1,2012-09-18 14:01:00,PP1,1,N
8,-0.436784,-1.342792,0.661669,2.926949,-1.350057,-0.806051,-1.012305,-0.836289,1.304311,-0.292016,4.1,5.1,2012-09-18 14:06:00,PP1,1,N
9,-1.160736,-0.465618,-0.391776,-0.789989,2.494023,0.270273,0.13042,0.698839,0.96415,0.004384,,,2012-09-18 13:16:00,PP1,1,R


In [None]:
# Step 7.5: impute missing values in the windowed feature table `windowed_df`.
# NOTE(review): this step was titled "before normalization", but the trial cell that
# builds `windowed_df` already z-scores the feature columns; if this cell runs after
# it, imputation operates on normalized values - confirm the intended order.

# Missing-value counts per column before any filling.
print("üß© Missing values summary before imputation:")
print(windowed_df.isna().sum())

# Put rows in chronological order within each person/session before interpolating.
# Linear interpolation works on row position, and rows coming out of a groupby/agg
# are ordered by the group keys (including Condition), which is not time order
# whenever the condition labels do not sort in the order they occurred.
windowed_df = (
    windowed_df
    .sort_values(by=['person', 'session', 'window_start'], kind='stable')
    .reset_index(drop=True)
)

# Feature columns are the ones named *_mean / *_std.
numeric_cols = [c for c in windowed_df.columns if any(k in c for k in ['_mean', '_std'])]

for col in numeric_cols:
    # Fill gaps per person by linear interpolation between neighbouring windows;
    # limit_direction='both' also fills leading and trailing gaps.
    windowed_df[col] = windowed_df.groupby('person')[col].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )
    # Fallback: fill anything still missing with that person's mean. If a person has
    # no value at all for the column, the mean is NaN and the rows are dropped below.
    windowed_df[col] = windowed_df.groupby('person')[col].transform(
        lambda x: x.fillna(x.mean())
    )

# Missing regression labels are replaced by the participant's mean label.
# NOTE(review): these are imputed targets, not observed scores.
for lbl in ['physicalFatigueScore', 'mentalFatigueScore']:
    if lbl in windowed_df.columns:
        windowed_df[lbl] = windowed_df.groupby('person')[lbl].transform(
            lambda x: x.fillna(x.mean())
        )

# Drop every row that still contains a missing value in any column.
windowed_df = windowed_df.dropna()

# Confirm nothing is missing any more.
print("\n‚úÖ Missing values after cleaning:")
print(windowed_df.isna().sum())

# Overwrite the saved dataset with the cleaned table.
out_path = "/content/drive/MyDrive/0_SWELL/final_SWELL_like_main_dataset.csv"
windowed_df.to_csv(out_path, index=False)

üß© Missing values summary before imputation:
HR_mean                 301
HR_std                  393
RMSSD_mean              301
RMSSD_std               393
SCL_mean                118
SCL_std                 196
Svalence_mean            34
Svalence_std            106
Sneutral_mean            34
Sneutral_std            106
physicalFatigueScore    150
mentalFatigueScore      150
window_start              0
person                    0
session                   0
Condition                 0
dtype: int64

‚úÖ Missing values after cleaning:
HR_mean                 0
HR_std                  0
RMSSD_mean              0
RMSSD_std               0
SCL_mean                0
SCL_std                 0
Svalence_mean           0
Svalence_std            0
Sneutral_mean           0
Sneutral_std            0
physicalFatigueScore    0
mentalFatigueScore      0
window_start            0
person                  0
session                 0
Condition               0
dtype: int64


In [None]:
# Order the windowed table chronologically within each person and session:
#   1. coerce window_start to datetime (unparseable values become NaT),
#   2. sort by person, then session, then window_start,
#   3. renumber the index from zero.
windowed_df = (
    windowed_df
    .assign(window_start=pd.to_datetime(windowed_df['window_start'], errors='coerce'))
    .sort_values(by=['person', 'session', 'window_start'])
    .reset_index(drop=True)
)

# Quick look at the ordering keys of the first ten rows.
print(windowed_df[['person', 'session', 'window_start']].head(10))


  person  session        window_start
0    PP1        1 2012-09-18 13:16:00
1    PP1        1 2012-09-18 13:21:00
2    PP1        1 2012-09-18 13:26:00
3    PP1        1 2012-09-18 13:31:00
4    PP1        1 2012-09-18 13:36:00
5    PP1        1 2012-09-18 13:41:00
6    PP1        1 2012-09-18 13:46:00
7    PP1        1 2012-09-18 13:51:00
8    PP1        1 2012-09-18 13:56:00
9    PP1        1 2012-09-18 14:01:00


THE FULL CODE TO PREPROCESS

In [None]:
import pandas as pd
import numpy as np

# Full preprocessing pipeline: load the per-minute SWELL workbook, aggregate rows
# into windows of `window_size` consecutive rows per participant and block, impute
# missing values, z-score the features per participant, and save the result as CSV.

# Step 1: load the SWELL per-minute workbook.
file_path = "/content/drive/MyDrive/0_SWELL/Behavioral-features - per minute.xlsx"
df = pd.read_excel(file_path)

# Step 2: keep identifier, feature and label columns only.
client_info = ['PP', 'Blok', 'Condition', 'timestamp']
physio_features = ['HR', 'RMSSD', 'SCL', 'Svalence', 'Sneutral']
regression_labels = ['Stress', 'MentalEffort']

df = df[client_info + physio_features + regression_labels].copy()

# Step 3: parse timestamps written like 20120918T131600000. Values that do not match
# the format become NaT (errors='coerce') and those rows are dropped.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%dT%H%M%S%f', errors='coerce')
df = df.dropna(subset=['timestamp'])

# Chronological order within each participant and block.
df = df.sort_values(by=['PP', 'Blok', 'timestamp']).reset_index(drop=True)

# Step 4: window index from row position - every `window_size` consecutive rows of a
# (PP, Blok) group form one window.
# NOTE(review): this equals a 5-minute window only if there is exactly one row per
# minute with no gaps - TODO confirm against the data.
window_size = 5  # rows per window
df['row_idx'] = df.groupby(['PP', 'Blok']).cumcount()
df['window_id'] = (df['row_idx'] // window_size).astype(int)

# Step 5: named aggregations - mean and std for every feature, mean for every label.
agg_dict = {}
for f in physio_features:
    agg_dict[f'{f}_mean'] = (f, 'mean')
    agg_dict[f'{f}_std'] = (f, 'std')
for lbl in regression_labels:
    agg_dict[lbl] = (lbl, 'mean')

# Condition is a group key while window_id is assigned per (PP, Blok), so a window
# whose rows span two conditions is split into one output row per condition.
windowed_df = (
    df.groupby(['PP', 'Blok', 'Condition', 'window_id'])
      .agg(**agg_dict)
      .reset_index()
)

# Step 6: rename columns to the target schema (after aggregation).
windowed_df = windowed_df.rename(columns={
    'PP': 'person',
    'Blok': 'session',
    'Stress': 'physicalFatigueScore',
    'MentalEffort': 'mentalFatigueScore'
})

# Step 7: window_start = earliest timestamp among the rows of each window.
first_timestamps = (
    df.groupby(['PP', 'Blok', 'Condition', 'window_id'])['timestamp']
      .min()
      .reset_index(name='window_start')
)

# Align key names with windowed_df so the two tables can be merged.
first_timestamps = first_timestamps.rename(columns={'PP': 'person', 'Blok': 'session'})

windowed_df = pd.merge(windowed_df, first_timestamps,
                       on=['person', 'session', 'Condition', 'window_id'], how='left')

# Step 7.25: put rows in chronological order BEFORE imputing. The aggregation above
# orders rows by (person, session, Condition, window_id); linear interpolation works
# on row position, so without this sort it would interpolate between windows that
# are adjacent alphabetically by Condition rather than adjacent in time.
windowed_df['window_start'] = pd.to_datetime(windowed_df['window_start'], errors='coerce')
windowed_df = (
    windowed_df
    .sort_values(by=['person', 'session', 'window_start'], kind='stable')
    .reset_index(drop=True)
)

# Step 7.5: handle NaNs before normalization.
print("üß© Missing values summary before imputation:")
print(windowed_df.isna().sum())

# Feature columns are the ones named *_mean / *_std.
numeric_cols = [c for c in windowed_df.columns if any(k in c for k in ['_mean', '_std'])]

for col in numeric_cols:
    # Fill gaps per person by linear interpolation between neighbouring windows;
    # limit_direction='both' also fills leading and trailing gaps.
    # NOTE(review): interpolation is grouped by person only, so it can bridge
    # across sessions - confirm this is intended.
    windowed_df[col] = windowed_df.groupby('person')[col].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )
    # Fallback: fill anything still missing with that person's mean. If a person has
    # no value at all for the column, the mean is NaN and the rows are dropped below.
    windowed_df[col] = windowed_df.groupby('person')[col].transform(
        lambda x: x.fillna(x.mean())
    )

# Missing labels are replaced by the participant's mean label.
# NOTE(review): these are imputed targets, not observed scores.
for lbl in ['physicalFatigueScore', 'mentalFatigueScore']:
    if lbl in windowed_df.columns:
        windowed_df[lbl] = windowed_df.groupby('person')[lbl].transform(
            lambda x: x.fillna(x.mean())
        )

# Drop every row that still contains a missing value; renumber so the index is 0..n-1.
windowed_df = windowed_df.dropna().reset_index(drop=True)

print("\n‚úÖ Missing values after cleaning:")
print(windowed_df.isna().sum())

# Step 8: z-score every *_mean / *_std column within each participant. The small
# epsilon keeps the division finite when a participant's values have zero spread.
mean_std_cols = [c for c in windowed_df.columns if '_mean' in c or '_std' in c]
for c in mean_std_cols:
    windowed_df[c] = windowed_df.groupby('person')[c].transform(
        lambda x: (x - x.mean()) / (x.std() + 1e-8)
    )

# Step 9: final column order (names absent from the frame are skipped; window_id is
# intentionally not part of the output).
final_cols = [
    'HR_mean', 'HR_std',
    'RMSSD_mean', 'RMSSD_std',
    'SCL_mean', 'SCL_std',
    'Svalence_mean', 'Svalence_std',
    'Sneutral_mean', 'Sneutral_std',
    'physicalFatigueScore', 'mentalFatigueScore',
    'window_start', 'person', 'session', 'Condition'
]
windowed_df = windowed_df[[c for c in final_cols if c in windowed_df.columns]]

# Step 10: save and report.
out_path = "/content/drive/MyDrive/0_SWELL/final_SWELL_like_main_dataset.csv"
windowed_df.to_csv(out_path, index=False)

print(f"\n‚úÖ Final dataset shape: {windowed_df.shape}")
print(f"‚úÖ Saved to: {out_path}")

# Step 11: preview the first rows.
display(windowed_df.head(10))


üß© Missing values summary before imputation:
person                    0
session                   0
Condition                 0
window_id                 0
HR_mean                 295
HR_std                  383
RMSSD_mean              295
RMSSD_std               383
SCL_mean                111
SCL_std                 196
Svalence_mean            35
Svalence_std            108
Sneutral_mean            35
Sneutral_std            108
physicalFatigueScore    150
mentalFatigueScore      150
window_start              0
dtype: int64

‚úÖ Missing values after cleaning:
person                  0
session                 0
Condition               0
window_id               0
HR_mean                 0
HR_std                  0
RMSSD_mean              0
RMSSD_std               0
SCL_mean                0
SCL_std                 0
Svalence_mean           0
Svalence_std            0
Sneutral_mean           0
Sneutral_std            0
physicalFatigueScore    0
mentalFatigueScore      0
window_start

Unnamed: 0,HR_mean,HR_std,RMSSD_mean,RMSSD_std,SCL_mean,SCL_std,Svalence_mean,Svalence_std,Sneutral_mean,Sneutral_std,physicalFatigueScore,mentalFatigueScore,window_start,person,session,Condition
0,-1.096258,-0.460621,-0.330304,-0.783858,2.489092,0.254887,0.200775,0.575069,0.737636,0.133664,4.325,6.9875,2012-09-18 13:16:00,PP1,1,R
1,0.203041,-0.547164,-0.85574,-0.493482,2.323966,1.17039,1.492926,-0.100654,-0.278226,0.847954,4.325,6.9875,2012-09-18 13:21:00,PP1,1,R
2,1.595147,1.85944,-0.496009,-0.455184,0.231712,-0.43766,-0.002658,-0.537403,0.428042,-1.2033,4.1,5.1,2012-09-18 13:26:00,PP1,1,N
3,1.242481,-0.398754,-0.5538,-0.66081,0.180068,-0.659972,0.555226,-0.675704,-0.302684,-0.702023,4.1,5.1,2012-09-18 13:30:00,PP1,1,N
4,0.797007,-0.088575,-0.492768,-0.750968,0.252525,-0.103103,-0.240623,-0.20842,1.202266,-0.633446,4.1,5.1,2012-09-18 13:35:00,PP1,1,N
5,1.390972,-1.15398,-0.76083,-0.605476,-0.495265,-0.974953,-0.236892,-0.614361,0.410948,-0.455448,4.1,5.1,2012-09-18 13:40:00,PP1,1,N
6,1.242481,0.136559,-0.180618,1.198356,-0.649625,-0.746862,-0.850971,-0.176548,0.627853,-1.109,4.1,5.1,2012-09-18 13:45:00,PP1,1,N
7,1.780762,0.067948,0.270457,0.178594,-0.817521,-0.607493,-1.323774,-0.659495,1.207815,-1.16801,4.1,5.1,2012-09-18 13:50:00,PP1,1,N
8,1.038305,-0.101744,0.406196,0.672538,-1.274616,-0.091629,0.82291,4.37789,-0.392129,1.444878,4.1,5.1,2012-09-18 13:55:00,PP1,1,N
9,1.093989,0.883856,-0.963255,-0.787567,-1.433418,-0.510675,-0.474687,-0.865763,0.852469,-0.081253,4.1,5.1,2012-09-18 14:00:00,PP1,1,N


Unique values in the client-information columns

In [None]:
import pandas as pd

# Reload the saved windowed dataset from disk.
df = pd.read_csv("/content/drive/MyDrive/0_SWELL/final_SWELL_like_main_dataset.csv")

# For each client-information column, print a heading followed by the distinct
# values in order of first appearance.
for heading, column in (
    ("üßç Unique Persons:", "person"),
    ("\nüß© Unique Sessions:", "session"),
    ("\nüéØ Unique Conditions:", "Condition"),
):
    print(heading)
    print(df[column].unique())


üßç Unique Persons:
['PP1' 'PP10' 'PP12' 'PP13' 'PP14' 'PP15' 'PP16' 'PP17' 'PP18' 'PP19'
 'PP2' 'PP20' 'PP21' 'PP22' 'PP23' 'PP24' 'PP25' 'PP3' 'PP4' 'PP5' 'PP6'
 'PP7' 'PP9']

üß© Unique Sessions:
[1 2 3]

üéØ Unique Conditions:
['R' 'N' 'T' 'I']
