# MultiplEYE preprocessing

In [1]:
from pathlib import Path

data_dir = Path("../data/pilot-hr-1-zh")
stim_dir = Path("../data/stimuli_MultiplEYE_HR_CH_Zurich_1_2025")
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

## EDF to ASC

Use the `edf2asc` binary from the [EyeLink Developers Kit](https://www.sr-research.com/support/thread-13.html) to convert EDF files to ASC files.

To discuss:
- We probably can't distribute the binary due to licensing issues. (But we might be able to distribute a Docker image?)
- There is already an [issue](https://github.com/aeye-lab/pymovements/issues/509) to integrate this into `pymovements`.
- The `-input` option is unnecessary, but currently required by `parse_eyelink()` in `pymovements`.
- The `-ftime` option would make handling 2kHz ASC files easier, but `pymovements` doesn't support floating point timestamps yet (https://github.com/aeye-lab/pymovements/pull/739).

In [2]:
import subprocess

edf = data_dir / "ch1hr007.edf"
subprocess.run(["./edf2asc", edf, "-input", "-ftime", "-p", output_dir, "-y"])


EDF2ASC: EyeLink EDF file -> ASCII (text) file translator
EDF2ASC version 4.2.762.0 Linux   standalone Jul 20 2023 
(c)1995-2023 by SR Research, last modified Jul 20 2023

processing file ../data/pilot-hr-1-zh/ch1hr007.edf 
loadEvents = 1
| DATE: Wed Jun 19 17:49:30 2024                                              |
| TYPE: EDF_FILE BINARY EVENT SAMPLE TAGGED                                   |
| VERSION: EYELINK II 1                                                       |
| SOURCE: EYELINK CL                                                          |
| EYELINK II CL v6.14 Mar  6 2020 (EyeLink Portable Duo)                      |
| CAMERA: EyeLink USBCAM Version 1.01                                         |
| SERIAL NUMBER: CLU-DBE08                                                    |
| CAMERA_CONFIG: DBE08200.SCD                                                 |
| RECORDED BY libeyelink.py                                                   |

missing sample at 819712.500000 
missin

CompletedProcess(args=['./edf2asc', PosixPath('../data/pilot-hr-1-zh/ch1hr007.edf'), '-input', '-ftime', '-p', PosixPath('output'), '-y'], returncode=255)

## ASC to sample-level CSV

Convert the ASC files to CSV files (one for each page) where each row is a sample.

### Parse ASC file

In [3]:
import csv

import polars as pl
import pymovements as pm

asc = output_dir / "ch1hr007.asc"

data = pm.gaze.from_asc(
    asc,
    patterns=[
        r"start_recording_(?P<trial>(?:PRACTICE_)?trial_\d+)_(?P<screen>.+)",
        {"pattern": r"stop_recording_", "column": "trial", "value": None},
        {"pattern": r"stop_recording_", "column": "screen", "value": None},
        {
            "pattern": r"start_recording_(?:PRACTICE_)?trial_\d+_page_\d+",
            "column": "activity",
            "value": "reading",
        },
        {
            "pattern": r"start_recording_(?:PRACTICE_)?trial_\d+_question_\d+",
            "column": "activity",
            "value": "question",
        },
        {
            "pattern": r"start_recording_(?:PRACTICE_)?trial_\d+_(familiarity_rating_screen_\d+|subject_difficulty_screen)",
            "column": "activity",
            "value": "rating",
        },
        {"pattern": r"stop_recording_", "column": "activity", "value": None},
        {
            "pattern": r"start_recording_PRACTICE_trial_",
            "column": "practice",
            "value": True,
        },
        {
            "pattern": r"start_recording_trial_",
            "column": "practice",
            "value": False,
        },
        {"pattern": r"stop_recording_", "column": "practice", "value": None},
    ],
)
data.frame

  from .autonotebook import tqdm as notebook_tqdm


time,pupil,trial,screen,activity,practice,pixel
f64,f64,str,str,str,bool,list[f64]
709376.0,206.0,,,,,"[102.2, 101.4]"
709376.5,210.0,,,,,"[104.1, 99.8]"
709377.0,206.0,,,,,"[99.1, 106.5]"
709377.5,204.0,,,,,"[85.2, 103.3]"
709378.0,202.0,,,,,"[98.2, 106.7]"
709378.5,209.0,,,,,"[100.2, 91.7]"
709379.0,203.0,,,,,"[97.9, 96.4]"
709379.5,206.0,,,,,"[109.3, 103.4]"
709380.0,203.0,,,,,"[93.6, 105.4]"
709380.5,204.0,,,,,"[94.6, 103.7]"


### Map trial numbers to stimulus IDs

In [4]:
stimulus_ids = {}
with open(data_dir / "logfiles" / "completed_stimuli.csv") as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        stimulus_ids[f"trial_{i + 1}"] = row["stimulus_id"]

df = data.frame.with_columns(
    pl.col("trial").replace(stimulus_ids).alias("stimulus_id")
)
df

time,pupil,trial,screen,activity,practice,pixel,stimulus_id
f64,f64,str,str,str,bool,list[f64],str
709376.0,206.0,,,,,"[102.2, 101.4]",
709376.5,210.0,,,,,"[104.1, 99.8]",
709377.0,206.0,,,,,"[99.1, 106.5]",
709377.5,204.0,,,,,"[85.2, 103.3]",
709378.0,202.0,,,,,"[98.2, 106.7]",
709378.5,209.0,,,,,"[100.2, 91.7]",
709379.0,203.0,,,,,"[97.9, 96.4]",
709379.5,206.0,,,,,"[109.3, 103.4]",
709380.0,203.0,,,,,"[93.6, 105.4]",
709380.5,204.0,,,,,"[94.6, 103.7]",


### Write separate CSVs for each page

Convert [x, y] pixel column to separate pixel_x and piyel_y columns. This is necessary because polars does not support nested values when exporting CSV.

In [5]:
df = df.select(
    [
        pl.all().exclude("pixel"),
        pl.col("pixel").list.get(0).alias("pixel_x"),
        pl.col("pixel").list.get(1).alias("pixel_y"),
    ]
)
df

time,pupil,trial,screen,activity,practice,stimulus_id,pixel_x,pixel_y
f64,f64,str,str,str,bool,str,f64,f64
709376.0,206.0,,,,,,102.2,101.4
709376.5,210.0,,,,,,104.1,99.8
709377.0,206.0,,,,,,99.1,106.5
709377.5,204.0,,,,,,85.2,103.3
709378.0,202.0,,,,,,98.2,106.7
709378.5,209.0,,,,,,100.2,91.7
709379.0,203.0,,,,,,97.9,96.4
709379.5,206.0,,,,,,109.3,103.4
709380.0,203.0,,,,,,93.6,105.4
709380.5,204.0,,,,,,94.6,103.7


Split data into CSV files.

In [6]:
raw_dir = output_dir / "raw"
raw_dir.mkdir(exist_ok=True, parents=True)

stimulus_practice = df[["stimulus_id", "practice"]].unique()
for stimulus_id, practice in stimulus_practice.iter_rows():
    if stimulus_id is not None:
        screen_df = df.filter((pl.col("stimulus_id") == stimulus_id))
        screen_df = screen_df.select([
            pl.col("time"),
            pl.col("screen"),
            pl.col("pixel_x"),
            pl.col("pixel_y"),
            pl.col("pupil"),
        ])
        if practice:
            screen_df.write_csv(raw_dir / f"007__{stimulus_id}.csv")
        else:
            screen_df.write_csv(raw_dir / f"007__stimulus_{stimulus_id}.csv")

## ⬇️ Everything from this point on would be part of the published preprocessing pipeline ⬇️

## Dataset definition

In [7]:
from dataclasses import dataclass, field
import pymovements as pm


@dataclass
class Multipleye(pm.DatasetDefinition):
    name: str = "Multipleye"

    # TODO: Read this from a metadata file
    experiment: pm.Experiment = pm.Experiment(
        sampling_rate=2000,
        screen_width_px=1275,
        screen_height_px=916,
        screen_width_cm=37,
        screen_height_cm=28,
        distance_cm=60,
    )

    filename_format: str = r"{subject_id:d}__{stimulus_id}.csv"

    filename_format_dtypes = {
        "subject_id": int,
        "stimulus_id": str,
        "screen": str,
    }

    trial_columns: list[str] = field(default_factory=lambda: ["stimulus_id", "screen"])

    time_column: str = "time"

    time_unit: str = "ms"

    pixel_columns: list[str] = field(default_factory=lambda: ["pixel_x", "pixel_y"])


dataset = pm.Dataset(Multipleye, "output")
dataset.load()

Multipleye(name='Multipleye', mirrors=(), resources=(), experiment=<pymovements.gaze.experiment.Experiment object at 0x7fbb790d2b50>, filename_format='{subject_id:d}__{stimulus_id}.csv', filename_format_dtypes={}, custom_read_kwargs={}, column_map={}, trial_columns=['stimulus_id', 'screen'], time_column='time', time_unit='ms', pixel_columns=['pixel_x', 'pixel_y'], position_columns=None, velocity_columns=None, acceleration_columns=None, distance_column=None)
{subject_id:d}__{stimulus_id}.csv
(?P<subject_id>[0-9]+)__(?P<stimulus_id>.+).csv
re.compile('(?P<subject_id>[0-9]+)__(?P<stimulus_id>.+).csv')
(?P<subject_id>[0-9]+)__(?P<stimulus_id>.+).csv
[{'subject_id': '007', 'stimulus_id': 'PRACTICE_trial_1', 'filepath': '007__PRACTICE_trial_1.csv'}, {'subject_id': '007', 'stimulus_id': 'PRACTICE_trial_2', 'filepath': '007__PRACTICE_trial_2.csv'}, {'subject_id': '007', 'stimulus_id': 'stimulus_1', 'filepath': '007__stimulus_1.csv'}, {'subject_id': '007', 'stimulus_id': 'stimulus_10', 'filepat

100%|██████████| 12/12 [00:01<00:00,  6.60it/s]


<pymovements.dataset.dataset.Dataset at 0x7fbbc1903510>

In [8]:
dataset.gaze

[Experiment(sampling_rate=2000, screen=Screen(width_px=1275, height_px=916, width_cm=37, height_cm=28, distance_cm=60, origin=upper left), eyetracker=None)
 shape: (99_803, 6)
 ┌──────────┬────────────┬───────┬────────────┬──────────────────┬───────────────────┐
 │ time     ┆ screen     ┆ pupil ┆ subject_id ┆ stimulus_id      ┆ pixel             │
 │ ---      ┆ ---        ┆ ---   ┆ ---        ┆ ---              ┆ ---               │
 │ f64      ┆ str        ┆ f64   ┆ str        ┆ str              ┆ list[str]         │
 ╞══════════╪════════════╪═══════╪════════════╪══════════════════╪═══════════════════╡
 │ 715945.0 ┆ page_1     ┆ 0.0   ┆ 007        ┆ PRACTICE_trial_1 ┆ [null, null]      │
 │ 715945.5 ┆ page_1     ┆ 0.0   ┆ 007        ┆ PRACTICE_trial_1 ┆ [null, null]      │
 │ 715946.0 ┆ page_1     ┆ 0.0   ┆ 007        ┆ PRACTICE_trial_1 ┆ [null, null]      │
 │ 715946.5 ┆ page_1     ┆ 0.0   ┆ 007        ┆ PRACTICE_trial_1 ┆ [null, null]      │
 │ …        ┆ …          ┆ …     ┆ …     