# Generate Experiment Stimuli

This notebook generates the experimental design grid, samples condition-balanced stimuli per participant, and validates the distribution of conditions through summary plots.

In [20]:
# --- Imports ------------------------------------------------------------------

from itertools import product

import numpy as np
import pandas as pd


## Generate all possible conditions

In [53]:
anchors = [25, 50]
nearAnchors = [21, 22, 23, 24, 26, 27, 28, 29, 46, 47, 48, 49]
startPositions = [0, 25, 50, 75]
endPositions = [25, 50, 75, 100]

values = []
alignmentCategories = []
alignmentDistances = []
alignmentTypes = []
valueAlignments = []
anchorCategories = []
anchorDistances = []
round5Distances = []
positions = []

for selectedPart in range(15, 56):
    for startPosition in range(0, 100):
        endPosition = startPosition + selectedPart # calculate end position

        # remove impossible positions
        if endPosition > 100:
            continue

        # calculate minimum distance from alignment for both ends
        startDists = [startPosition, abs(startPosition - 25), abs(startPosition - 50), abs(startPosition - 75)]
        endDists = [abs(endPosition - 25), abs(endPosition - 50), abs(endPosition - 75), abs(endPosition - 100)]
        
        startAlignment = np.min(startDists)
        endAlignment = np.min(endDists)

        # determine which end is closer to alignment
        if startAlignment < endAlignment:
            alignmentSide = "start"
            alignmentPosition = startPositions[np.argmin(startDists)]
        elif startAlignment > endAlignment:
            alignmentSide = "end"
            alignmentPosition = endPositions[np.argmin(endDists)]
        else:
            alignmentSide = "both ends"
            startAligned = np.argmin(startDists)
            endAligned = np.argmin(endDists)
            if startAligned < endAligned:
                alignmentPosition = f"{startPositions[startAligned]} and {endPositions[endAligned]}"
            else:
                alignmentPosition = f"{startPositions[startAligned]} and {endPositions[endAligned]}"

        # determine the alignment category
        alignmentDistance = min(startAlignment, endAlignment)
        if alignmentDistance == 0:
            alignmentCategory = "aligned"
        elif alignmentDistance < 5:
            alignmentCategory = "near-align"
        else: 
            alignmentCategory = "far-align"

        # determine anchor category
        anchorDistance = min(abs(selectedPart - 25), abs(selectedPart - 50))
        if selectedPart in anchors:
            anchorCategory = "anchor"
        elif selectedPart in nearAnchors:
            anchorCategory = "near-anchor"
        else:
            anchorCategory = "far-anchor"

        # determine distance to the nearest 5
        round5Distance = abs(selectedPart - round(selectedPart / 5) * 5)

        # describe every possible selectedPart/alignment condition in the range
        valueAlignment = f"Value {selectedPart} with {alignmentSide} {alignmentDistance} off alignment with {alignmentPosition}."
        if valueAlignment not in valueAlignments:
            valueAlignments.append(valueAlignment)
            values.append(selectedPart)
            alignmentCategories.append(alignmentCategory)
            anchorCategories.append(anchorCategory)
            alignmentDistances.append(alignmentDistance)
            anchorDistances.append(anchorDistance)
            round5Distances.append(round5Distance)
            alignmentTypes.append(alignmentPosition)
            positions.append(startPosition)

allConditions = pd.DataFrame({
    # "Description": valueAlignments,
    "selectedPart": values,
    "alignmentCategory": alignmentCategories,
    "anchorCategory": anchorCategories,
    "anchorDistance": anchorDistances,
    "alignmentDistance": alignmentDistances,
    "round5Distance": round5Distances,
    "alignmentType": alignmentTypes,
    "alignmentPosition": positions
})
allConditions

Unnamed: 0,selectedPart,alignmentCategory,anchorCategory,anchorDistance,alignmentDistance,round5Distance,alignmentType,alignmentPosition
0,15,aligned,far-anchor,10,0,0,0,0
1,15,near-align,far-anchor,10,1,0,0,1
2,15,near-align,far-anchor,10,2,0,0,2
3,15,near-align,far-anchor,10,3,0,0,3
4,15,near-align,far-anchor,10,4,0,0,4
...,...,...,...,...,...,...,...,...
2297,55,near-align,far-anchor,5,4,0,100,41
2298,55,near-align,far-anchor,5,3,0,100,42
2299,55,near-align,far-anchor,5,2,0,100,43
2300,55,near-align,far-anchor,5,1,0,100,44


In [66]:
from collections import defaultdict

anchorCounts = {
    "anchor": 12,
    "near-anchor": 36,
    "far-anchor": 48,
}

alignmentCounts = {
    "aligned": 24,
    "near-align": 36,
    "far-align": 36,
}

chartTypes = ["line", "pie"]
participantIDs = range(1, 61)
rng = np.random.default_rng(123)

participantStimuli = []

for pid in participantIDs:
    participant_df = []

    for anchor in anchorCounts.keys():
        total = anchorCounts[anchor]
        per_chart = total // 2

        # split per chart into alignment categories
        n_aligned = int(np.ceil(per_chart * 0.25))
        n_remaining = per_chart - n_aligned
        n_near = n_remaining // 2
        n_far = per_chart - n_aligned - n_near

        alignment_alloc = {
            "aligned": n_aligned,
            "near-align": n_near,
            "far-align": n_far,
        }

        for chart in chartTypes:
            for align, n_to_sample in alignment_alloc.items():
                if anchor == "anchor":
                    n_each = n_to_sample // 2
                    leftover = n_to_sample - 2 * n_each
                    give_extra_to_25 = (pid % 2 == 1)

                    for anchor_value in [25, 50]:
                        n_this = n_each + (1 if (anchor_value == 25) == give_extra_to_25 and leftover > 0 else 0)

                        pool = allConditions[
                            (allConditions["anchorCategory"] == anchor)
                            & (allConditions["alignmentCategory"] == align)
                            & (allConditions["selectedPart"] == anchor_value)
                        ]

                        if len(pool) < n_this:
                            print(f"Warning: Not enough samples for PID {pid}, {anchor}, {align}, anchor {anchor_value}")

                        sampled = pool.sample(n=n_this, replace=True, random_state=rng.integers(1e9))
                        sampled = sampled.assign(chartType=chart)
                        participant_df.append(sampled)

                else:
                    pool = allConditions[
                        (allConditions["anchorCategory"] == anchor)
                        & (allConditions["alignmentCategory"] == align)
                    ]
                    if len(pool) < n_to_sample:
                        print(f"Warning: Not enough samples for PID {pid}, {anchor}, {align}")

                    sampled = pool.sample(n=n_to_sample, replace=True, random_state=rng.integers(1e9))
                    sampled = sampled.assign(chartType=chart)
                    participant_df.append(sampled)

    df_pid = pd.concat(participant_df, ignore_index=True)
    df_pid["UID"] = pid
    participantStimuli.append(df_pid)

stimuli = pd.concat(participantStimuli, ignore_index=True)
stimuli["selectedLabel"] = rng.choice(list("ABCDEFG"), size=len(stimuli), replace=True)

In [67]:
labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

def samplePartition(n, target, rng):
    """Generate n positive integers summing to target from beta distribution."""
    for _ in range(100):  # retry if invalid
        raw = rng.beta(a=2, b=10, size=n)
        scaled = np.round(raw / raw.sum() * target).astype(int)
        diff = target - scaled.sum()
        if diff != 0:
            scaled[0] += diff
        if np.all(scaled > 0):
            return scaled
    raise ValueError("Failed to generate a valid partition.")

def generateParts(row, rng):
  selectedPart = int(row["selectedPart"])
  selectedLabel = row["selectedLabel"]
  alignmentPos = int(row["alignmentPosition"])

  otherLabels = [l for l in labels if l != selectedLabel]
  remainingSpace = 100 - selectedPart
  leftSpace = alignmentPos
  rightSpace = 100 - selectedPart - alignmentPos

  # Determine how many go left vs right
  leftCount = round(6 * (leftSpace / (leftSpace + rightSpace)))
  rightCount = 6 - leftCount

  # Adjust for edge cases
  if leftCount == 0 and leftSpace > 0:
    leftCount = 1
    rightCount = 5
  if rightCount == 0 and rightSpace > 0:
    rightCount = 1
    leftCount = 5

  # Sample values for each side
  leftVals = samplePartition(leftCount, leftSpace, rng) if leftCount > 0 else []
  rightVals = samplePartition(rightCount, rightSpace, rng) if rightCount > 0 else []

  # Combine full part values in left → selected → right order
  partValues = list(leftVals) + [selectedPart] + list(rightVals)

  # Assign labels and indices
  shuffledLabels = rng.permutation(otherLabels)
  leftLabels = shuffledLabels[:leftCount]
  rightLabels = shuffledLabels[leftCount:leftCount + rightCount]
  orderedLabels = list(leftLabels) + [selectedLabel] + list(rightLabels)

  partDict = {lab: val for lab, val in zip(orderedLabels, partValues)}
  indexDict = {f"{lab}_ind": i for i, lab in enumerate(orderedLabels)}

  # Fill in missing labels with NaN
  for lab in labels:
      partDict.setdefault(lab, np.nan)
      indexDict.setdefault(f"{lab}_ind", np.nan)

  return {**partDict, **indexDict}

parts = pd.DataFrame([generateParts(row, rng) for _, row in stimuli.iterrows()])
stimuli = pd.concat([stimuli.reset_index(drop=True), parts], axis=1)
stimuli

Unnamed: 0,selectedPart,alignmentCategory,anchorCategory,anchorDistance,alignmentDistance,round5Distance,alignmentType,alignmentPosition,chartType,UID,...,D,A,C,G_ind,B_ind,E_ind,F_ind,D_ind,A_ind,C_ind
0,25,aligned,anchor,0,0,0,0 and 25,0,line,1,...,12,4,11,0,1,2,3,4,5,6
1,50,aligned,anchor,0,0,0,25 and 75,25,line,1,...,50,12,7,6,2,1,4,3,0,5
2,25,near-align,anchor,0,4,0,0 and 25,4,line,1,...,4,4,25,6,2,3,4,5,0,1
3,50,near-align,anchor,0,2,0,0 and 50,2,line,1,...,6,2,5,4,1,3,6,2,0,5
4,25,far-align,anchor,0,7,0,75 and 100,68,line,1,...,4,24,7,5,2,1,0,4,3,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,30,far-align,far-anchor,5,5,0,25,30,pie,60,...,8,11,18,2,0,3,6,4,1,5
5756,33,far-align,far-anchor,8,5,2,50,55,pie,60,...,6,7,33,0,2,6,3,4,1,5
5757,33,far-align,far-anchor,8,5,2,75,37,pie,60,...,8,2,13,4,6,3,2,1,5,0
5758,55,far-align,far-anchor,5,6,0,100,39,pie,60,...,55,12,9,0,3,2,6,5,1,4


In [68]:
def checkAlignment(row):
    selectedLabel = row["selectedLabel"]
    selectedIndex = row[f"{selectedLabel}_ind"]
    alignmentPos = row["alignmentPosition"]
    
    values = np.array([row[label] for label in labels])
    indices = np.array([row[f"{label}_ind"] for label in labels])
    
    precedingSum = values[indices < selectedIndex].sum()
    return precedingSum == alignmentPos

# Apply to all rows
alignmentValid = stimuli.apply(checkAlignment, axis=1)
print("Invalid alignments:", (~alignmentValid).sum())

Invalid alignments: 0


In [69]:
stimuli = (stimuli.groupby("UID").apply(lambda df: df.sample(frac=1)))
stimuli.to_csv("data/stimuli.csv", index=False)

  stimuli = (stimuli.groupby("UID").apply(lambda df: df.sample(frac=1)))
