In [None]:
import re
from string import punctuation
from pathlib import Path

# Root folder of the transcripts, given relative to the working directory.
# The loader below treats each sub-directory as one condition and reads every
# *.txt file inside it as one conversation.
DATA_PATH = Path("data")

def clean_text(text: str) -> str:
    """Return ``text`` lowercased with all ``string.punctuation`` characters removed."""
    # A deletion-only translation table: every punctuation character maps to nothing.
    strip_table = str.maketrans("", "", punctuation)
    return text.lower().translate(strip_table)

def process_conversation(conv_text: str):
    """
    Process conversation text into two numeric speaker streams for CRQA.

    Every non-empty line that contains ":" is read as ``<speaker>: <utterance>``
    (split on the first colon); lines without a colon are ignored. The utterance
    is normalised with ``clean_text`` and split on whitespace. Each distinct word
    gets an integer ID (1, 2, 3, ...) in order of first appearance; the
    vocabulary is shared by both speakers.

    The two streams are aligned word by word: while one speaker talks, the
    other speaker's stream is padded with a sentinel (-1 in the first speaker's
    stream, -2 in the second speaker's). Both lists therefore have the same
    length, and a placeholder can never equal a word ID or the other
    placeholder.

    Args:
        conv_text: Raw transcript text, one utterance per line.

    Returns:
        dict mapping speaker name -> list of ints, keyed in order of first
        appearance. It has two entries for a regular dialogue, one entry when
        only a single speaker label was found, and is empty when no speaker
        line was found at all.

    Raises:
        ValueError: If a third distinct speaker label is encountered.
    """
    lines = [l.strip() for l in conv_text.split("\n") if l.strip()]
    if not lines:
        return {}

    # Placeholders written into the silent speaker's stream.
    first_pad, second_pad = -1, -2

    speakers = []          # speaker labels in order of first appearance
    word_map = {}          # word -> integer ID (IDs start at 1)
    s1_words, s2_words = [], []

    for line in lines:
        if ":" not in line:
            continue

        speaker, text = line.split(":", 1)
        speaker = speaker.strip()
        words = clean_text(text).split()

        # Register speakers in order of first appearance
        if speaker not in speakers:
            speakers.append(speaker)
        if len(speakers) > 2:
            raise ValueError(f"More than two speakers found: {speakers}")

        # A new word receives the next free ID; a known word reuses its ID.
        encoded = [word_map.setdefault(w, len(word_map) + 1) for w in words]

        # Build parallel numeric sequences. `speakers` holds at most two
        # labels here, so anything that is not the first speaker is the second.
        if speaker == speakers[0]:
            s1_words.extend(encoded)
            s2_words.extend([second_pad] * len(encoded))
        else:
            s1_words.extend([first_pad] * len(encoded))
            s2_words.extend(encoded)

    # zip() stops at the number of speakers actually found, so transcripts with
    # zero or one speaker yield a shorter dict instead of an IndexError.
    return dict(zip(speakers, (s1_words, s2_words)))

# --- Main nested data structure builder ---
# Layout: all_conversations[condition][conversation] -> {speaker: series},
# where `condition` is the name of a sub-directory of DATA_PATH and
# `conversation` is the stem of a .txt file inside that sub-directory.
all_conversations = {}

# is_dir() instead of exists(): a regular file at DATA_PATH would pass
# exists() and then make iterdir() raise NotADirectoryError.
if DATA_PATH.is_dir():
    for condition_dir in sorted(d for d in DATA_PATH.iterdir() if d.is_dir()):
        condition_name = condition_dir.name
        all_conversations[condition_name] = {}

        for txt_file in sorted(condition_dir.glob("*.txt")):
            # errors="replace": undecodable bytes become U+FFFD rather than
            # aborting the whole load.
            text = txt_file.read_text(encoding="utf-8", errors="replace")
            speaker_series = process_conversation(text)
            all_conversations[condition_name][txt_file.stem] = speaker_series
else:
    # Make the "nothing was loaded" case visible instead of continuing silently.
    print(f"Data directory not found: {DATA_PATH.resolve()}")

# --- Example summary output ---
for condition, convs in all_conversations.items():
    print(f"\nCondition: {condition}")
    for conv_name, data in convs.items():
        print(f"  Conversation: {conv_name}")
        for speaker, series in data.items():
            # Word IDs start at 1; the negative entries are the alignment
            # placeholders inserted while the other speaker talks, so they
            # must not be counted as words of this speaker.
            n_words = sum(1 for token in series if token > 0)
            print(
                f"    {speaker}: {n_words} words in {len(series)} aligned slots "
                f"(example: {series[:10]})"
            )


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

def compute_crqa_metrics(s1, s2, min_diag_len=2):
    """Compute Recurrence Rate and Determinism between two numeric sequences.

    A cross-recurrence matrix ``R`` is built with ``R[i, j]`` true exactly when
    ``s1[i] == s2[j]``.

    * Recurrence Rate (RR): number of true cells divided by the total number
      of cells.
    * Determinism (DET): share of the true cells that lie on a diagonal run
      (consecutive true cells along one diagonal) of at least
      ``min_diag_len`` cells.

    Args:
        s1: First sequence (anything ``np.asarray`` turns into a 1-D array).
        s2: Second sequence, same kind as ``s1``.
        min_diag_len: Minimum run length for a diagonal line to count
            towards DET.

    Returns:
        Tuple ``(rr, det)`` of floats. Both are NaN when either sequence is
        empty; ``det`` is NaN when there are no recurrent points at all.
    """
    s1 = np.asarray(s1)
    s2 = np.asarray(s2)

    if s1.size == 0 or s2.size == 0:
        return np.nan, np.nan

    # Keep the matrix boolean: one byte per cell instead of eight for int64.
    R = s1[:, None] == s2[None, :]
    total_points = R.size
    recurrent_points = int(np.count_nonzero(R))
    rr = recurrent_points / total_points

    # Determinism: proportion of recurrence points that form diagonals >= min_diag_len
    det_points = 0
    for k in range(-len(s1) + 1, len(s2)):
        diag = np.diagonal(R, offset=k)
        if not diag.any():
            continue  # nothing recurrent on this diagonal

        # Pad with False on both sides so every run has a start and an end
        # transition; transitions then alternate start, end, start, end, ...
        padded = np.zeros(diag.size + 2, dtype=bool)
        padded[1:-1] = diag
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        run_lengths = edges[1::2] - edges[::2]
        det_points += int(run_lengths[run_lengths >= min_diag_len].sum())

    det = det_points / recurrent_points if recurrent_points > 0 else np.nan
    return float(rr), float(det)


# ---- Main CRQA loop ----
# One result row per conversation; conversations whose speaker dict does not
# hold exactly two streams are left out.
records = []

for condition, convs in all_conversations.items():
    for conv_name, data in convs.items():
        speakers = list(data)
        if len(speakers) == 2:
            name_a, name_b = speakers
            s1, s2 = np.array(data[name_a]), np.array(data[name_b])

            rr, det = compute_crqa_metrics(s1, s2)
            records.append(
                dict(
                    Condition=condition,
                    Conversation=conv_name,
                    Speaker1=name_a,
                    Speaker2=name_b,
                    RR=rr,
                    DET=det,
                )
            )
        


<class 'list'>


In [None]:

# Quick look at the results: one "condition, RR, DET" line per conversation.
for entry in records:
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions older than 3.12.
    print(f"{entry['Condition']}, {entry['RR']}, {entry['DET']}")


chatgpt5_chats, 0.00113862462048895, 0.07840616966580977
chatgpt5_chats, 0.0012968146371765078, 0.09351145038167939
chatgpt5_chats, 0.0013714948616375812, 0.1225296442687747
chatgpt5_chats, 0.001549296748320315, 0.14328210213187903
chatgpt5_chats, 0.001168065232698824, 0.07410491257285595
chatgpt5_chats, 0.0014134137897830702, 0.08840227088402271
chatgpt5_chats, 0.001707348952717054, 0.06539074960127592
chatgpt5_chats, 0.0012803605093512605, 0.04963235294117647
chatgpt5_chats, 0.0012048192771084338, 0.0855421686746988
claude_haiku-45_chats, 0.0010421910371570804, 0.0846286701208981
claude_haiku-45_chats, 0.001580609095490048, 0.12391033623910336
claude_haiku-45_chats, 0.0016128352167313206, 0.07770515613652869
claude_haiku-45_chats, 0.0013077498710413321, 0.10617283950617284
claude_haiku-45_chats, 0.00168307320957956, 0.12804878048780488
claude_haiku-45_chats, 0.0010557726861645969, 0.07830188679245283
claude_haiku-45_chats, 0.001499236975518421, 0.11177347242921014
claude_haiku-45_cha

In [29]:
import csv

output_file = "conversation_crqa_data.csv"

# Column order of the CSV; the same names are the keys of each record dict.
columns = ["Condition", "Conversation", "Speaker1", "Speaker2", "RR", "DET"]

# newline="" leaves line-ending handling to the csv module.
with open(output_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(columns)  # header row

    # One line per record, values taken in column order.
    for entry in records:
        writer.writerow([entry[col] for col in columns])

print(f"✅ CSV file written to: {output_file}")

✅ CSV file written to: conversation_crqa_data.csv
