In [None]:
import os, h5py, re, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook-wide matplotlib defaults: 8x4 inch figures, grid lines switched off.
plt.rcParams.update({
    "figure.figsize": (8, 4),
    "axes.grid": False,
})

In [None]:
# ✏️ Point this at the HDF5 file you want to analyze.
H5_PATH = Path("../data/hdf5_data_final/t15.2023.08.13/data_train.hdf5")

# Stop here with a readable message if the path is wrong, rather than failing
# later when a downstream cell tries to open the file.
assert H5_PATH.exists(), f"Not found: {H5_PATH}"

# Echo the path as the cell output so the analyzed file is visible in the notebook.
H5_PATH

In [None]:
# Build `df_trials`: one row of per-trial metadata for every trial group in the file.
#
# Only HDF5 attributes are read here. The per-trial datasets (`input_features`,
# `seq_class_ids`, `transcription`) are not used by this summary, so they are
# left on disk instead of being loaded in full and discarded on every iteration.
rows = []
with h5py.File(H5_PATH, "r") as f:
    for trial_key in f.keys():
        g = f[trial_key]

        # Required attributes: the lookup fails loudly if one is missing.
        n_time_steps = g.attrs['n_time_steps']
        session = g.attrs['session']
        block_num = g.attrs['block_num']
        trial_num = g.attrs['trial_num']

        # Optional attributes: fall back to None when the trial does not carry them.
        seq_len = g.attrs['seq_len'] if 'seq_len' in g.attrs else None
        sentence_label = g.attrs['sentence_label'][:] if 'sentence_label' in g.attrs else None

        # A trial without a label has no word count. Calling .split() on None
        # would raise AttributeError, so propagate the missing value instead
        # (pandas stores it as NaN/None in the resulting column).
        word_count = len(sentence_label.split()) if sentence_label is not None else None

        rows.append({
            "trial_key": trial_key,
            "session": session,
            "block_num": block_num,
            "trial_num": trial_num,
            "n_time_steps": n_time_steps,
            "seq_len": seq_len,
            "sentence_label": sentence_label,
            "word_count": word_count,
        })

# Build the frame once from the list of records, then preview it as the cell output.
df_trials = pd.DataFrame(rows)
df_trials.head()

In [None]:
# Derived per-trial columns.
# Seconds represented by one time step; presumably 20 ms bins — TODO confirm
# against the dataset documentation.
SECONDS_PER_STEP = 0.02

# Trial length in seconds, and label length in characters (NaN where the label is missing).
df_trials["duration_sec"] = df_trials["n_time_steps"].mul(SECONDS_PER_STEP)
df_trials["char_count"] = df_trials["sentence_label"].str.len()

In [None]:
# Explore distributions: side-by-side histograms of trial duration and of
# sentence length (in words and in characters).
#
# `plt` comes from the notebook's import cell; it is not re-imported here.
hist_specs = [
    # (column, number of bins, panel title, x-axis label)
    ("duration_sec", 40, "Trial duration (seconds)", "Duration (s)"),
    ("word_count", 30, "Sentence word counts", "Words per sentence"),
    ("char_count", 30, "Sentence character counts", "Characters per sentence"),
]

fig, axes = plt.subplots(1, len(hist_specs), figsize=(15, 4))
for ax, (column, bins, title, xlabel) in zip(axes, hist_specs):
    # Drop missing values (e.g. trials without a sentence label) so they
    # cannot interfere with the automatic bin-range detection.
    ax.hist(df_trials[column].dropna(), bins=bins)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of trials")

plt.tight_layout()
plt.show()

In [None]:
# Relationships between inputs and outputs:
# one point per trial, word count on x and trial duration on y.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(
    df_trials["word_count"],
    df_trials["duration_sec"],
    s=10,
    alpha=0.5,
)
ax.set_xlabel("Word count")
ax.set_ylabel("Trial duration (seconds)")
ax.set_title("Trial duration vs sentence length")
plt.show()

In [None]:
# Show the five most extreme trials at each end of three measures:
# duration, word count, and character count.
# Each entry is (printed heading, DataFrame selector method, column to rank by).
extreme_views = [
    # Shortest / longest by duration
    ("Shortest trials:", "nsmallest", "duration_sec"),
    ("\nLongest trials:", "nlargest", "duration_sec"),
    # Shortest / longest by word count
    ("Fewest words:", "nsmallest", "word_count"),
    ("\nMost words:", "nlargest", "word_count"),
    # Shortest / longest by character count
    ("\nFewest characters:", "nsmallest", "char_count"),
    ("\nMost characters:", "nlargest", "char_count"),
]

for heading, selector, column in extreme_views:
    print(heading)
    # Look up nsmallest/nlargest by name, take the top 5 rows, and show only
    # the identifying key, the ranked measure, and the sentence text.
    top_rows = getattr(df_trials, selector)(5, column)
    display(top_rows[["trial_key", column, "sentence_label"]])



In [None]:
# Sentence Content Correlation:
# pairwise correlation between trial length (in time steps) and sentence
# length (in words and in characters), shown as the cell output.
corr_columns = ["n_time_steps", "word_count", "char_count"]
df_trials[corr_columns].corr()

In [None]:
# Mean trial duration within each block, drawn as a bar chart.
mean_duration_by_block = df_trials.groupby("block_num")["duration_sec"].mean()

fig, ax = plt.subplots(figsize=(8, 4))
mean_duration_by_block.plot.bar(ax=ax)
ax.set_ylabel("Avg trial duration (s)")
ax.set_title("Average trial length by block")
plt.show()