In [29]:
from pathlib import Path
import re
import pandas as pd

# Folder that is scanned (recursively) for the renamed video files.
root = Path("/Users/yusufmoola/Library/CloudStorage/OneDrive-UHN/videos_renamed")

# File-name layout, e.g. idapt801_sub344_DP_12_GP1_12-54-18.MP4:
#   <shoe_id>_<participant>_<dir_result>_<angle>[_<group>]_<time>.MP4
# The GP<n> group segment is optional; the angle may carry a decimal part.
_filename_regex = (
    r"^(?P<shoe_id>idapt\d+)_"
    r"(?P<participant>sub\d+)_"
    r"(?P<dir_result>[A-Z]{2})_"
    r"(?P<angle>\d+(?:\.\d+)?)_"
    r"(?:(?P<group>GP\d+)_)?"
    r"(?P<time>\d{2}-\d{2}-\d{2})\.MP4$"
)
# Matching is case-insensitive, so lower-case names/extensions match as well.
pattern = re.compile(_filename_regex, flags=re.IGNORECASE)

# Two-letter code from the file name -> (direction, result).
dir_result_map = dict(
    UP=("Up", "Pass"),
    UF=("Up", "Fail"),
    DP=("Down", "Pass"),
    DF=("Down", "Fail"),
    UU=("Up", "Undecided"),
    DU=("Down", "Undecided"),
)

# Columns of the inventory table, in display order. Passing them explicitly
# to the DataFrame constructor keeps the columns present even when the scan
# finds no video at all (an empty row list would otherwise give a frame
# without a "parsed_ok" column).
_INVENTORY_COLUMNS = [
    "file",
    "shoe_id",
    "participant",
    "direction",
    "result",
    "angle",
    "time",
    "parsed_ok",
]


def _parse_video_path(fp):
    """Build one inventory row (a dict) from a video path.

    The file name is matched against ``pattern``.

    * No match: every metadata field is ``None`` and ``parsed_ok`` is False.
    * Match: the fields are filled from the named groups. ``shoe_id`` and
      ``participant`` are lower-cased, the two-letter code is looked up in
      ``dir_result_map`` and the angle is rounded to an integer.
    * Match, but the two-letter code is not in ``dir_result_map``: direction
      and result stay ``None`` and ``parsed_ok`` is False, so the file shows
      up in the "Failed parse" count instead of passing silently.

    The optional GP<n> group segment is matched but not kept in the row.
    """
    row = dict.fromkeys(_INVENTORY_COLUMNS)
    row["file"] = str(fp)
    row["parsed_ok"] = False

    m = pattern.match(fp.name)
    if m is None:
        return row

    code = m["dir_result"].upper()
    direction, result = dir_result_map.get(code, (None, None))
    row.update(
        shoe_id=m["shoe_id"].lower(),
        participant=m["participant"].lower(),
        direction=direction,
        result=result,
        # NOTE: round() rounds exact halves to the nearest even integer
        # (12.5 -> 12, 13.5 -> 14).
        angle=int(round(float(m["angle"]))),
        time=m["time"],
        parsed_ok=direction is not None,
    )
    return row


# Select videos by lower-cased suffix rather than globbing for "*.MP4": the
# glob can be case-sensitive, which would skip ".mp4" files even though the
# file-name regex itself ignores case. Sorting makes the row order
# independent of the file system's listing order.
video_paths = sorted(
    fp
    for fp in root.rglob("*")
    if fp.is_file() and fp.suffix.lower() == ".mp4"
)

rows = [_parse_video_path(fp) for fp in video_paths]

# astype(bool) keeps "parsed_ok" boolean even for an empty frame, so that
# `~df["parsed_ok"]` stays a logical negation.
df = pd.DataFrame(rows, columns=_INVENTORY_COLUMNS).astype({"parsed_ok": bool})

def _show(title, table):
    """Print a section heading, then render ``table`` via the notebook's ``display``."""
    print(title)
    display(table)  # `display` is provided by the IPython/Jupyter session


# Basic overview (printed once).
print("Total MP4 files:", len(df))
print("Parsed OK:", df["parsed_ok"].sum())
print("Failed parse:", (~df["parsed_ok"]).sum())

# Missingness
_show("\nMissing values:", df.isna().sum())

# Uniques
print("\nUnique participants:", df["participant"].nunique(dropna=True))
print("Unique shoes:", df["shoe_id"].nunique(dropna=True))
print("Unique angles:", df["angle"].nunique(dropna=True))

# Distributions: one frequency table per categorical column, most frequent first.
for title, column in (
    ("\nParticipant counts:", "participant"),
    ("\nShoe counts:", "shoe_id"),
    ("\nDirection counts:", "direction"),
    ("\nResult counts:", "result"),
):
    _show(title, df[column].value_counts(dropna=True))

# Angles are listed in ascending angle order rather than by frequency.
_show("\nAngle counts:", df["angle"].value_counts(dropna=True).sort_index())

# Cross-tabs: (heading, row variable, column variable).
for title, row_var, col_var in (
    ("\nParticipant x Shoe:", "participant", "shoe_id"),
    ("\nDirection x Result:", "direction", "result"),
    ("\nAngle x Result:", "angle", "result"),
):
    _show(title, pd.crosstab(df[row_var], df[col_var]))



Total MP4 files: 964
Parsed OK: 964
Failed parse: 0
Total files: 964
Parsed OK: 964
Failed parse: 0

Missing values:


file           0
shoe_id        0
participant    0
direction      0
result         0
angle          0
time           0
parsed_ok      0
dtype: int64


Unique participants: 13
Unique shoes: 12
Unique angles: 13

Participant counts:


participant
sub344    184
sub349    165
sub353     73
sub295     68
sub357     63
sub360     59
sub364     59
sub373     58
sub367     54
sub368     48
sub346     47
sub354     45
sub352     41
Name: count, dtype: int64


Shoe counts:


shoe_id
idapt797    132
idapt799    129
idapt796    114
idapt803    110
idapt800     92
idapt802     91
idapt798     75
idapt801     68
idapt805     51
idapt804     44
idapt817     31
idapt816     27
Name: count, dtype: int64


Direction counts:


direction
Up      493
Down    471
Name: count, dtype: int64


Result counts:


result
Pass    724
Fail    240
Name: count, dtype: int64


Angle counts:


angle
0      72
3      76
5      77
6       2
7      82
8      20
9     101
10     44
11    135
12    113
13    131
14     55
15     56
Name: count, dtype: int64


Participant x Shoe:


shoe_id,idapt796,idapt797,idapt798,idapt799,idapt800,idapt801,idapt802,idapt803,idapt804,idapt805,idapt816,idapt817
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
sub295,21,24,0,23,0,0,0,0,0,0,0,0
sub344,50,57,0,53,0,24,0,0,0,0,0,0
sub346,0,0,0,0,0,0,18,29,0,0,0,0
sub349,43,51,0,53,0,18,0,0,0,0,0,0
sub352,0,0,0,0,0,0,20,21,0,0,0,0
sub353,0,0,0,0,0,26,0,0,20,27,0,0
sub354,0,0,19,0,26,0,0,0,0,0,0,0
sub357,0,0,34,0,29,0,0,0,0,0,0,0
sub360,0,0,22,0,37,0,0,0,0,0,0,0
sub364,0,0,0,0,0,0,31,28,0,0,0,0



Direction x Result:


result,Fail,Pass
direction,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,137,334
Up,103,390



Angle x Result:


result,Fail,Pass
angle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,72
3,0,76
5,2,75
6,0,2
7,1,81
8,4,16
9,11,90
10,15,29
11,27,108
12,57,56


In [30]:
import pandas as pd

# Write the summary tables of `df` into one Excel workbook, one sheet each.
report_path = "summary_stats.xlsx"

# Sheet name -> table; sheets are written in this order.
sheets = {
    "describe": df.describe(include="all"),
    "participants": df["participant"].value_counts(dropna=True),
    "shoes": df["shoe_id"].value_counts(dropna=True),
    "direction": df["direction"].value_counts(dropna=True),
    "result": df["result"].value_counts(dropna=True),
    "angle": df["angle"].value_counts(dropna=True).sort_index(),
    "participant_x_shoe": pd.crosstab(df["participant"], df["shoe_id"]),
    "direction_x_result": pd.crosstab(df["direction"], df["result"]),
    "angle_x_result": pd.crosstab(df["angle"], df["result"]),
}

# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(report_path) as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_name)

print("Wrote:", report_path)


Wrote: summary_stats.xlsx


In [3]:
from pathlib import Path

# Roots of the two video trees that are indexed with collect_index() and then
# compared folder by folder. "first" / "second" in the printed report refer to
# these two roots, in this order.
# NOTE(review): both are absolute, machine-specific paths.
first_root = Path("/Users/yusufmoola/Desktop/videos_new_360p")
second_root = Path("/Users/yusufmoola/Library/CloudStorage/OneDrive-UHN/Li, Yue (Sophia)'s files - new video and tipper files/Videos")

def collect_index(root: Path):
    """Index the .mp4 files found two folder levels below ``root``.

    The tree is read as ``root/<date folder>/<participant folder>/<files>``.
    Entries that are not directories are skipped at the first two levels, and
    the walk does not descend any deeper than the participant folder.

    Args:
        root: Top-level directory to index.

    Returns:
        A dict mapping ``(date folder name, participant folder name)`` to the
        sorted list of file names in that folder whose suffix is ``.mp4``
        (compared case-insensitively). A participant folder without videos
        maps to an empty list.
    """

    def _subfolders(parent):
        # Directories directly inside `parent`, in listing order.
        return (entry for entry in parent.iterdir() if entry.is_dir())

    return {
        (date_dir.name, participant_dir.name): sorted(
            entry.name
            for entry in participant_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() == ".mp4"
        )
        for date_dir in _subfolders(root)
        for participant_dir in _subfolders(date_dir)
    }

def compare_indexes(idx_a, idx_b):
    """Diff two folder indexes (mappings of folder key -> list of file names).

    Args:
        idx_a: The "first" index.
        idx_b: The "second" index.

    Returns:
        A 3-tuple ``(only_a, only_b, missing)``:

        * ``only_a``: sorted keys present in ``idx_a`` but not in ``idx_b``.
        * ``only_b``: sorted keys present in ``idx_b`` but not in ``idx_a``.
        * ``missing``: one entry per key present in both indexes (inserted in
          sorted key order), each a dict with
          ``missing_from_first`` (names in b but not in a),
          ``missing_from_second`` (names in a but not in b),
          ``count_first`` and ``count_second`` (lengths of the two lists).
    """
    only_a = sorted(set(idx_a).difference(idx_b))
    only_b = sorted(set(idx_b).difference(idx_a))

    missing = {}
    for folder_key in sorted(set(idx_a).intersection(idx_b)):
        names_a = idx_a[folder_key]
        names_b = idx_b[folder_key]
        missing[folder_key] = dict(
            missing_from_first=sorted(set(names_b).difference(names_a)),
            missing_from_second=sorted(set(names_a).difference(names_b)),
            count_first=len(names_a),
            count_second=len(names_b),
        )
    return only_a, only_b, missing

# Index both trees (first, then second) and diff the two indexes.
idx_first = collect_index(first_root)
idx_second = collect_index(second_root)
only_first, only_second, missing = compare_indexes(idx_first, idx_second)

# Folders that exist in only one of the two trees.
one_sided = (
    ("Subfolders only in first:", only_first),
    ("\nSubfolders only in second:", only_second),
)
for heading, folder_keys in one_sided:
    print(heading, len(folder_keys))
    for date, participant in folder_keys:
        print(f"  {date}/{participant}")

# Folders present in both trees: report only those whose contents differ.
print("\nCommon subfolders with differences (by date/participant):")
for (date, participant), info in missing.items():
    absent_in_first = info["missing_from_first"]
    absent_in_second = info["missing_from_second"]
    counts_match = info["count_first"] == info["count_second"]
    if counts_match and not (absent_in_first or absent_in_second):
        continue  # identical on both sides, nothing to report

    print(f"- {date}/{participant}: first={info['count_first']} second={info['count_second']}")
    if absent_in_first:
        print(f"    Missing in first: {info['missing_from_first']}")
    if absent_in_second:
        print(f"    Missing in second: {info['missing_from_second']}")


Subfolders only in first: 0

Subfolders only in second: 0

Common subfolders with differences (by date/participant):
