In [2]:
import pandas as pd
from pathlib import Path

# === Settings ===
# Workbook holding the manual grooming annotations; every sheet in it is processed.
input_path = Path(
    r"Z:\UriMons\EPA\New_EPA\Experiment\Behavior\Grooming_Annotations\Manual_Grooming_Classification.xlsx"
)

# The result is written next to the input, same extension, with a suffix
# appended to the file stem so the source workbook is never overwritten.
output_path = input_path.with_name(
    f"{input_path.stem}_withMergedBouts_andSummary{input_path.suffix}"
)

# --- Column layout of each sheet ---
# The frames column is looked up by header name; the others by 0-based position.
FRAMES_COL_NAME = "G frames"  # column D: length of the bout segment, in frames
BOUT_NUMBER_COL_IDX = 0       # column A: bout number
TIME_COL_IDX = 5              # column F: bout time (A=0, ..., F=5)
INTERVAL_COL_IDX = 6          # column G: interval between this bout and the next

# Multiplier that converts column F to minutes.
# Leave at 1.0 when F is already in minutes; use 1/60.0 when F is in seconds.
TIME_UNIT_SCALE = 1.0  # assuming F is in minutes

# Time bins as (start_min, end_min, label); start is inclusive, end exclusive.
TIME_BINS = [
    (0, 10, "0–10 min"),
    (10, 20, "10–20 min"),
    (20, 30, "20–30 min"),
]


def classify_bout(total_frames, long_threshold=90):
    """Classify a merged bout as short or long from its total frame count.

    Parameters
    ----------
    total_frames
        Total number of frames in the merged bout. Anything ``float()``
        accepts (numbers, numeric strings) is allowed.
    long_threshold
        Frame count at which a bout becomes "long". Bouts with fewer frames
        are "short". Defaults to 90.

    Returns
    -------
    str or pandas.NA
        ``"Short grooming bout"`` or ``"Long Grooming Bout"``; ``pd.NA`` when
        the value is missing, NaN, or not convertible to a number.
    """
    try:
        frames = float(total_frames)
    except (TypeError, ValueError):
        return pd.NA

    # float("nan") converts without error, and every comparison with NaN is
    # False, so without this guard a missing count would be labelled "long".
    if pd.isna(frames):
        return pd.NA

    if frames < long_threshold:
        return "Short grooming bout"
    return "Long Grooming Bout"


def assign_time_bin(t_minutes, bins=None, overflow_label=">30 min"):
    """Assign a bout start time (in minutes) to a time bin.

    Parameters
    ----------
    t_minutes
        Start time in minutes; anything ``float()`` accepts.
    bins
        Iterable of ``(start, end, label)`` tuples, matched as
        ``start <= t < end``. Defaults to the module-level ``TIME_BINS``.
    overflow_label
        Label returned for times at or beyond the end of the last bin.

    Returns
    -------
    str or pandas.NA
        The label of the matching bin, ``overflow_label`` for times past the
        last bin, or ``pd.NA`` when the time is missing, non-numeric, or
        falls before/between the bins (e.g. a negative time).
    """
    if bins is None:
        bins = TIME_BINS

    if pd.isna(t_minutes):
        return pd.NA
    try:
        t = float(t_minutes)
    except (TypeError, ValueError):
        return pd.NA

    for start, end, label in bins:
        if start <= t < end:
            return label

    # No bin matched. Only times at/after the last bin end are genuine
    # overflow; anything else (such as a negative time) is invalid data and
    # must not be counted under the overflow label.
    if bins and t < max(end for _, end, _ in bins):
        return pd.NA
    return overflow_label


# === Processing parameters ===
# Largest interval (column G) for which two consecutive rows still belong to
# the same merged bout; a larger interval starts a new merged bout.
MAX_MERGE_INTERVAL = 90

# Fixed column layout of the summary table (the labels classify_bout returns).
BOUT_TYPE_COLUMNS = ["Short grooming bout", "Long Grooming Bout"]


def _merged_bout_ids(intervals, max_gap=MAX_MERGE_INTERVAL):
    """Return a 1-based merged-bout ID for every row of *intervals*.

    ``intervals[i]`` is the gap between row ``i`` and row ``i + 1``. A row
    joins the previous row's merged bout when the gap preceding it is
    ``<= max_gap``; a larger, missing, or non-numeric gap starts a new bout.
    """
    gaps = pd.to_numeric(intervals, errors="coerce").astype("float64")
    # shift(1) lines each row up with the gap that precedes it. The first row
    # gets NaN, and NaN <= max_gap is False, so it always opens bout 1.
    continues_previous = gaps.shift(1).le(max_gap)
    return (~continues_previous).cumsum()


def _summarize_bouts(merged_bouts):
    """Count merged bouts per time bin and bout type, plus an "All" totals row.

    Rows follow the order of ``TIME_BINS`` (every configured bin is present,
    with zeros when it has no bouts), followed by any other label that
    occurs in the data. Columns are always short, long, total.
    """
    valid = merged_bouts.dropna(subset=["BoutType", "TimeBin"])
    counts = (
        valid.groupby(["TimeBin", "BoutType"])
        .size()
        .unstack(fill_value=0)
    )

    # Explicit ordering instead of sorting the labels as strings, which only
    # happens to give the right order for the current label texts.
    row_order = [label for _, _, label in TIME_BINS]
    row_order += [label for label in counts.index if label not in row_order]
    counts = counts.reindex(
        index=row_order, columns=BOUT_TYPE_COLUMNS, fill_value=0
    )

    counts["Total bouts"] = counts[BOUT_TYPE_COLUMNS].sum(axis=1)

    totals = pd.DataFrame(
        {col: [counts[col].sum()] for col in counts.columns},
        index=["All"],
    )
    return pd.concat([counts, totals])


# === Load all sheets ===
sheets = pd.read_excel(input_path, sheet_name=None)

# Number of columns a sheet needs for the positional (iloc) lookups below.
min_columns = max(BOUT_NUMBER_COL_IDX, TIME_COL_IDX, INTERVAL_COL_IDX) + 1

with pd.ExcelWriter(output_path) as writer:
    for sheet_name, df in sheets.items():
        # Basic checks
        if FRAMES_COL_NAME not in df.columns:
            raise ValueError(
                f'Column "{FRAMES_COL_NAME}" not found in sheet "{sheet_name}"!'
            )
        if df.shape[1] < min_columns:
            raise ValueError(
                f'Sheet "{sheet_name}" has {df.shape[1]} columns, '
                f"but at least {min_columns} are required!"
            )

        # Extract raw columns
        bout_num_col_name = df.columns[BOUT_NUMBER_COL_IDX]
        time_series_raw = df.iloc[:, TIME_COL_IDX]
        interval_series = df.iloc[:, INTERVAL_COL_IDX]

        # Time in minutes (non-numeric entries become NaN)
        df["Time_min"] = (
            pd.to_numeric(time_series_raw, errors="coerce") * TIME_UNIT_SCALE
        )

        # Group rows into merged bouts based on the intervals in column G.
        df["MergedBoutID"] = _merged_bout_ids(interval_series)

        # Aggregate to merged-bout level
        merged_bouts = (
            df.groupby("MergedBoutID", dropna=True)
            .agg(
                BoutNumberStart=(bout_num_col_name, "min"),
                BoutNumberEnd=(bout_num_col_name, "max"),
                TotalFrames=(FRAMES_COL_NAME, "sum"),
                StartTime_min=("Time_min", "min"),
                EndTime_min=("Time_min", "max"),
            )
            .reset_index(drop=True)
        )

        # Classify merged bouts and bin them by start time
        merged_bouts["BoutType"] = merged_bouts["TotalFrames"].apply(classify_bout)
        merged_bouts["TimeBin"] = merged_bouts["StartTime_min"].apply(assign_time_bin)

        # Add a simple BoutID for readability
        merged_bouts.insert(0, "BoutID", range(1, len(merged_bouts) + 1))

        # === Build summary table based on merged bouts ===
        summary = _summarize_bouts(merged_bouts)

        # === Write to Excel ===
        # 1. Merged bouts table (per mouse) at the top of the sheet
        merged_bouts.to_excel(writer, sheet_name=sheet_name, index=False)

        # 2. Summary table starting at column K, clear of the merged-bouts
        #    table, which occupies columns A-H
        summary.to_excel(
            writer,
            sheet_name=sheet_name,
            startrow=0,
            startcol=10,  # column K
        )

print(f"Done! Merged bouts + summary saved to:\n{output_path}")


Done! Merged bouts + summary saved to:
Z:\UriMons\EPA\New_EPA\Experiment\Behavior\Grooming_Annotations\Manual_Grooming_Classification_withMergedBouts_andSummary.xlsx
