In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import glob
from preproc_helpers import *

In [2]:
# Tunable preprocessing constants (try changing them to find the best values).
# NOTE(review): none of these names is referenced again in this notebook and none
# is passed as an argument to a helper call below. If the preproc_helpers
# functions read their own module-level values, editing the numbers here has no
# effect on the pipeline -- TODO confirm against preproc_helpers.
# Meanings below are inferred from the names only; verify before relying on them.
# Presumably the number of rows per segment.
SEGMENT_LENGTH = 900
# Presumably the longest gap that still gets interpolated.
GAP_INTERPOLATION_LIMIT = 6
# Presumably the gap length from which a gap counts as "long".
LONG_GAP_THRESHOLD = 7
# Presumably the cap applied to the Speed column.
MAX_SPEED_THRESHOLD = 10.0

In [3]:
# Load the raw per-worm tracking CSVs for both conditions.
ROOT = os.getcwd()
DATA_DIR = os.path.join(ROOT, "new_test")
control_dir = os.path.join(DATA_DIR, "noTerbinafine")
treated_dir = os.path.join(DATA_DIR, "Terbinafine")

# summary file (placeholder kept from the original cell; nothing is loaded for it)


def _load_condition_files(files, condition):
    """Read every CSV in `files` and tag it with its condition and origin.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read.
    condition : str
        Label written to the "condition" column of every returned frame.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input path, in the same order, each with two extra
        columns: "condition" and "source_file" (the file's base name, used
        later to name the output files).
    """
    frames = []
    for path in files:
        frame = pd.read_csv(path)
        frame["condition"] = condition
        frame["source_file"] = os.path.basename(path)
        frames.append(frame)
    return frames


# Take every CSV of each condition. glob returns paths in a filesystem-dependent
# order, and the worm ids assigned further down come from the list position, so
# sort the paths to get the same ids on every run and on every machine.
control_files = sorted(glob.glob(os.path.join(control_dir, "*.csv")))
treated_files = sorted(glob.glob(os.path.join(treated_dir, "*.csv")))

control_dfs = _load_condition_files(control_files, "control")
treated_dfs = _load_condition_files(treated_files, "terbinafine")

treated_dfs[0].head()

Unnamed: 0,GlobalFrame,Timestamp,Speed,Fragment,LocalFrame,X,Y,condition,source_file
0,1,2025-05-29T09:23:24.002461,,2,0,567.753799,544.069909,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
1,2,2025-05-26T14:53:26.008723,,2,1,567.767372,544.416918,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
2,3,2025-05-26T14:53:28.002073,7.875247,2,2,569.303303,544.765766,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
3,4,2025-05-26T14:53:30.002368,0.36474,2,3,569.376119,544.770149,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
4,5,2025-05-26T14:53:32.002478,0.664785,2,4,569.293413,544.874252,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...


In [6]:

# "Fragment" and "LocalFrame" can be ignored (per Alice's explanation), so drop both.
unused_columns = ["Fragment", "LocalFrame"]
control_dfs_ = [frame.drop(columns=unused_columns) for frame in control_dfs]
treated_dfs_ = [frame.drop(columns=unused_columns) for frame in treated_dfs]

# Discard the very first row of every file (useless, and removing it aligns the
# segments), then renumber the index from 0.
control_dfs_1 = [frame.iloc[1:].reset_index(drop=True) for frame in control_dfs_]
treated_dfs_1 = [frame.iloc[1:].reset_index(drop=True) for frame in treated_dfs_]

# Cap extreme speeds (cap_extreme_speeds is not defined in this notebook;
# presumably it comes from preproc_helpers).
control_dfs_cap = list(map(cap_extreme_speeds, control_dfs_1))
treated_dfs_cap = list(map(cap_extreme_speeds, treated_dfs_1))

# Sanity check: shape of the first treated frame before and after these steps.
for frames in (treated_dfs, treated_dfs_cap):
    print(frames[0].shape)
treated_dfs_cap[0].head()


(55801, 9)
(55800, 7)


Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file
0,2,2025-05-26T14:53:26.008723,,567.767372,544.416918,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
1,3,2025-05-26T14:53:28.002073,7.875247,569.303303,544.765766,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
2,4,2025-05-26T14:53:30.002368,0.36474,569.376119,544.770149,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
3,5,2025-05-26T14:53:32.002478,0.664785,569.293413,544.874252,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...
4,6,2025-05-26T14:53:34.002446,1.977771,568.934911,544.707101,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...


In [7]:
# Split every worm's capped trajectory into segments.
# Expected nesting (in theory): one list per condition, holding one list per
# worm, which in turn holds one DataFrame per segment.
segments_ctrl = list(map(split_into_segments, control_dfs_cap))
segments_treat = list(map(split_into_segments, treated_dfs_cap))

In [8]:
# Clean every segment of every worm, for both conditions.
def _clean_worm_segments(worm_segments):
    """Run the per-segment cleaning steps on all segments of one worm.

    Each segment goes, in order, through `clean_segment_gaps`,
    `calculate_turning_angle`, removal of the "Segment" column and
    `fill_nans_with_next_value`. The three helpers are not defined in this
    notebook (presumably they come from preproc_helpers).

    Parameters
    ----------
    worm_segments : iterable of pandas.DataFrame
        The segments of a single worm.

    Returns
    -------
    list of pandas.DataFrame
        The cleaned segments, in the same order as the input.
    """
    cleaned = []
    for segment in worm_segments:
        segment = clean_segment_gaps(segment)
        segment = calculate_turning_angle(segment)
        # Remove the "Segment" column: there is no need for two segment
        # columns ("Segment_index" is the one that stays).
        segment = segment.drop(columns=["Segment"])
        segment = fill_nans_with_next_value(segment)
        cleaned.append(segment)
    return cleaned


# Same nesting as the input: one list per worm, one DataFrame per segment.
cleaned_segments_ctrl = [_clean_worm_segments(worm) for worm in segments_ctrl]
cleaned_segments_treat = [_clean_worm_segments(worm) for worm in segments_treat]


  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().

In [9]:
def count_nans_in_df(df, cols=("X", "Y", "Speed", "turning_angle")):
    """Count the missing values of `df` within the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Frame to inspect. `None` is accepted and counts as zero NaNs.
    cols : iterable of str, optional
        Column names to inspect. Names absent from `df` are skipped, so no
        KeyError is raised for them.

    Returns
    -------
    int
        Total number of NaN cells in the inspected columns. Always a plain
        Python int, including when none of the columns exists.
    """
    if df is None:
        return 0
    # Only look at the columns that actually exist (avoids KeyErrors).
    cols_present = [c for c in cols if c in df.columns]
    # Summing the boolean mask as an array keeps the result an integer even
    # when `cols_present` is empty.
    return int(df[cols_present].isna().to_numpy().sum())


# Add up the remaining NaNs over every segment of every worm, per condition.
total_nans_ctrl = sum(
    count_nans_in_df(segment)
    for worm_segments in cleaned_segments_ctrl
    for segment in worm_segments
)
print("Total NaNs in cleaned control segments:", total_nans_ctrl)

total_nans_treat = sum(
    count_nans_in_df(segment)
    for worm_segments in cleaned_segments_treat
    for segment in worm_segments
)
print("Total NaNs in cleaned treated segments:", total_nans_treat)


Total NaNs in cleaned control segments: 0
Total NaNs in cleaned treated segments: 0


In [10]:
# Verification: show the last rows of the first cleaned segment of the first
# treated worm.
cleaned_segments_treat[0][0].tail()

Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file,Segment_index,turning_angle
895,897,2025-05-26T15:23:16.002524,0.639544,577.77551,550.897959,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0,-109.724823
896,898,2025-05-26T15:23:18.002316,2.110154,578.142293,551.106719,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0,-52.273208
897,899,2025-05-26T15:23:20.002478,0.909563,578.310204,551.036735,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0,-27.93477
898,900,2025-05-26T15:23:22.002428,4.073362,578.827731,550.407563,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0,117.897043
899,901,2025-05-26T15:23:24.002440,3.839165,579.123595,551.116105,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0,0.0


Maintenant (en théorie) les segments sont clean : il faut tout reconcaténer pour chaque worm et normaliser par rapport à la trajectoire entière du worm. Les trajectoires full peuvent être enregistrées, puis on re-sépare en segments et on enregistre les segments.

In [13]:
def assign_worm_id(df, worm_id):
    """
    Return a copy of `df` carrying a "worm_id" column set to `worm_id`.

    The same id is written to every row; the input frame is left untouched.
    """
    tagged = df.copy()
    tagged["worm_id"] = worm_id
    return tagged



In [20]:
# For each worm: stitch its cleaned segments back into one trajectory, normalize
# that whole trajectory, then stamp it with a worm id.
n_ctrl = len(cleaned_segments_ctrl)

# Control worms are numbered 0 .. n_ctrl - 1.
control_dfs_final = [
    assign_worm_id(
        normalize_trajectory_data(pd.concat(worm_segments, ignore_index=True)),
        worm_number,
    )
    for worm_number, worm_segments in enumerate(cleaned_segments_ctrl)
]

# Treated worms continue the numbering at n_ctrl so ids stay unique across
# the two conditions.
treat_dfs_final = [
    assign_worm_id(
        normalize_trajectory_data(pd.concat(worm_segments, ignore_index=True)),
        worm_number,
    )
    for worm_number, worm_segments in enumerate(cleaned_segments_treat, start=n_ctrl)
]


In [21]:
treat_dfs_final[1].head()

Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file,Segment_index,turning_angle,worm_id
0,2.0,2025-05-26T14:53:26.008723,2.338125,0.840036,0.689034,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0.0,0.0,11
1,3.0,2025-05-26T14:53:28.002073,2.338125,0.84213,0.687405,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0.0,0.360937,11
2,4.0,2025-05-26T14:53:30.002368,-0.404561,0.842539,0.687614,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0.0,-0.097267,11
3,5.0,2025-05-26T14:53:32.002478,-0.525424,0.842897,0.687675,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0.0,0.668874,11
4,6.0,2025-05-26T14:53:34.002446,-0.924303,0.842869,0.687709,terbinafine,E.Gencturk_20250526_1453_N2-lifespan-FUDR+Terb...,0.0,0.913683,11


In [22]:
# Write one CSV per worm containing its full preprocessed trajectory.

output_dir = "test_data/full"
os.makedirs(output_dir, exist_ok=True)

# Control worms first, then treated ones; both go to the same directory.
for df in control_dfs_final + treat_dfs_final:
    # Name the output after the raw file the worm was read from.
    stem = os.path.splitext(df["source_file"].iloc[0])[0]
    save_worm_csv(df, f"{stem}-preprocessed.csv", output_dir)


In [23]:
# Save every segment to its own CSV (the full trajectories must be re-split first).

segments_dir = "test_data/segments"
os.makedirs(segments_dir, exist_ok=True)

segments_ctrl_final = [split_into_segments(df) for df in control_dfs_final]
segments_treat_final = [split_into_segments(df) for df in treat_dfs_final]


def _fragment_label(value):
    """Return `value` as an int when it is a whole number, otherwise unchanged.

    "Segment_index" is displayed as a float (0.0) in the normalized frames, which
    would give file names such as "...-fragment0.0-preprocessed.csv". Whole
    numbers are therefore formatted as ints ("fragment0"). Values that are
    already ints, or that are not whole numbers (NaN included), keep their
    original text.
    """
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return value
    return int(as_float) if as_float.is_integer() else value


# Control worms first, then treated ones; both go to the same directory.
for worm in segments_ctrl_final + segments_treat_final:
    for df in worm:
        # Skip empty segments: there is no row to read the file name from.
        if df.empty:
            continue

        original = df["source_file"].iloc[0]
        base = os.path.splitext(original)[0]
        index = _fragment_label(df["Segment_index"].iloc[0])
        new_name = f"{base}-fragment{index}-preprocessed.csv"
        save_worm_csv(df, new_name, segments_dir)