# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import glob
from preproc_helpers import *

In [2]:
# Preprocessing constants.
# NOTE(review): none of these names is referenced again in the visible notebook cells.
# The helpers come from `from preproc_helpers import *` and are called without these
# values, so presumably they rely on their own module-level settings — confirm that
# changing a value here actually has an effect.
SEGMENT_LENGTH = 900  # presumably rows (frames) per segment — TODO confirm against split_into_segments
GAP_INTERPOLATION_LIMIT = 6  # presumably the longest NaN run that gets interpolated — TODO confirm
LONG_GAP_THRESHOLD = 7  # presumably the run length from which a gap counts as "long" — TODO confirm
MAX_SPEED_THRESHOLD = 10.0  # presumably the ceiling used by cap_extreme_speeds — TODO confirm value and units

In [3]:
# Load the RAW data: one CSV per recording, stored in one folder per condition.
ROOT = os.getcwd()
DATA_DIR = os.path.join(ROOT, "TERBINAFINE")
control_dir = os.path.join(DATA_DIR, "TERBINAFINE- (control)")
treated_dir = os.path.join(DATA_DIR, "TERBINAFINE+")

# Summary file (handed to trim_after_death in the following cells).
lifespan_df = pd.read_csv(os.path.join(DATA_DIR, "lifespan_summary.csv"))


def _load_condition_frames(directory, condition):
    """Read every CSV of one condition folder and tag each frame with its origin.

    Parameters
    ----------
    directory : str
        Folder that is searched (non-recursively) for ``*.csv`` files.
    condition : str
        Label written to the ``condition`` column of every loaded frame.

    Returns
    -------
    tuple[list[str], list[pandas.DataFrame]]
        The CSV paths in sorted order and, in the same order, the loaded
        frames. Each frame gains a ``condition`` column and a ``source_file``
        column holding the base name of the file it was read from.

    Raises
    ------
    FileNotFoundError
        If the folder contains no CSV file, so that a wrong working directory
        fails here instead of silently producing an empty dataset.
    """
    # glob returns paths in arbitrary order; sorting keeps positional access
    # such as `treated_dfs[0]` stable between runs and machines.
    paths = sorted(glob.glob(os.path.join(directory, "*.csv")))
    if not paths:
        raise FileNotFoundError(f"No CSV files found in {directory!r}")

    frames = []
    for path in paths:
        frame = pd.read_csv(path)
        frame["condition"] = condition
        frame["source_file"] = os.path.basename(path)
        frames.append(frame)
    return paths, frames


control_files, control_dfs = _load_condition_frames(control_dir, "control")
treated_files, treated_dfs = _load_condition_frames(treated_dir, "terbinafine")

# NOTE: a cell only renders its last expression, so the lifespan preview on the
# next line is evaluated but not shown; move it to its own cell to inspect it.
lifespan_df.head()
treated_dfs[0].head()

Unnamed: 0,GlobalFrame,Timestamp,Speed,Fragment,LocalFrame,X,Y,condition,source_file
0,1,,,fragment_0,0,437.288703,379.317992,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
1,2,2025-03-18T14:41:54.008410,,fragment_0,1,437.791111,379.808889,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
2,3,2025-03-18T14:41:56.000871,0.333696,fragment_0,2,437.957547,379.79717,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
3,4,2025-03-18T14:41:58.001852,0.337566,fragment_0,3,438.087379,379.68932,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
4,5,2025-03-18T14:42:00.001812,0.454892,fragment_0,4,438.313131,379.661616,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...


In [4]:
# Sanity check of trim_after_death on the first treated recording:
# print the frame's shape before and after trimming, one per line.
test_df = trim_after_death(treated_dfs[0], lifespan_df)
print(treated_dfs[0].shape, test_df.shape, sep="\n")

(75600, 9)
(75301, 9)


In [4]:

# Per-recording cleaning pipeline. The same four steps run on both conditions,
# and every stage keeps its own list so intermediate results stay inspectable.

# Step 1: "Fragment" and "LocalFrame" can be ignored (from Alices_explanation).
control_dfs_ = [raw.drop(["Fragment", "LocalFrame"], axis=1) for raw in control_dfs]
treated_dfs_ = [raw.drop(["Fragment", "LocalFrame"], axis=1) for raw in treated_dfs]

# Step 2: discard the very first row (useless, and it aligns the segments),
# then renumber the index from 0.
control_dfs_1 = [
    slim.iloc[1:].reset_index(drop=True)
    for slim in control_dfs_
]
treated_dfs_1 = [
    slim.iloc[1:].reset_index(drop=True)
    for slim in treated_dfs_
]

# Step 3: cap extreme speeds.
control_dfs_cap = list(map(cap_extreme_speeds, control_dfs_1))
treated_dfs_cap = list(map(cap_extreme_speeds, treated_dfs_1))

# Step 4: remove all data recorded after death.
control_dfs_death = [
    trim_after_death(capped, lifespan_df) for capped in control_dfs_cap
]
treated_dfs_death = [
    trim_after_death(capped, lifespan_df) for capped in treated_dfs_cap
]

# Shapes of the first treated recording: raw, after step 2, after step 4.
print(
    treated_dfs[0].shape,
    treated_dfs_1[0].shape,
    treated_dfs_death[0].shape,
    sep="\n",
)
treated_dfs_death[0].head()


(75600, 9)
(75599, 7)
(75300, 7)


Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file
0,2,2025-03-18T14:41:54.008410,,437.791111,379.808889,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
1,3,2025-03-18T14:41:56.000871,0.333696,437.957547,379.79717,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
2,4,2025-03-18T14:41:58.001852,0.337566,438.087379,379.68932,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
3,5,2025-03-18T14:42:00.001812,0.454892,438.313131,379.661616,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...
4,6,2025-03-18T14:42:02.001772,0.335773,438.19697,379.782828,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...


In [6]:
# Segmentation smoke test: split the first trimmed treated recording into
# segments and look at the last rows of its first segment.

test_segs = split_into_segments(treated_dfs_death[0])
test_segs[0].tail()


Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file,Segment,Segment_index
895,897,2025-03-18T15:11:44.001859,0.0,422.549669,380.635762,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0
896,898,2025-03-18T15:11:46.001905,0.413905,422.756579,380.631579,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0
897,899,2025-03-18T15:11:48.001797,0.0,422.756579,380.631579,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0
898,900,2025-03-18T15:11:50.001756,0.250432,422.870968,380.580645,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0
899,901,2025-03-18T15:11:52.001948,0.411985,422.687075,380.673469,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0


In [5]:
# Split every trimmed recording into its list of segments
# (result: one list of segment frames per worm, per condition).
segments_ctrl = list(map(split_into_segments, control_dfs_death))
segments_treat = list(map(split_into_segments, treated_dfs_death))


In [6]:
# Clean segments: the identical per-segment pipeline is applied to both conditions.
def _clean_worm_segments(worm_segments):
    """Apply the per-segment cleaning steps to all segments of one worm.

    Parameters
    ----------
    worm_segments : list[pandas.DataFrame]
        The segments of a single worm, as produced by ``split_into_segments``.

    Returns
    -------
    list[pandas.DataFrame]
        One cleaned frame per input segment, in the same order.
    """
    cleaned_segments = []
    for segment in worm_segments:
        cleaned = clean_segment_gaps(segment)
        cleaned = calculate_turning_angle(cleaned)
        # Remove the "Segment" column (no need for two segment columns;
        # "Segment_index" is kept).
        cleaned = cleaned.drop(columns=["Segment"])
        cleaned = fill_nans_with_next_value(cleaned)
        cleaned_segments.append(cleaned)
    return cleaned_segments


cleaned_segments_ctrl = [_clean_worm_segments(worm) for worm in segments_ctrl]
cleaned_segments_treat = [_clean_worm_segments(worm) for worm in segments_treat]


  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()
  cleaned = cleaned.bfill().ffill()


In [8]:
def count_nans_in_df(df, cols=["X", "Y", "Speed", "turning_angle"]):
    """Count the missing values of a frame, restricted to selected columns.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Frame to inspect; ``None`` is treated as containing no NaNs.
    cols : list of str
        Columns to look at. Names that are absent from ``df`` are skipped
        instead of raising a ``KeyError``.

    Returns
    -------
    int
        Total number of NaN cells across the selected columns that exist.
    """
    if df is None:
        return 0

    # Keep only the requested columns the frame actually has.
    tracked = []
    for name in cols:
        if name in df.columns:
            tracked.append(name)

    nans_per_column = df.loc[:, tracked].isna().sum()
    return nans_per_column.sum()


# Add up the remaining NaNs over every segment of every worm, per condition.
total_nans_ctrl = sum(
    count_nans_in_df(segment)
    for worm_segments in cleaned_segments_ctrl
    for segment in worm_segments
)
print("Total NaNs in cleaned control segments:", total_nans_ctrl)

total_nans_treat = sum(
    count_nans_in_df(segment)
    for worm_segments in cleaned_segments_treat
    for segment in worm_segments
)
print("Total NaNs in cleaned treated segments:", total_nans_treat)


Total NaNs in cleaned control segments: 0
Total NaNs in cleaned treated segments: 0


In [9]:
# Verification: show the last rows of the first cleaned segment of the first
# treated worm (it should carry "turning_angle" and no longer have "Segment").
cleaned_segments_treat[0][0].tail()

Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file,Segment_index,turning_angle
895,897,2025-03-18T15:11:44.001859,0.0,422.549669,380.635762,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,-1.158064
896,898,2025-03-18T15:11:46.001905,0.413905,422.756579,380.631579,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,1.158064
897,899,2025-03-18T15:11:48.001797,0.0,422.756579,380.631579,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,-24.001928
898,900,2025-03-18T15:11:50.001756,0.250432,422.870968,380.580645,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,177.218505
899,901,2025-03-18T15:11:52.001948,0.411985,422.687075,380.673469,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0,0.0


The segments should now be clean (in theory). Next, all segments of each worm are concatenated again, and the data are normalized with respect to that worm's entire trajectory. The full trajectories can then be saved; after that, they are split back into segments and each segment is saved as well.

In [10]:
# For each worm: stitch its cleaned segments back into one trajectory,
# normalize over that whole trajectory, then attach the worm_id.
control_dfs_final = [
    rename_worm_id(
        normalize_trajectory_data(pd.concat(worm_segments, ignore_index=True))
    )
    for worm_segments in cleaned_segments_ctrl
]

treat_dfs_final = [
    rename_worm_id(
        normalize_trajectory_data(pd.concat(worm_segments, ignore_index=True))
    )
    for worm_segments in cleaned_segments_treat
]

In [10]:
# Preview the first treated worm after normalization and worm_id assignment.
treat_dfs_final[0].head()

Unnamed: 0,GlobalFrame,Timestamp,Speed,X,Y,condition,source_file,Segment_index,turning_angle,worm_id
0,2.0,2025-03-18T14:41:54.008410,-0.54573,0.584501,0.507088,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0.0,0.0,20250318_piworm09_3
1,3.0,2025-03-18T14:41:56.000871,-0.54573,0.584723,0.507072,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0.0,-0.198269,20250318_piworm09_3
2,4.0,2025-03-18T14:41:58.001852,-0.543994,0.584896,0.506928,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0.0,0.181776,20250318_piworm09_3
3,5.0,2025-03-18T14:42:00.001812,-0.491361,0.585198,0.506891,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0.0,0.782097,20250318_piworm09_3
4,6.0,2025-03-18T14:42:02.001772,-0.544798,0.585043,0.507053,terbinafine,coordinates_highestspeed_20250318_9_3_with_tim...,0.0,-0.598984,20250318_piworm09_3


In [11]:
# Save the full trajectories: one "<source name>-preprocessed.csv" per worm.

output_dir = "preprocessed_data/full"
os.makedirs(output_dir, exist_ok=True)

# Controls are written first, then the treated worms, all into the same folder.
for worm_df in [*control_dfs_final, *treat_dfs_final]:
    source_name = worm_df["source_file"].iloc[0]
    stem = os.path.splitext(source_name)[0]
    save_worm_csv(worm_df, f"{stem}-preprocessed.csv", output_dir)


In [12]:
# Save all segments (the normalized full trajectories have to be re-split first).

segments_dir = "preprocessed_data/segments"
os.makedirs(segments_dir, exist_ok=True)

segments_ctrl_final = [split_into_segments(df) for df in control_dfs_final]
segments_treat_final = [split_into_segments(df) for df in treat_dfs_final]


def _save_worm_segments(worms, out_dir):
    """Write every non-empty segment of every worm to its own CSV file.

    Parameters
    ----------
    worms : list[list[pandas.DataFrame]]
        One list of segment frames per worm. Each non-empty segment must have
        a ``source_file`` and a ``Segment_index`` column.
    out_dir : str
        Folder handed to ``save_worm_csv`` as the destination.

    Files are named ``<source stem>-fragment<index>-preprocessed.csv``.
    """
    for worm_segments in worms:
        for segment in worm_segments:
            # Skip empty segments: they have no first row to name the file from.
            if segment.empty:
                continue

            source_name = segment["source_file"].iloc[0]
            stem = os.path.splitext(source_name)[0]
            # The normalized frames show Segment_index as a float (0.0); casting
            # keeps the file name at "fragment0" instead of "fragment0.0"
            # whatever dtype the column ends up with after re-splitting.
            index = int(segment["Segment_index"].iloc[0])
            save_worm_csv(segment, f"{stem}-fragment{index}-preprocessed.csv", out_dir)


_save_worm_segments(segments_ctrl_final, segments_dir)
_save_worm_segments(segments_treat_final, segments_dir)