# Process true trajectories

Aggregate and process the features of all true trajectories, plot a few of them, and save them all.

---

# Imports

In [None]:
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

In [None]:
plt.style.use("ggplot")
sns.set_context("paper")

# Concat individual video data

In [None]:
data_path = Path("analyses", "biotine_full")

In [None]:
# Collect every CSV found recursively below data_path, except files whose stem is
# "Experiment" or starts with "features_through_time" (the prefix of the CSV this
# notebook saves at the end, so a previous run's output is never read back in).
# The paths are sorted because rglob yields them in a filesystem-dependent order;
# sorting keeps the row order of the concatenated dataframe reproducible.
all_csv_files = sorted(
    f
    for f in data_path.rglob("*.csv")
    if f.stem != "Experiment" and not f.stem.startswith("features_through_time")
)
print(f"Found {len(all_csv_files)} csv files")

##  Add identifiers to know from which csv file the data comes from

In [None]:
all_smol_dfs = []


def process_file(f: Path):
    """Load one CSV file and tag each of its rows with the file's stem.

    Parameters
    ----------
    f : Path
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The content of the CSV with an extra "file" column holding ``f.stem``.
    """
    return pd.read_csv(f).assign(file=f.stem)


# Read all CSV files through a thread pool; executor.map yields the dataframes in
# the same order as all_csv_files, and tqdm only wraps that iterator for progress.
with ThreadPoolExecutor() as executor:
    lazily_loaded = executor.map(process_file, all_csv_files)
    results = list(tqdm(lazily_loaded, total=len(all_csv_files)))

all_smol_dfs += results

## Concat data

In [None]:
df = pd.concat(all_smol_dfs, ignore_index=True)
df

## Add the video ID & time to all rows

(for now, it is only present in the "Image" CSV)

### Add the video ID

In [None]:
df["Metadata_VideoID"].value_counts(dropna=False)

In [None]:
# Rows lacking a video ID get it from the image file name:
# the ID is everything that precedes "_time_".
mask = df["Metadata_VideoID"].isna()
video_id_from_filename = df["FileName_images"].str.split("_time_").str[0]
df.loc[mask, "Metadata_VideoID"] = video_id_from_filename[mask]

In [None]:
df["Metadata_VideoID"].value_counts(dropna=False)

### Add the time

In [None]:
df["Metadata_time"].value_counts(dropna=False)

In [None]:
# Rows lacking a time get it from the image file name: the time is the text
# between "_time_" and the next dot, cast to the dtype the column already has.
mask = df["Metadata_time"].isna()
time_dtype = df["Metadata_time"].dtype
time_from_filename = df["FileName_images"].str.extract(r"_time_(.*?)\.")[0]
df.loc[mask, "Metadata_time"] = time_from_filename.astype(time_dtype)

In [None]:
df["Metadata_time"].value_counts(dropna=False)

# Basic analysis & statistics

In [None]:
list(df.columns)

In [None]:
# Expected number of distinct images: 119 full videos of 19 frames each,
# plus one video where 7 frames were filtered out (19 - 7 = 12 frames left).
nb_imgs = df["ImageNumber"].nunique(dropna=False)
theory = 19 * 119 + 12
assert nb_imgs == theory, f"{nb_imgs} != {theory}"

In [None]:
df["file"].value_counts(dropna=False)

## Lifetime of objects

In [None]:
df["TrackObjects_Lifetime_10"].value_counts(dropna=False).sort_index()

In [None]:
df["TrackObjects_FinalAge_10"].value_counts(dropna=False).sort_index()

# Select objects with full lifetime 

(i.e. final age 19)

## First assign a global object ID

### ObjectNumber analysis 

(not an actual object identifier, as it is not consistent through time)

In [None]:
assert not df["Metadata_VideoID"].isna().any()
print(df["ObjectNumber"].isna().sum())  # these are image files, it's ok
df[df["ObjectNumber"].isna()]["file"]

In [None]:
# Candidate per-video identifier: "<video ID>-<ObjectNumber>".
# NOTE(review): both parts go through astype(str) before the NaN check below, so
# rows with a missing ObjectNumber may not be caught by the assertion - confirm.
video_id_part = df["Metadata_VideoID"].astype(str)
object_number_part = df["ObjectNumber"].astype(str)
df["local_object_id"] = video_id_part + "-" + object_number_part
assert df["local_object_id"].notna().all()
df["local_object_id"].value_counts(dropna=False)

In [None]:
# Horizontal bar chart of how many local_object_id values (bar length) appear in
# a given number of rows of the dataset (tick label), sorted by bar length.
plt.figure(figsize=(10, 10))
# x: distinct row counts per ID; y: number of IDs having that row count
x, y = np.unique(df["local_object_id"].value_counts(dropna=False).values, return_counts=True)
sorted_indices = np.argsort(y)
y = y[sorted_indices]
x = x[sorted_indices]
plt.yticks(ticks=np.arange(len(x)), labels=x)
# Shade the bars whose row count is a multiple of 3. Their positions are taken
# straight from x, so a multiple of 3 that is absent from the data is simply not
# shaded (looking each expected value up by equality raised an IndexError then).
for actual_y in np.flatnonzero(x % 3 == 0):
    plt.axhspan(actual_y - 1 / 2 + 5e-2, actual_y + 1 / 2 - 5e-2, color="blue", alpha=0.3)
bars = plt.barh(np.arange(len(x)), y)
# Write the value of each bar at its tip
for bar in bars:
    plt.text(
        bar.get_width(),
        bar.get_y() + bar.get_height() / 2 - 0.1,
        bar.get_width(),
        va="center",
    )
plt.xlabel("Number of objects")
plt.ylabel("Number of rows in the dataset")
plt.tight_layout()
# plt.xscale("log")
plt.show()

### TrackObjects_Label_10 analysis

In [None]:
# Build "<video ID>-<TrackObjects label>" identifiers, only for the rows that
# carry a TrackObjects label; the other rows keep a missing global_object_id.
maks_trackobject_label_exists = df["TrackObjects_Label_10"].notna()
labelled_rows = df.loc[
    maks_trackobject_label_exists, ["Metadata_VideoID", "TrackObjects_Label_10"]
].astype(str)
df.loc[maks_trackobject_label_exists, "global_object_id"] = (
    labelled_rows["Metadata_VideoID"] + "-" + labelled_rows["TrackObjects_Label_10"]
)
df["global_object_id"].value_counts(dropna=False)

In [None]:
# Horizontal bar chart of how many global_object_id values (bar length) appear in
# a given number of rows of the dataset (tick label), sorted by bar length.
plt.figure(figsize=(10, 10))
rows_per_id = df["global_object_id"].value_counts(dropna=False).to_numpy()
x, y = np.unique(rows_per_id, return_counts=True)
sorted_indices = np.argsort(y)
x, y = x[sorted_indices], y[sorted_indices]
bar_positions = np.arange(len(x))
plt.yticks(ticks=bar_positions, labels=x)
bars = plt.barh(bar_positions, y)
# Write the value of each bar at its tip
for bar in bars:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.text(bar_length, bar_middle - 0.1, bar_length, va="center")
plt.xlabel("Number of objects")
plt.ylabel("Number of rows in the dataset")
plt.tight_layout()
# plt.xscale("log")
plt.show()

`TrackObjects_Label_10` is the correct object ID

In [None]:
df_by_object_id_final_ages = df.groupby("global_object_id")["TrackObjects_FinalAge_10"].unique()
df_by_object_id_final_ages

In [None]:
df_by_object_id_parent_track_object = df.groupby("global_object_id")[
    "TrackObjects_ParentObjectNumber_10"
].unique()
df_by_object_id_parent_track_object

## Then filter on objects with full lifetime

Note that objects with full lifetime might still split / merge somehow, see CellProfiler TrackObject doc.

In [None]:
# Rows reporting a final age of 19 identify the objects with a full lifetime;
# collect the distinct global IDs found on those rows.
mask_objects_with_full_lifetime = df["TrackObjects_FinalAge_10"].eq(19)
objects_with_full_lifetime_ids = df["global_object_id"][mask_objects_with_full_lifetime].unique()
print(len(objects_with_full_lifetime_ids))
objects_with_full_lifetime_ids  # these are all objects IDs with full lifetime (19)

In [None]:
df_full_lifetimes = df[df["global_object_id"].isin(objects_with_full_lifetime_ids)]
df_full_lifetimes

In [None]:
df_full_lifetimes["TrackObjects_FinalAge_10"].value_counts(dropna=False)

check consistency of filtered dataframe:

In [None]:
df_full_lifetimes[["file", "TrackObjects_FinalAge_10"]].value_counts(dropna=False)

We only have the whole_cell files in there, as these are the ones we tracked!

# Plot a feature of a cell

## Select video

In [None]:
video_id = "M_13_fld_2"

In [None]:
df_this_vid = df_full_lifetimes.query(f"Metadata_VideoID == '{video_id}'")
df_this_vid

## Select object

In [None]:
object_number = "274.0"

In [None]:
df_this_vid_this_object = df_this_vid.query(f"TrackObjects_Label_10 == {object_number}")
df_this_vid_this_object

See where the object is in the video

In [None]:
df_this_vid_this_object[
    ["global_object_id", "Metadata_time", "AreaShape_Center_X", "AreaShape_Center_Y"]
]

## Select feature

In [None]:
list(df_this_vid_this_object.columns)

In [None]:
feature = "AreaShape_Area"

In [None]:
plt.figure(dpi=200, figsize=(10, 5))

# The index is reset so the curve is drawn against row positions 0..n-1, which
# are the tick positions set below (shown as 1..n).
# NOTE(review): this assumes the rows are already in chronological order -
# confirm, or sort by Metadata_time beforehand.
# The line carries a label so that plt.legend has an artist to display
# (without one, the legend is empty and matplotlib emits a warning).
plt.plot(df_this_vid_this_object[feature].reset_index(drop=True), label=feature)
plt.xticks(
    ticks=range(len(df_this_vid_this_object)),
    labels=range(1, len(df_this_vid_this_object) + 1),
)
plt.title(f"{feature} over time of whole cell {int(float(object_number))} in video {video_id}")
plt.ylabel(f"{feature}")
plt.xlabel("Time")
plt.legend(loc="upper right")
plt.show()

# Save timeseries of all features for all tracked objects

In [None]:
features_to_save = [feat for feat in df.columns if feat.startswith("AreaShape")]
print(f"Saving {len(features_to_save)} features")
features_to_save

In [None]:
list_dfs_to_concat: "list[pd.DataFrame]" = []


def process_object(object_id: str, df_this_object: "pd.DataFrame | None" = None):
    """Build the feature time series of one tracked object.

    Parameters
    ----------
    object_id : str
        Value of "global_object_id" identifying the object.
    df_this_object : pd.DataFrame, optional
        Rows of ``df_full_lifetimes`` belonging to this object. When omitted,
        they are selected from ``df_full_lifetimes`` using ``object_id``.

    Returns
    -------
    pd.DataFrame or None
        One row per time from 1 to 19, holding the ``features_to_save`` columns
        plus "global_object_id" and "time". None when the object has more than
        one row at some time (it is not a "simple" object).

    Raises
    ------
    RuntimeError
        If a time without any row is reached before a time with several rows.
    """
    if df_this_object is None:
        df_this_object = df_full_lifetimes[df_full_lifetimes["global_object_id"] == object_id]
    features_this_object = []
    # run through all times
    for time in range(1, 20):
        df_this_time = df_this_object[df_this_object["Metadata_time"] == time]
        if len(df_this_time) > 1:
            # several "sub-objects" at this time: skip the whole object
            return None
        if len(df_this_time) == 0:
            raise RuntimeError(
                f"Object {object_id} does not exist at time {time}; should have been filtered beforehand"
            )
        features_this_object_this_time = df_this_time[features_to_save].copy()
        features_this_object_this_time["global_object_id"] = object_id
        features_this_object_this_time["time"] = time
        features_this_object.append(features_this_object_this_time)
    # the object is "simple": exactly one "sub-object" per time
    return pd.concat(features_this_object, ignore_index=True)


# A single groupby pass hands each object its own rows, instead of rescanning the
# whole dataframe once per object. It runs in the notebook process (a process
# pool cannot import a notebook-defined function under the "spawn" start method)
# and sort=False keeps the objects in order of first appearance, so the row order
# of the result is reproducible.
objects_grouped = df_full_lifetimes.groupby("global_object_id", sort=False)
for object_id, df_this_object in tqdm(
    objects_grouped, total=objects_grouped.ngroups, desc="Processing objects"
):
    result = process_object(object_id, df_this_object)
    if result is not None:
        list_dfs_to_concat.append(result)


features_through_time = pd.concat(list_dfs_to_concat, ignore_index=True)
features_through_time

## Count the "simple" objects that were kept

In [None]:
features_through_time["global_object_id"].nunique(dropna=False)

versus the total number of objects:

In [None]:
df_full_lifetimes["global_object_id"].nunique(dropna=False)

## Save

In [None]:
features_through_time.to_csv(
    data_path / "features_through_time_of_full_lifetime_simple_objects.csv", index=False
)