In [12]:
import glob
import pandas as pd
import re
import numpy as np
import os
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# What does this notebook do?

- The first part of the notebook pools multiple pairwise distance datasets from the same condition into a single table.

- In the second part, we apply filters to the tracks to keep only their good parts: noisy 3D data points are removed and tracks that are too short are discarded.



# Pooling datasets

In [13]:
# Root directory that contains one sub-directory per acquisition.
basedir = "/tungstenfs/scratch/ggiorget/zhan/2021/1105_pia_image_analysis/3d_prediction/two_colours_lines/"

# Active selection: the acquisitions to pool and the base name of the output file.
list_dir = [
    "20210921_two_colour_degron",
    "20210923_two_colour_degron",
    "20210927_two_colour_degron",
    "20210930_two_colour_degron",
    "20211118_two_colour_degron_10s",
    "20211119_two_colour_degron_10s",
    "20211121_two_colour_degron_10s",
    "20211122_two_colour_degron_10s",
    "20211125_two_colour_degron_10s",
]
outname = "two_colors_distance_10s.csv"

# Alternative selections. To pool one of them instead, replace the active
# `list_dir` / `outname` pair above with the corresponding pair below.
#
# list_dir = ["time_acquisition/20211015_two_colour_degron_1s_2d/"]
# outname = "two_colors_distance_2d_1s.csv"
#
# list_dir = ["time_acquisition/20211019_two_colour_degron_3_30s/"]
# outname = "two_colors_distance_30s.csv"
#
# list_dir = ["small_fluctuation_analysis/"]
# outname = "pairwise_twocolor_control.csv"
#
# list_dir = ["time_acquisition/20211117_two_colour_degron_60s/"]
# outname = "two_colors_distance_60s.csv"

# Turn the acquisition names into paths under `basedir`.
list_dir = [f"{basedir}/{x}" for x in list_dir]

# Collect every CSV found in each acquisition's `tracks_pairwise_with_cellid`
# folder. glob returns matches in arbitrary (filesystem-dependent) order, so the
# matches of each directory are sorted to make the pooled table reproducible.
list_files = [
    path
    for directory in list_dir
    for path in sorted(glob.glob(f"{directory}/tracks_pairwise_with_cellid/*csv"))
]

In [14]:
# Fail early with an explicit message: without any input file the pooled table
# would have no `filename` column and the parsing below would raise a KeyError.
if not list_files:
    raise FileNotFoundError(
        "No CSV file found in the tracks_pairwise_with_cellid folders listed in list_dir"
    )

# Read every file once and pool them with a single concat (growing a frame
# inside a loop copies all previous rows at every iteration). The per-file row
# index carries no information, so a fresh unique index is built.
df = pd.concat([pd.read_csv(file) for file in list_files], ignore_index=True)

# Parse the experiment metadata out of the `filename` column: the five capture
# groups of the pattern fill the five new columns, in order. Rows whose
# filename does not match the pattern get NaN in all five columns.
df[["date", "cell_line", "induction_time", "rep", "motion_correction_type"]] = df[
    "filename"
].str.extract(
    r"(20[0-9]*)_[\w\W_]*?([^_]*)_([^_]*)_[\d]*?[perc_]*?([0-9])_[\w\W]*?_([\w]*)\.csv",
    expand=True,
)

# Date stamp (yymmdd) used later as a prefix of the output file name.
today = datetime.datetime.now().strftime("%y%m%d")

# A condition is the combination of cell line and induction time.
df["condition"] = [
    f"{cl}_{time}" for cl, time in zip(df["cell_line"], df["induction_time"])
]

In [15]:
# Summary of the pooled data before any filtering: number of distinct tracks,
# mean number of rows per track and mean of the `distance` column.
points_per_track = df["uniqueid"].value_counts()
ntracks = df["uniqueid"].nunique()
av_length = points_per_track.to_numpy().mean()
av_distance = df["distance"].to_numpy().mean()

print(f"ntracks: {ntracks}")
print(f"av_length: {av_length}")
print(f"av_distance: {av_distance}")

ntracks: 1152
av_length: 112.59895833333333
av_distance: 0.4021123462835793


# Apply filters to tracks 

In [16]:
# Length cut-off passed to filter_data below: a track is kept only when it has
# strictly more than this many rows.
min_trackpoints = 25


def filter_data(df: pd.DataFrame, min_points: int) -> pd.DataFrame:
    """Discard the tracks that are too short.

    Rows are grouped by ``uniqueid`` and a track is kept only when it has
    strictly more than ``min_points`` rows.

    Args:
        df: table with one row per track point and a ``uniqueid`` column.
        min_points: length cut-off; tracks with ``min_points`` rows or fewer
            are dropped.

    Returns:
        The rows of the kept tracks, track after track in sorted ``uniqueid``
        order, with their original index labels. When no track passes the
        cut-off, an empty frame with the same columns as ``df`` is returned so
        that downstream column access keeps working.
    """
    # Gather the surviving tracks first and concatenate once; concatenating
    # inside the loop would re-copy every previously kept row each time.
    kept = [track for _, track in df.groupby("uniqueid") if len(track) > min_points]
    if not kept:
        return df.iloc[0:0].copy()
    return pd.concat(kept)


# Filter noisy 3D data points
#
# For every row, the stored `distance` is compared with the Euclidean norm of
# the (x, y) columns through the log2 of their ratio. Rows whose log-ratio is
# at or above the 95th percentile of all rows are discarded.
# NOTE(review): np.quantile returns NaN if any log-ratio is NaN, in which case
# every row would be dropped; confirm the inputs cannot contain NaN.
xy = df[["x", "y"]].values
norm_xy = np.sqrt(np.square(xy).sum(axis=1))
log_ratio = np.log2(df["distance"] / norm_xy)

threshold = np.quantile(log_ratio, 0.95)
df = df[log_ratio < threshold]

# Removing points shortens tracks, so the length filter is applied afterwards.
df = filter_data(df, min_trackpoints)

In [17]:
# Same summary as before, recomputed on the filtered table: number of distinct
# tracks, mean number of rows per track and mean of the `distance` column.
points_per_track = df["uniqueid"].value_counts()
ntracks = df["uniqueid"].nunique()
av_length = points_per_track.to_numpy().mean()
av_distance = df["distance"].to_numpy().mean()

print(f"ntracks: {ntracks}")
print(f"av_length: {av_length}")
print(f"av_distance: {av_distance}")

ntracks: 1010
av_length: 119.62079207920792
av_distance: 0.38551074946019626


In [18]:
# Count the distinct track ids found in each condition.
print("Number of unique tracks per condition")
ids_by_condition = df.groupby("condition")["uniqueid"]
ids_by_condition.nunique()

Number of unique tracks per condition


condition
1A2_0min        96
1A2_120min      88
1B1_0min       145
1B1_120min     111
1D12_0min       76
1D12_120min     87
1F4_0min        92
1F4_120min      88
2C11_0min      121
2C11_120min    106
Name: uniqueid, dtype: int64

In [19]:
# Mean of the `distance` column within each condition (the grouping key is
# `condition`, i.e. cell line plus induction time, not the cell line alone).
print("Average distance per line")
distance_by_condition = df.groupby("condition")["distance"]
distance_by_condition.mean()

Average distance per line


condition
1A2_0min       0.414913
1A2_120min     0.522210
1B1_0min       0.260385
1B1_120min     0.363328
1D12_0min      0.410088
1D12_120min    0.515105
1F4_0min       0.412434
1F4_120min     0.492662
2C11_0min      0.229616
2C11_120min    0.384764
Name: distance, dtype: float64

In [11]:
# Write the filtered table as a zip-compressed CSV in `basedir`, prefixing the
# output name with the date stamp; the row index is not saved.
output_path = f"{basedir}/{today}_{outname}.zip"
df.to_csv(output_path, index=False, compression="zip")