In [1]:
import glob
import pandas as pd
import re
import numpy as np
import os
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import sys

# What does this notebook do?

- The first part of the notebook provides the functions to pool multiple pairwise distance datasets from the same condition. 

- In the second part, we apply filters to the tracks so that only the reliable parts of each track are kept.

- The last part provides the functions to calculate the contact duration and the second passage time.


# Pooling datasets

In [2]:
# Root folder containing the dataset directories listed in `list_dir`.
basedir = "/tungstenfs/scratch/ggiorget/zhan/2021/1105_pia_image_analysis/3d_prediction/two_colours_lines/"

# Alternative datasets: swap one of these pairs in for the active
# `list_dir` / `outname` below to pool a different acquisition.
# list_dir = ["time_acquisition/20211015_two_colour_degron_1s_2d/"]
# outname = "two_colors_distance_2d_1s.csv"

# list_dir = ["time_acquisition/20211019_two_colour_degron_3_30s/"]
# outname = "two_colors_distance_30s.csv"

list_dir = [
    "20210921_two_colour_degron",
    "20210923_two_colour_degron",
    "20210927_two_colour_degron",
    "20210930_two_colour_degron",
]
# Suffix used for every file written by this notebook.
outname = "two_colors_distance.csv"

list_dir = [f"{basedir}/{x}" for x in list_dir]

# Collect every pairwise-distance table of every dataset into one flat list.
# glob returns matches in arbitrary (filesystem-dependent) order, so each
# directory's matches are sorted to keep the pooled row order reproducible.
list_files = []
for directory in list_dir:
    list_files.extend(
        sorted(glob.glob(f"{directory}/tracks_pairwise_with_cellid/*csv"))
    )

In [3]:
# Fail early with an actionable message: without any input file the pooled
# frame would have no "filename" column and the metadata parsing below would
# only surface as an opaque KeyError.
if not list_files:
    raise FileNotFoundError(
        "No pairwise-distance CSV files were found; check `basedir` and `list_dir`."
    )

# Pool all tables with a single concat. Concatenating inside the loop would
# re-copy the accumulated frame on every iteration. The per-file index is
# kept as-is, exactly like the incremental version did.
df = pd.concat([pd.read_csv(file) for file in list_files])

# Parse the metadata encoded in the "filename" column. The regex has five
# capture groups, assigned in order to the five columns on the left; rows
# whose filename does not match get NaN in all of them.
df[["date", "cell_line", "induction_time", "rep", "motion_correction_type"]] = df[
    "filename"
].str.extract(
    r"(20[0-9]*)_[\w\W_]*?([^_]*)_([^_]*)_[\d]*?[perc_]*?([0-9])_[\w\W]*?_([\w]*)\.csv",
    expand=True,
)

# Date stamp (YYMMDD) used as a prefix for the output file names.
today = datetime.datetime.now().strftime("%y%m%d")

# Experimental condition label: "<cell_line>_<induction_time>".
df["condition"] = [
    f"{cl}_{time}" for cl, time in zip(df["cell_line"], df["induction_time"])
]

In [4]:
# Summary of the pooled data before any filtering: number of tracks, mean
# number of rows per track, and mean value of the "distance" column.
track_lengths = df["uniqueid"].value_counts()
ntracks = df["uniqueid"].nunique()
av_length = np.mean(track_lengths.values)
av_distance = np.mean(df["distance"].values)
for label, value in (
    ("ntracks", ntracks),
    ("av_length", av_length),
    ("av_distance", av_distance),
):
    print(f"{label}: {value}")

ntracks: 560
av_length: 108.55
av_distance: 0.3143500327442566


# Apply filters to tracks 

In [5]:
# Tracks need strictly more than this many rows to survive `filter_data`.
min_trackpoints = 25


def filter_data(df: pd.DataFrame, min_points: int) -> pd.DataFrame:
    """Keep only the tracks that have more than ``min_points`` rows.

    Tracks are identified by the "uniqueid" column. The comparison is strict:
    a track with exactly ``min_points`` rows is discarded.

    Args:
        df: Table with one row per track point and a "uniqueid" column.
        min_points: Exclusive lower bound on the number of rows per track.

    Returns:
        A new DataFrame with the rows of the surviving tracks, grouped by
        "uniqueid" in sorted key order; the original row index is preserved.
        If no track survives, an empty frame with the same columns as ``df``
        is returned so downstream column lookups keep working.
    """
    # Gather the surviving groups first and concatenate once; concatenating
    # inside the loop re-copies the accumulated frame for every kept track.
    kept = [sub for _, sub in df.groupby("uniqueid") if len(sub) > min_points]
    if not kept:
        return df.iloc[0:0].copy()
    return pd.concat(kept)


# Filter noisy 3D data points

# Per-row Euclidean norm of the (x, y) columns
# (presumably the in-plane, 2D separation of the two spots; confirm).
xy_norm = np.sqrt(np.sum(np.square(df[["x", "y"]].values), axis=1))
# log2 ratio between the "distance" column and that (x, y) norm.
log_ratio = np.log2(df["distance"] / xy_norm)
# Drop the rows at or above the 95th percentile of the ratio. nanquantile is
# used because a single NaN ratio (missing coordinate, 0/0) would make the
# plain quantile NaN, and `log_ratio < NaN` would then discard every row.
# Rows whose ratio is NaN are still removed by the comparison below.
threshold = np.nanquantile(log_ratio, 0.95)
# NOTE: `df` is overwritten, so re-running this cell without re-running the
# pooling cells removes a further 5% of the rows each time.
df = df[log_ratio < threshold]

# Discard tracks that became too short after removing the noisy points.
df = filter_data(df, min_trackpoints)

In [6]:
# Same summary statistics, recomputed on the filtered tracks.
points_per_track = df["uniqueid"].value_counts().values
ntracks = df["uniqueid"].nunique()
av_length = np.mean(points_per_track)
av_distance = np.mean(df["distance"].values)
print(
    f"ntracks: {ntracks}",
    f"av_length: {av_length}",
    f"av_distance: {av_distance}",
    sep="\n",
)

ntracks: 481
av_length: 117.3035343035343
av_distance: 0.29731840481692723


In [7]:
# Write the filtered, pooled tracks as a zip-compressed CSV under `basedir`.
pooled_out_path = f"{basedir}/{today}_{outname}.zip"
df.to_csv(pooled_out_path, compression="zip", index=False)

# Calculate the contact duration and second passage time

In [8]:
# NOTE(review): star import kept so that any other `utils` names used
# elsewhere in the notebook stay available; the only name this cell needs is
# `contact_duration_second_passage_time_different_gaps`.
from utils import *

# Settings passed through to the `utils` helper.
timeresolution = 1
max_ngap = 15

# Contact cutoffs 0.10, 0.15, ..., 0.45. They are derived from an integer
# range because a float step in np.arange accumulates rounding error
# (e.g. 0.30000000000000004), which would be written verbatim into the
# "contact_cutoff" column of the output tables.
contact_cutoffs = np.arange(10, 50, 5) / 100

duration_frames = []
second_passage_frames = []

for contact_cutoff in contact_cutoffs:
    duration, second_passage_time = contact_duration_second_passage_time_different_gaps(
        df=df, resolution=timeresolution, contact_cutoff=contact_cutoff, max_ngap=max_ngap
    )
    # Tag each result with the cutoff it was computed for.
    duration["contact_cutoff"] = contact_cutoff
    second_passage_time["contact_cutoff"] = contact_cutoff
    duration_frames.append(duration)
    second_passage_frames.append(second_passage_time)

# One concat per table instead of growing the frames inside the loop.
durations = pd.concat(duration_frames)
second_passage_times = pd.concat(second_passage_frames)

# The helper's tables carry an "index" column that is not wanted in the output.
second_passage_times = second_passage_times.drop("index", axis=1)
durations = durations.drop("index", axis=1)

second_passage_times.to_csv(
    f"{basedir}/{today}_second_passage_time_{outname}.zip", index=False, compression="zip"
)
durations.to_csv(f"{basedir}/{today}_duration_{outname}.zip", index=False, compression="zip")