# Hidden markov model (HMM)

The goal is to use an HMM to find looped and unlooped states in the data generated by Pavel's model.

We will use a Gaussian model for the emissions. 
Given all the data, we fit all the parameters (Gaussian parameters, transition probabilities between looped/unlooped).

Once we have the model, we use the model to predict the states.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from hmmlearn import hmm
from matplotlib.backends.backend_pdf import PdfPages
from utils import *
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import glob
import pathlib
import gdown
import os
import zipfile
import matplotlib
import matplotlib.colors as mcolors
matplotlib.rcParams['pdf.fonttype'] = 42

In [None]:
# Global switches and constants used by the cells below.

# Seed NumPy's global random generator so repeated runs give the same numbers.
reproducible = True
if reproducible:
    np.random.seed(42)

# Factor that the simulated distances are multiplied by (1 a.u. = 0.08 um).
distance_conversion = 0.012 * np.sqrt(40)
rolling = 1

# When enabled, only the first `number_subsample` rows of each loaded file are kept.
subsample = False
if subsample:
    number_subsample = 200000


# Reading simulated data

Extruder speed: <br>
17500 -- 1 kb/s <br>
175000 -- 0.1 kb/s <br>

Loading rate: <br>
arbitrary units multiply by 6 to convert to (1/Mb*min) <br>

Unloading rate: <br>
0.1 -- 5.5 min <br>
0.01 -- 55 min <br>
etc. <br>

In [None]:
dataset_name = "langevin_release_on-off_220125"
basedir = f"./{dataset_name}/"
# create the folder that holds the archive and the extracted files
pathlib.Path(basedir).mkdir(parents=True, exist_ok=True)
# polymer simulations, single loop dataset
sample_link = "https://drive.google.com/uc?export=download&id=1bubsOltpWp8HbkIi4kLcKP5q1HXa0JQ4"
zip_path = basedir + f"{dataset_name}.zip"
# download data if not already present
if not os.path.isfile(zip_path):
    gdown.download(sample_link, zip_path)

# unzip the data (done on every run, also when the archive was already present)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(basedir)
list_files = glob.glob(f"{basedir}/*/*/*/*/*/dst_bnd.dat")

filenames = pd.DataFrame(list_files, columns=["filename"])
# The five directory levels below the dataset folder are read as
# ctcf / speed / loading / unloading / rep.  The folder name in the pattern
# comes from `dataset_name` (instead of being typed out a second time), so the
# pattern stays in sync when the dataset is changed above.
filenames[["ctcf", "speed", "loading", "unloading", "rep"]] = filenames["filename"].str.extract(
    fr"{dataset_name}\/([\w+]*)\/([\d+]*)\/([\d]\.[\d+]*)\/([\d]\.[\d+]*)\/([\d+]*)\/",
    expand=True,
)
# condition: "<speed>.<loading>_<unloading>"
filenames["condition"] = filenames["speed"] + "." + filenames["loading"] + "_" + filenames["unloading"]
# uniqueid: "ctcf<ctcf>.rad21on_<speed>.<loading>.<unloading>.<rep>"
filenames["uniqueid"] = (
    "ctcf"
    + filenames["ctcf"]
    + "."
    + "rad21"
    + "on"
    + "_"
    + filenames["speed"]
    + "."
    + filenames["loading"]
    + "."
    + filenames["unloading"]
    + "."
    + filenames["rep"]
)

In [None]:
# Read every simulation file listed in `filenames` and stack them into one table.
# Columns that are only present in some file formats and are not used below.
coordinate_columns = ["x1", "y1", "z1", "x2", "y2", "z2"]

alldf = []
for condition, sub2 in filenames.groupby('condition'):
    for uniqueid, sub1 in sub2.groupby('uniqueid'):
        for ct, sub in sub1.groupby('ctcf'):
            for file in sub.filename.unique():
                tmp = pd.read_csv(file, sep=" ")
                tmp.distance = tmp.distance * distance_conversion
                # errors="ignore" skips columns that are absent without hiding
                # unrelated failures the way a bare `except: pass` would.
                tmp = tmp.drop(columns=coordinate_columns, errors="ignore")
                tmp['condition'] = condition
                tmp['frame'] = np.arange(len(tmp))
                tmp['uniqueid'] = uniqueid 
                tmp['ctcf'] = ct
                tmp['rad21'] = "on"
                if subsample:
                    tmp = tmp.head(number_subsample)
                alldf.append(tmp)
distances_original = pd.concat(alldf)

# Subsampling to 30s and training a 2-state HMM

In [None]:
# choose neighbors
nneighbor = 2
subsample_every = 1

# Keep every `subsample_every`-th row of the stacked table.
distances = distances_original.iloc[::subsample_every].copy()
# Replace `bond` by `1 - bond`.
distances["bond"] = 1 - distances["bond"].values
# Snapshot of the table before any noise is added.
distances_nonoise = distances.copy()

# add experimental noise: zero-mean Gaussian with standard deviation 0.064,
# drawn from NumPy's global generator right after re-seeding it
seed = 0
np.random.seed(seed)
noise = np.random.normal(loc=0.0, scale=0.064, size=len(distances))
distances["distance"] = distances["distance"].values + noise
# Rescale the frame counter to the subsampled time step.
distances["frame"] = distances["frame"].values / subsample_every

In [None]:
# run HMM training on all conditions
np.random.seed(seed)
results = {}
for condition, sub in distances.groupby("condition"):
    # Only rows with ctcf == "on" are used for training; they are passed as a
    # single-column matrix.
    # NOTE(review): `fit` is called without `lengths`, so the rows of all
    # trajectories of a condition are handed over together - confirm intended.
    ctcf_on_distance = sub.loc[sub.ctcf == "on", "distance"]
    traj2d = ctcf_on_distance.to_numpy().reshape(-1, 1)

    # Two hidden states; means (m), transition matrix (t) and covariances (c)
    # are initialised and updated by the fit, the start probabilities are not.
    model = hmm.GaussianHMM(
        n_components=2,
        covariance_type="full",
        min_covar=0.1,
        n_iter=10000,
        params="mtc",
        init_params="mtc",
    )
    model.startprob_ = [0.5, 0.5]
    model.fit(traj2d)

    results[condition] = model

# Shallow copy: same model objects, separate dictionary.
models2hmm = dict(results)

In [None]:
# reorder HMM model such that the first gaussian is always the lowest
# Iterate over a snapshot of the keys so the dictionary values can be replaced.
for key in list(results):
    model = reorder_hmm_model_parameters(results[key])
    # Store the reordered model back: the cells below read `results[...]` and
    # rely on the first state being the one with the lowest mean.  If the
    # helper already reorders in place this assignment changes nothing.
    results[key] = model
    means = (model.means_)
    sigmas = np.sqrt(model.covars_.squeeze())
    w = np.array(model.transmat_)
    # The former `model.score(traj2d)` call was dropped: `traj2d` was left over
    # from the last training iteration (so every model was scored on the last
    # condition's data) and the result was never used.
    print(f"ctcf-speed-loading-unloading = {key}")
    print(f"Gaussian means: {means}")
    print(f"Gaussian std: {sigmas}")
    print(f"Transition rates: {w}")
    print("----------\n")

# Reading experimental data

In [None]:
# experiments

dataset = "211221_two_colors_distance_30s.csv"
basedir = f"./{dataset}/"
#create a folder
pathlib.Path(basedir).mkdir(parents=True, exist_ok=True)
sample_link = "https://drive.google.com/uc?export=download&id=1szF0P4OcA0X8WoPha5CKjo0sCLhJnrtX"
if not os.path.isfile(f"{basedir}{dataset}.zip"):
    gdown.download(sample_link, f"{basedir}{dataset}.zip")
#unzip the data
with zipfile.ZipFile(f"{basedir}{dataset}.zip", 'r') as zip_ref:
    zip_ref.extractall(basedir)
    
list_cell_lines = ["1A2", "1B1"]
bins = np.arange(30, 10000, 150)

#download pretrained hmm model (skipped when the file is already present,
#same policy as for the data archive above)
model_path = f"{basedir}hmmmodel_two_colors_distance_30s.obj"
if not os.path.isfile(model_path):
    gdown.download(
        "https://drive.google.com/uc?export=download&id=1oGTB_Ml4RQpwCeHackjYd2Tc_MDd6sqm", 
        model_path,
    )
# SECURITY NOTE: unpickling executes code embedded in the file; only run this
# on a model file obtained from a source you trust.
with open(model_path, "rb") as f:
    exp_model = pickle.load(f)

exp = pd.read_csv(f"{basedir}{dataset}")
# .copy() so the new column below is added to an independent frame and not to
# a slice of the table that was just read.
exp = exp[exp.cell_line.isin(list_cell_lines)].copy()
# condition: "<cell_line>_<induction_time>", e.g. "1A2_0min"
exp["condition"] = exp["cell_line"] + "_" + exp["induction_time"]

hue_order = sorted(exp["condition"].unique())
(
    exp_durations,
    exp_second_passage_times,
    exp_frequencies,
    exp_fraction_time,
    exp_conditions,
    exp_data_filtered,
) = calculate_duration_second_passage_time(
    data=exp,
    resolution=30,
    model=exp_model,
    fraction_nan_max=0.2,
)


In [None]:
# Experimental reference values: 1 - mean of the `prediction` column per condition.
# Variable names map 1A2 -> noCTCF, 1B1 -> yesCTCF, 0min -> yescohesin,
# 120min -> nocohesin.
appo = pd.DataFrame(1 - exp_data_filtered.groupby("condition")["prediction"].mean())
ft_noCTCF_yescohesin = appo.loc[appo.index == "1A2_0min", "prediction"].values[0]
ft_noCTCF_nocohesin = appo.loc[appo.index == "1A2_120min", "prediction"].values[0]
ft_yesCTCF_yescohesin = appo.loc[appo.index == "1B1_0min", "prediction"].values[0]
ft_yesCTCF_nocohesin = appo.loc[appo.index == "1B1_120min", "prediction"].values[0]

In [None]:
# Experimental reference values: mean contact duration per condition.
appo = pd.DataFrame(exp_durations.groupby("condition")["contact_duration"].mean())
duration_noCTCF_yescohesin = appo.loc[appo.index == "1A2_0min", "contact_duration"].values[0]
duration_noCTCF_nocohesin = appo.loc[appo.index == "1A2_120min", "contact_duration"].values[0]
duration_yesCTCF_yescohesin = appo.loc[appo.index == "1B1_0min", "contact_duration"].values[0]
duration_yesCTCF_nocohesin = appo.loc[appo.index == "1B1_120min", "contact_duration"].values[0]

In [None]:
# Experimental reference values: mean second passage time per condition.
appo = pd.DataFrame(exp_second_passage_times.groupby("condition")["second_passage_time"].mean())
spt_noCTCF_yescohesin = appo.loc[appo.index == "1A2_0min", "second_passage_time"].values[0]
spt_noCTCF_nocohesin = appo.loc[appo.index == "1A2_120min", "second_passage_time"].values[0]
spt_yesCTCF_yescohesin = appo.loc[appo.index == "1B1_0min", "second_passage_time"].values[0]
spt_yesCTCF_nocohesin = appo.loc[appo.index == "1B1_120min", "second_passage_time"].values[0]

# Find best case to reproduce experimental data in +cohesin condition +/- CTCF

### calculate the mean contact duration and second passage time across all parameters

In [None]:
# For every simulated condition: run the condition's HMM on its trajectories
# and collect the per-cell-line means of contact duration, second passage time
# and frequency, plus the annotated data returned by the helper.
mean_lst = []
alldata_lst = []
sim_duration_distributions = pd.DataFrame()
for cond, distances_selected in distances.groupby("condition"):
    # Work on a copy so the column below is not written into a groupby slice.
    distances_selected = distances_selected.copy()
    # uniqueid ends in ".<rep>"; cut at the last dot so replicate ids with
    # more than one digit are stripped correctly as well.
    distances_selected["condition"] = distances_selected["uniqueid"].map(
        lambda x: str(x).rsplit(".", 1)[0]
    )
    (
        durations,
        second_passage_times,
        frequencies,
        fraction_time,
        conditions,
        data,
    ) = calculate_duration_second_passage_time(
        distances_selected, resolution=30, model=results[cond]
    )
    # Select the column before averaging: averaging whole frames fails on
    # non-numeric columns with pandas >= 2.0.
    tmp = pd.DataFrame(
        durations.groupby("cell_line")["contact_duration"].mean()
    ).reset_index()
    # NOTE(review): `.values` assumes the cell_line groups of the three tables
    # come in the same order and number - confirm against the helper.
    tmp["second_passage_time"] = (
        second_passage_times.groupby("cell_line")["second_passage_time"].mean().values
    )
    tmp["frequency"] = 1/(frequencies.groupby("cell_line")["frequency"].mean().values) * 1000
    tmp["condition"] = cond
    mean_lst.append(tmp)
    alldata_lst.append(data)
mean = pd.concat(mean_lst)
alldata = pd.concat(alldata_lst)

# Split "<speed>.<loading>_<unloading>" back into its three parts.
mean[["speed", "loading", "unloading"]] = mean["condition"].str.extract(
    r"([\d+]*)\.([\d]\.[\d+]*)_([\d]\.[\d+]*)", expand=True
)
mean["ctcf"] = mean["cell_line"]

### Extracting gaussian means of HMM model from simulated data

In [None]:
# One row per (condition, state) holding the fitted Gaussian mean.
# The frames are collected in a list and concatenated once instead of growing
# a DataFrame inside the loop.
gaussian_means_lst = []
for key, model in results.items():
    tmp = pd.DataFrame(model.means_, columns=["gaussian_means"])
    # Labels are assigned by position, i.e. the first state is taken to be the
    # looped one (relies on the models having been reordered by mean).
    tmp["state"] = ["looped", "unlooped"]
    tmp["condition"] = key
    gaussian_means_lst.append(tmp)
gaussian_means = pd.concat(gaussian_means_lst)
gaussian_means[["speed", "loading", "unloading"]] = gaussian_means[
    "condition"
].str.extract(r"([\d+]*)\.([\d]\.[\d+]*)_([\d]\.[\d+]*)", expand=True)

### calculate distance of gaussian means between simulation and experiments

In [None]:
# Relative deviation of the simulated Gaussian means from the experimental ones.
exp_looped = 0.1493922
exp_unlooped = 0.28807371

# .loc[...].copy() gives independent frames, so the assignments below do not
# write into a view of `gaussian_means`.
dist_gaussian_means_looped = gaussian_means.loc[
    gaussian_means.state == "looped", ["gaussian_means", "condition"]
].copy()
dist_gaussian_means_unlooped = gaussian_means.loc[
    gaussian_means.state == "unlooped", ["gaussian_means", "condition"]
].copy()
dist_gaussian_means_looped["gaussian_means"] = (
    np.abs(dist_gaussian_means_looped["gaussian_means"].values - exp_looped) / exp_looped
)
dist_gaussian_means_unlooped["gaussian_means"] = (
    np.abs(dist_gaussian_means_unlooped["gaussian_means"].values - exp_unlooped) / exp_unlooped
)

# After the merge `gaussian_means_x` is the looped and `gaussian_means_y` the
# unlooped deviation.
dist_gaussian_means = pd.merge(dist_gaussian_means_looped, dist_gaussian_means_unlooped, on="condition")
# since we want to keep only the looped state
dist_gaussian_means["dist_gaussian_means"] = dist_gaussian_means.gaussian_means_x

### Plot Gaussian mean as heatmap

In [None]:
# All figures of this notebook are appended to this PDF.
pdf = PdfPages(f"{dataset_name}_2states_HMM_neighbor_{nneighbor}.pdf")

speed = "17500"
exp_looped = 0.1493922
exp_unlooped = 0.28807371
# white -> blue -> white: cells close to the experimental value (the centre of
# the norm) get the strongest colour.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", [(1, 1, 1),(69/255, 83/255, 162/255),(1, 1, 1)])
fig, ax = plt.subplots(1,2, figsize = (12, 5))
for axis, (state, exp_value) in zip(ax, [("looped", exp_looped), ("unlooped", exp_unlooped)]):
    norm = mcolors.TwoSlopeNorm(vmin=exp_value*0.5, vcenter=exp_value, vmax=exp_value*1.5)
    selected = gaussian_means[(gaussian_means.state == state) & (gaussian_means.speed == speed)]
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        selected.pivot(index="loading", columns="unloading", values="gaussian_means"),
        annot=True,
        cmap=cmap,
        # label follows the state shown in the panel (the right panel used to
        # be labelled "looped" as well)
        cbar_kws={"label": f"gaussian_means {state}"},
        norm=norm,
        ax=axis,
    )
    axis.set_title(f"exp value {state} {exp_value}")
plt.suptitle(f"Gaussian means speed: {speed}")
# Save before showing so the figure is written while it is certainly still open.
pdf.savefig(fig)
plt.show()

### Plot average second passage time

In [None]:
# Heatmaps (loading x unloading) of the mean second passage time, one panel
# per CTCF state, colour-centred on the experimental value.
speed = "17500"
ctcf_off = spt_noCTCF_yescohesin
ctcf_on = spt_yesCTCF_yescohesin
fig, ax = plt.subplots(1,2, figsize = (20, 8))
panels = [
    ("ctcfoff.rad21on", "CTCF off", ctcf_off),
    ("ctcfon.rad21on", "CTCF on", ctcf_on),
]
for axis, (cell_line, label, exp_value) in zip(ax, panels):
    norm = mcolors.TwoSlopeNorm(vmin=exp_value*0, vcenter=exp_value, vmax=exp_value*2)
    selected = mean[(mean.ctcf == cell_line) & (mean.speed == speed)]
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        selected.pivot(index="loading", columns="unloading", values="second_passage_time"),
        annot=True,
        cmap=cmap,
        cbar_kws={"label": label},
        norm=norm,
        annot_kws={"size":8},
        ax=axis,
    )
    axis.set_title(f"exp value {exp_value}")
plt.suptitle("Second_passage_time")
# Save before showing so the figure is written while it is certainly still open.
pdf.savefig(fig)
plt.show()

### Plot average contact duration

In [None]:
# Heatmaps (loading x unloading) of the mean contact duration, one panel per
# CTCF state, colour-centred on the experimental value.
speed = "17500"
ctcf_off = duration_noCTCF_yescohesin
ctcf_on = duration_yesCTCF_yescohesin
fig, ax = plt.subplots(1,2, figsize = (20, 8))
panels = [
    ("ctcfoff.rad21on", "CTCF off", ctcf_off),
    ("ctcfon.rad21on", "CTCF on", ctcf_on),
]
for axis, (cell_line, label, exp_value) in zip(ax, panels):
    norm = mcolors.TwoSlopeNorm(vmin=exp_value*0, vcenter=exp_value, vmax=exp_value*2)
    selected = mean[(mean.ctcf == cell_line) & (mean.speed == speed)]
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        selected.pivot(index="loading", columns="unloading", values="contact_duration"),
        annot=True,
        cmap=cmap,
        cbar_kws={"label": label},
        norm=norm,
        annot_kws={"size":8},
        ax=axis,
    )
    axis.set_title(f"exp value {exp_value}")
plt.suptitle("Contact duration")
# Save before showing so the figure is written while it is certainly still open.
pdf.savefig(fig)
plt.show()

### Calculate the distance in contact duration between simulation and experiments

In [None]:
# Per condition: sum of the relative deviations of the simulated mean contact
# duration from the experimental one, for CTCF off and CTCF on.
# The experimental references are named explicitly instead of reusing
# `ctcf_off` / `ctcf_on`, which several plotting cells overwrite with other
# quantities.
dist_contact_duration = []
conditions = []

for cond, sub in mean.groupby("condition"):
    conditions.append(cond)
    sim_off = sub.contact_duration[sub.cell_line == "ctcfoff.rad21on"].values
    sim_on = sub.contact_duration[sub.cell_line == "ctcfon.rad21on"].values
    appo = np.abs(sim_off - duration_noCTCF_yescohesin)/duration_noCTCF_yescohesin
    appo = appo + np.abs(sim_on - duration_yesCTCF_yescohesin)/duration_yesCTCF_yescohesin
    dist_contact_duration.append(appo)

dist_contact_duration = pd.DataFrame(dist_contact_duration, columns = ["dist_contact_duration"])
dist_contact_duration["condition"] = conditions

### HMM fraction of time spent in the looped state across all parameters

In [None]:
# Parameter sets present in `alldata`, with the CTCF prefix removed.
conditions = np.unique(
    [
        x.replace("ctcfon.", "").replace("ctcfoff.", "")
        for x in alldata.condition.unique()
    ]
)

lst_time_spent = []
for cond in conditions:
    # Exact comparison: a substring test would also pick up conditions whose
    # name merely starts with this one (e.g. a longer unloading value).
    subset_on = alldata[alldata.condition == "ctcfon." + cond].copy()
    subset_off = alldata[alldata.condition == "ctcfoff." + cond].copy()

    # 1 - mean(prediction), for CTCF on and CTCF off
    time = pd.DataFrame(
        [
            1 - subset_on.prediction.mean(),
            1 - subset_off.prediction.mean(),
        ],
        columns=["time"],
    )

    time["ctcf"] = ["on", "off"]
    time["type"] = cond
    lst_time_spent.append(time)
time_spent = pd.concat(lst_time_spent)

time_spent[["speed", "loading", "unloading"]] = time_spent["type"].str.extract(
    r"([\d+]*)\.([\d]\.[\d+]*)\.([\d]\.[\d+]*)", expand=True
)

speed = "17500"
ctcf_off = ft_noCTCF_yescohesin
ctcf_on = ft_yesCTCF_yescohesin
fig, ax = plt.subplots(1,2, figsize = (12, 5))
panels = [
    ("off", "CTCF off", ctcf_off),
    ("on", "CTCF on", ctcf_on),
]
for axis, (ctcf_state, label, exp_value) in zip(ax, panels):
    norm = mcolors.TwoSlopeNorm(vmin=exp_value*0.5, vcenter=exp_value, vmax=exp_value*1.3)
    selected = time_spent[(time_spent.ctcf == ctcf_state) & (time_spent.speed == speed)]
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        selected.pivot(index="loading", columns="unloading", values="time"),
        annot=True,
        cmap=cmap,
        cbar_kws={"label": label},
        norm=norm,
        annot_kws={"size":8},
        ax=axis,
    )
    axis.set_title(f"exp value {exp_value}")
plt.suptitle(f"Fraction time spent in looped state; speed {speed}")
# Save before showing so the figure is written while it is certainly still open.
pdf.savefig(fig)
plt.show()

### effect of CTCF in time spent in the looped state across all parameters

In [None]:
ctcfon = time_spent[(time_spent.ctcf=="on") ].copy()
ctcfoff = time_spent[(time_spent.ctcf=="off") ].copy()

# After the merge `time_x` is the CTCF-on and `time_y` the CTCF-off fraction.
ctcf = pd.merge(ctcfon, ctcfoff, on = ["loading", "unloading", "speed"])
ctcf["enrichment"] = ctcf["time_x"] / ctcf["time_y"]
ctcf["difference"] =  ctcf["time_x"] - ctcf["time_y"]

speed="17500"

# Experimental fold change and difference (+ vs - CTCF, cohesin present).
enrichment =  ft_yesCTCF_yescohesin/ ft_noCTCF_yescohesin
difference = ft_yesCTCF_yescohesin- ft_noCTCF_yescohesin
fig, ax = plt.subplots(1,2, figsize = (12, 5))
panels = [
    ("enrichment", enrichment, "Fold change (+ vs - ctcf)"),
    ("difference", difference, "Difference (+ vs - ctcf)"),
]
for axis, (column, exp_value, label) in zip(ax, panels):
    # NOTE(review): TwoSlopeNorm needs vmin < vcenter < vmax, which holds here
    # only for a positive experimental value.
    norm = mcolors.TwoSlopeNorm(vmin=exp_value*0.5, vcenter=exp_value, vmax=exp_value*2)
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        ctcf[ctcf.speed == speed].pivot(index="loading", columns="unloading", values=column),
        annot=True,
        cmap=cmap,
        cbar_kws={"label": label},
        norm=norm,
        annot_kws={"size":8},
        ax=axis,
    )
    axis.set_title(f"Exp value {exp_value}")

plt.suptitle(f"Change in  time spent in looped state (+ vs -ctcf); exp value {enrichment}; speed {speed}")
# Save before showing so the figure is written while it is certainly still open.
pdf.savefig(fig)
plt.show()

### Find the best case

In [None]:

# Combine the three deviation measures into one score per parameter set and
# show it as a heatmap for both extruder speeds.
dist_ctcf_enrichment = ctcf.copy()
# NOTE(review): `time_y` is the simulated CTCF-off fraction, yet it is compared
# with `ft_yesCTCF_nocohesin`; confirm this is the intended reference (and not
# `ft_noCTCF_yescohesin`).
dist_ctcf_enrichment["dist_ctcf_enrichment"] = np.abs(
    dist_ctcf_enrichment.time_x - ft_yesCTCF_yescohesin
) /ft_yesCTCF_yescohesin  + np.abs(dist_ctcf_enrichment.time_y - ft_yesCTCF_nocohesin) / ft_yesCTCF_nocohesin

# Bring the condition names into the "rad21on_<speed>.<loading>.<unloading>"
# form used by `type_x` so the tables can be merged.
dist_gaussian_means[
    "newcondition"
] = "rad21on_" + dist_gaussian_means.condition.str.replace("_", ".")
dist_contact_duration[
    "newcondition"
] = "rad21on_" + dist_contact_duration.condition.str.replace("_", ".")

dist_together = pd.merge(
    dist_gaussian_means, dist_ctcf_enrichment, left_on="newcondition", right_on="type_x"
)
dist_together = pd.merge(dist_together, dist_contact_duration, on="newcondition")
# The Gaussian-mean deviation enters with weight 2.
dist_together["dist"] = (
    2*dist_together.dist_gaussian_means
    + dist_together.dist_ctcf_enrichment
    + dist_together.dist_contact_duration
)

dist_together[["speed", "loading", "unloading"]] = dist_together[
    "condition_x"
].str.extract(r"([\d+]*)\.([\d]\.[\d+]*)_([\d]\.[\d+]*)", expand=True)

minimum = dist_together.dist.min()

fig, ax = plt.subplots(1,2, figsize = (12, 5))
# NOTE: this replaces the colormap object that the earlier plotting cells use.
cmap = "Greens_r"

for axis, speed in zip(ax, ["17500", "175000"]):
    sns.heatmap(
        # keyword arguments: pandas >= 2.0 no longer accepts them positionally
        dist_together[dist_together.speed == speed].pivot(
            index="loading", columns="unloading", values="dist"
        ),
        annot=True,
        cmap=cmap,
        cbar_kws={"label": "Deviation from experimental data"},
        vmin=0,
        vmax=8,
        annot_kws={"size": 8},
        ax=axis,
    )
    axis.set_title(f"speed {speed}")

plt.suptitle("Best combination to match experimental data")
pdf.savefig(fig)

In [None]:
top = 5  # number of best matches

# Name of each parameter set in the "<speed>.<loading>.<unloading>" form.
dist_together["realistic"] = (
    dist_together["speed"] + "." + dist_together["loading"] + "." + dist_together["unloading"]
)

# The `top` rows with the smallest deviation, best first.
best_rows = dist_together.sort_values('dist').head(top)
realistic_list = best_rows["realistic"].values
realistic_model_list = best_rows["condition_x"].values

# The single best match, in both naming schemes.
realistic_best, realistic_model_best = realistic_list[0], realistic_model_list[0]

### Select best -cohesin case
This case should contain a small amount of cohesin, similar to the experimental system, but the unloading properties shouldn't be affected.

In [None]:
# Condition used as the "-cohesin" control, written in the same
# "<speed>.<loading>_<unloading>" form as the `condition` column.
best_nocohesin = '17500.0.002_0.01'

# Select best datasets

### This part can be removed when we have a consistent dataset format

Just keep this part and move it to the top reading file part

                try:
                    tmp.drop(["x1", "y1", "z1", "x2", "y2", "z2"], axis =1, inplace=True)
                except:
                    pass

In [None]:
dataset_name = "langevin_release_on-off_220125"
basedir = f"./{dataset_name}/"
#create a folder
pathlib.Path(basedir).mkdir(parents=True, exist_ok=True)
# polymer simulations, single loop dataset
sample_link = "https://drive.google.com/uc?export=download&id=1bubsOltpWp8HbkIi4kLcKP5q1HXa0JQ4"
zip_path = basedir+f"{dataset_name}.zip"
# download data if not already present
if not os.path.isfile(zip_path):
    gdown.download(sample_link, zip_path)
#unzip the data
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(basedir)
list_files = glob.glob(f"{basedir}/*/*/*/*/*/dst_bnd.dat")

filenames = pd.DataFrame(list_files, columns=["filename"])
# The five directory levels below the dataset folder are read as
# ctcf / speed / loading / unloading / rep.
filenames[["ctcf", "speed", "loading", "unloading", "rep"]] = filenames["filename"].str.extract(
    fr"{dataset_name}\/([\w+]*)\/([\d+]*)\/([\d]\.[\d+]*)\/([\d]\.[\d+]*)\/([\d+]*)\/",
    expand=True,
)
filenames["condition"] = filenames["speed"] + "." + filenames["loading"] + "_" + filenames["unloading"]
filenames["uniqueid"] = (
    "ctcf"
    + filenames["ctcf"]
    + "."
    + "rad21"
    + "on"
    + "_"
    + filenames["speed"]
    + "."
    + filenames["loading"]
    + "."
    + filenames["unloading"]
    + "."
    + filenames["rep"]
)


# Only the top-ranked parameter sets plus the "-cohesin" control are reloaded.
best_filenames = filenames[filenames.condition.isin(np.append(realistic_model_list, best_nocohesin))]
# Columns that are only present in some file formats and are not used below.
coordinate_columns = ["x1", "y1", "z1", "x2", "y2", "z2"]
alldf = []
for condition, sub2 in best_filenames.groupby('condition'):
    print(condition)
    for uniqueid, sub1 in sub2.groupby('uniqueid'):
        for ct, sub in sub1.groupby('ctcf'):
            for file in sub.filename.unique():
                tmp = pd.read_csv(file, sep=" ")
                # errors="ignore" skips columns that are absent without hiding
                # unrelated failures the way a bare `except: pass` would.
                tmp = tmp.drop(columns=coordinate_columns, errors="ignore")
                tmp.distance = tmp.distance * distance_conversion
                tmp['condition'] = condition
                tmp['frame'] = np.arange(len(tmp))
                tmp['uniqueid'] = uniqueid 
                tmp['ctcf'] = ct
                tmp['rad21'] = "on"
                if subsample:
                    tmp = tmp.head(number_subsample)
                alldf.append(tmp)
distances_best_original = pd.concat(alldf)

In [None]:
# choose neighbors
nneighbor = 2
subsample_every = 1

# Disabled: selection of the columns of neighbours that are too far away.
#idx = np.where(
#    np.array([abs(eval(x)) for x in distances_best_original.columns[1:][:-5]])
#    < 18 - nneighbor
#)[0]
#names_remove = distances_best_original.columns[idx + 1]
#
# Keep every `subsample_every`-th row of the reloaded table.
# NOTE: this rebinds the global `distances` to the best-case subset.
distances = distances_best_original.iloc[::subsample_every].copy()
#
# Disabled: recomputation of the ground-truth CTCF-mediated loop (`bond`).
# NOTE(review): with the last line disabled, `bond` is NOT replaced by
# `1 - bond` here, unlike in the first noise cell - confirm this is intended.
#idx = (
#    np.where(
#        np.array([abs(eval(x)) for x in distances.columns[1:][:-5]]) >= 18 - nneighbor
#    )[0]
#    + 1
#)
#distances["bond"] = distances.iloc[:, idx].sum(axis=1)
#distances.loc[distances.bond > 1, "bond"] = 1
#distances.bond = 1 - distances.bond.values

# Snapshot of the table before any noise is added.
distances_nonoise = distances.copy()

# add experimental noise: zero-mean Gaussian with standard deviation 0.064,
# drawn from NumPy's global generator right after re-seeding it
seed = 0
np.random.seed(seed)
noise = np.random.normal(loc=0.0, scale=0.064, size=len(distances))
distances["distance"] = distances["distance"].values + noise
# Rescale the frame counter to the subsampled time step.
distances["frame"] = distances["frame"].values / subsample_every

### fraction of time spent in each state in the best case

In [None]:
# For each of the top parameter sets: fraction of frames in each state, from
# the HMM prediction ("pred") and from the `bond` column ("gt"), with cohesin
# (simulated condition) and for the "-cohesin" control.  The best set also
# gets bootstrap error bars; finally the values are averaged over all sets.
distances_control = distances[distances.condition == best_nocohesin]

traj = distances_control.distance.values.reshape(-1, 1)

# Labels of the 16 rows of each summary table.  Rows 0-7 hold 1 - mean
# ("looped"), rows 8-15 the plain mean ("unlooped"); within each half the
# order is cohesin on (ctcf on, ctcf off), then control (ctcf on, ctcf off),
# each as a (pred, gt) pair.
row_ctcf = ["on", "on", "off", "off"] * 4
row_type = ["pred", "gt"] * 8
row_cohesin = (["on"] * 4 + ["off"] * 4) * 2
row_states = ["looped"] * 8 + ["unlooped"] * 8

time_average = pd.DataFrame()
for realistic_select, realistic_model_select in zip(
    realistic_list, realistic_model_list
):
    # Predict the control trajectories with the model of this parameter set.
    m = results[realistic_model_select]
    distances_control = distances_control.assign(prediction = m.predict(traj))

    subset_on = alldata[
        ["ctcfon.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()
    subset_off = alldata[
        ["ctcfoff.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()
    control_on = distances_control[distances_control.ctcf == "on"]
    control_off = distances_control[distances_control.ctcf == "off"]

    # The eight 0/1-valued columns the table is built from, in row order.
    sources = [
        subset_on.prediction,
        subset_on.bond,
        subset_off.prediction,
        subset_off.bond,
        control_on.prediction,
        control_on.bond,
        control_off.prediction,
        control_off.bond,
    ]

    time = pd.DataFrame(
        [1 - s.mean() for s in sources] + [s.mean() for s in sources],
        columns=["time"],
    )
    # Column order matters: the grouping and renaming after the loop rely on it.
    time["ctcf"] = row_ctcf
    time["type"] = row_type
    time["cohesin"] = row_cohesin
    time["condition"] = (
        "cohesin" + time["cohesin"] + "_ctcf" + time["ctcf"] + "_" + time["type"]
    )
    time["states"] = row_states
    time["index"] = np.arange(0, len(time))
    
    # plot bootstrap ci for best case
    if realistic_select == realistic_best:
        ddsets = [1 - s for s in sources] + sources
        stds = []
        # `series` (not `dataset`) so the global file name `dataset` set by the
        # experimental-data cell is not overwritten.
        for series in ddsets:
            # 10000 resamples (with replacement) of 5% of the rows each.
            n_draw = int(len(series)*0.05)
            tmp = []
            for i in range(10000):
                # Passing the length draws from arange(length) directly instead
                # of converting a range object on every iteration.
                tmp.append(
                    series.iloc[
                        np.random.choice(len(series), size=n_draw, replace=True)
                    ].mean()
                )
            stds.append(np.std(tmp))
        time["std"] = stds
        
        means = time.pivot(index="states", columns="condition", values="time").T
        errors = time.pivot(index="states", columns="condition", values="std").T
        print(means, errors)
        # `fig` is the Axes returned by DataFrame.plot; its figure is saved.
        fig = means.plot(kind="bar", yerr=errors, stacked=True)
        plt.ylabel("Fraction of time spent in looped state")
        plt.title(f"Best case {realistic_best}")
        pdf.savefig(fig.figure, bbox_inches="tight")
        # Drop the extra column so all tables stacked below share one layout.
        time.drop("std", axis=1, inplace=True)
    time_average = pd.concat([time, time_average])

# Mean and standard deviation of each row across the top parameter sets.
std = time_average.groupby(list(time_average.columns[1:])).std().reset_index()
av = time_average.groupby(list(time_average.columns[1:])).mean().reset_index()
av_std = pd.merge(std, av, on=list(time_average.columns[1:]))
av_std.columns = [
    "ctcf",
    "type",
    "cohesin",
    "condition",
    "states",
    "index",
    "std",
    "mean",
]

means = av_std.pivot(index="states", columns="condition", values="mean").T
errors = av_std.pivot(index="states", columns="condition", values="std").T
fig = means.plot(kind="bar", yerr=errors, stacked=True)

plt.ylabel("Fraction of time spent in looped state")
plt.title(f"average from {realistic_list}")
pdf.savefig(fig.figure, bbox_inches="tight")

##### p-value

In [None]:
# Two-sided t-test on contact duration for every pair of conditions.
# NOTE(review): `durations` is whatever an earlier cell left in that name (the
# per-condition loop rebinds it on every iteration) - confirm this is the
# table that should be tested.
from itertools import combinations

# Imported here because no cell shown above imports scipy explicitly.
import scipy.stats

list_conditions = durations.condition.unique()
for first, second in combinations(list_conditions, 2):
    test = scipy.stats.ttest_ind(
        durations.contact_duration[durations.condition == first].values,
        durations.contact_duration[durations.condition == second].values,
    )
    print(f"{first} vs {second} p-value pfrom two sided t.test {test.pvalue}")

# fraction of time in looped state in exp and best case of simulation

In [None]:
# Fraction of frames with prediction 0 (1 - mean), simulation vs experiment,
# for each of the top parameter sets; the best set also gets bootstrap errors.
# Experimental conditions in row order: (ctcf on, off) with cohesin, then
# (ctcf on, off) without.
exp_condition_names = ["1B1_0min", "1A2_0min", "1B1_120min", "1A2_120min"]

time_average = pd.DataFrame()
for realistic_select in realistic_list:

    subset_on = alldata[
        ["ctcfon.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()
    subset_off = alldata[
        ["ctcfoff.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()

    # NOTE(review): `distances_control.prediction` is not recomputed here; it
    # holds whatever the previously executed cell assigned, for every
    # parameter set in this loop.
    sources = [
        subset_on.prediction,
        subset_off.prediction,
        distances_control[distances_control.ctcf == "on"].prediction,
        distances_control[distances_control.ctcf == "off"].prediction,
    ] + [
        exp_data_filtered[exp_data_filtered.condition == name].prediction
        for name in exp_condition_names
    ]

    time = pd.DataFrame([1 - s.mean() for s in sources], columns=["time"])

    time["ctcf"] = ["on", "off", "on", "off", "on", "off", "on", "off"]
    time["type"] = ["sim", "sim", "sim", "sim", "exp", "exp", "exp", "exp"]
    time["cohesin"] = ["on", "on", "off", "off", "on", "on", "off", "off"]
    time["condition"] = "cohesin" + time["cohesin"] + "_ctcf" + time["ctcf"]
    time["index"] = np.arange(0, len(time))
    if realistic_select == realistic_best:
        print(f"{realistic_best} ####")
        print(time)
        print(f"\n")
        ddsets = [1 - s for s in sources]
        stds = []
        # `series` (not `dataset`) so the global file name `dataset` set by the
        # experimental-data cell is not overwritten.
        for series in ddsets:
            print(len(series))
            # 10000 resamples (with replacement) of 5% of the rows each.
            n_draw = int(len(series)*0.05)
            tmp = []
            for i in range(10000):
                # Passing the length draws from arange(length) directly instead
                # of converting a range object on every iteration.
                tmp.append(
                    series.iloc[
                        np.random.choice(len(series), size=n_draw, replace=True)
                    ].mean()
                )
            stds.append(np.std(tmp))
        time["std"] = stds

        means = time.pivot(index="type", columns="condition", values="time").T
        errors = time.pivot(index="type", columns="condition", values="std").T
        # `fig` is the Axes returned by DataFrame.plot; its figure is saved.
        fig = means.plot(kind="bar", yerr=errors)
        plt.ylabel("Fraction of time spent in looped state")
        plt.title(f"Best case {realistic_best}")
        pdf.savefig(fig.figure, bbox_inches="tight")
        # Drop the extra column so all tables stacked below share one layout.
        time.drop("std", axis=1, inplace=True)
        
        
    time_average = pd.concat([time, time_average])



In [None]:

# Summarise `time_average` over every parameter set in `realistic_list`:
# print the mean and standard deviation of the `time` column per
# (condition, type) pair, then draw the same summary as a bar plot with the
# individual values overlaid as points.
print(f"\n\n {realistic_list}")
print("\n\n  AVERAGE")
print(time_average.groupby(["condition", "type"]).time.mean())
print("\n\n Standard deviation")
print(time_average.groupby(["condition", "type"]).time.std())
ax = sns.barplot(
    data=time_average,
    hue="type",
    x="condition",
    y="time",
    ci="sd",
    capsize=0.1,
    estimator=np.mean,
)
plt.ylim(0, 1)
sns.scatterplot(
    data=time_average,
    hue="type",
    x="condition",
    y="time",
    legend=False,
    zorder=10,
    ax=ax,
)

plt.ylabel("Fraction of time spent in looped state")
plt.title(f"average from {realistic_list}")
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
# Save the figure that owns `ax`. `fig` is not created in this cell, so
# saving `fig.figure` would write whatever an earlier cell left in `fig`
# instead of the plot drawn here.
pdf.savefig(ax.figure, bbox_inches="tight")

# Effect of CTCF in best case

In [None]:
# Ratio of (1 - mean prediction) with CTCF over the same quantity without
# CTCF, both with cohesin present, computed for every parameter set in
# `realistic_list` and stored next to the experimental ratio
# `ft_yesCTCF_yescohesin / ft_noCTCF_yescohesin`.
time_average = pd.DataFrame()
for realistic_select in realistic_list:

    # Rows whose condition string contains the +CTCF / -CTCF tag of this
    # parameter set.
    subset_on = alldata[
        ["ctcfon.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()
    subset_off = alldata[
        ["ctcfoff.rad21on_" + realistic_select in x for x in alldata.condition]
    ].copy()
    time = pd.DataFrame(
        [
            (1 - subset_on.prediction.mean())
            / (1 - subset_off.prediction.mean()),
            ft_yesCTCF_yescohesin / ft_noCTCF_yescohesin,
        ],
        columns=["time"],
    )

    time["type"] = ["sim HMM", "exp"]
    if realistic_select == realistic_best:
        print(f"{realistic_best}")
        print(time)
        ax = sns.barplot(data=time, x="type", y="time")
        # Pass x/y explicitly, as the summary plot below does. Without them
        # seaborn treats `time` as wide-form data and plots against the row
        # index, which only lines up with the bars by coincidence.
        sns.scatterplot(
            data=time, x="type", y="time", legend=False, zorder=10, ax=ax
        )
        plt.ylabel("Foldchange +ctcf/-ctcf in the presence of cohesin")
        plt.title(f"Best case {realistic_best}")
        pdf.savefig(ax.figure)
    time_average = pd.concat([time, time_average])

# Start a fresh figure so the summary is not drawn on top of the best-case plot.
plt.figure()

print(f"\n\n{realistic_list}")
print("\n\n AVERAGE")
print(time_average.groupby(["type"]).time.mean())
print("\n\n Standard deviation")
print(time_average.groupby(["type"]).time.std())

ax = sns.barplot(
    data=time_average, x="type", y="time", ci="sd", capsize=0.1, estimator=np.mean
)
sns.scatterplot(
    data=time_average, x="type", y="time", legend=False, zorder=10, ax=ax
)
plt.ylabel("Foldchange +ctcf/-ctcf in the presence of cohesin")
plt.title(f"average from {realistic_list}")
pdf.savefig(ax.figure)

# Contact duration and second passage time of best case

In [None]:
# Run `calculate_duration_second_passage_time` four times with the best-case
# model: on the control (no-cohesin) trajectories and on the best-case
# trajectories, each once with the default arguments and once with `gt=True`
# (presumably "ground truth" -- confirm against utils).

# The condition label is the unique id with its last two characters removed.
nocohesin = distances_control.copy()
nocohesin.condition = nocohesin.uniqueid.map(lambda x: str(x)[:-2])
(
    durations_nocohesin,
    second_passage_times_nocohesin,
    frequencies_nocohesin,
    fraction_time_nocohesin,
    conditions_nocohesin,
    data_realistic_nocohesin,
) = calculate_duration_second_passage_time(
    nocohesin, resolution=30, model=results[realistic_model_best]
)

# The last element used to be unpacked into `data_realistic_gt`, which the
# final call of this cell overwrites; it now has its own name so the
# no-cohesin result is not lost.
(
    durations_nocohesin_gt,
    second_passage_times_nocohesin_gt,
    frequencies_nocohesin_gt,
    fraction_time_nocohesin_gt,
    conditions_nocohesin_gt,
    data_realistic_nocohesin_gt,
) = calculate_duration_second_passage_time(
    nocohesin, resolution=30, model=results[realistic_model_best], gt=True
)

cond = realistic_model_best
distances_selected = distances[distances.condition == cond].copy()
distances_selected.condition = distances_selected.uniqueid.map(lambda x: str(x)[:-2])
(
    durations,
    second_passage_times,
    frequencies,
    fraction_time,
    conditions,
    data_realistic,
) = calculate_duration_second_passage_time(
    distances_selected, resolution=30, model=results[realistic_model_best]
)

(
    durations_gt,
    second_passage_times_gt,
    frequencies_gt,
    fraction_time_gt,
    conditions_gt,
    data_realistic_gt,
) = calculate_duration_second_passage_time(
    distances_selected, resolution=30, model=results[realistic_model_best], gt=True
)

In [None]:
# Contact duration
# Label every simulated table with the part of `condition` before the first
# "." plus a suffix naming its origin; the experimental table keeps its
# condition name as the label.
for _table, _suffix in (
    (durations, "_cohesinon"),
    (durations_gt, "_cohesinon_gt"),
    (durations_nocohesin, "_nocohesin"),
    (durations_nocohesin_gt, "_nocohesin_gt"),
):
    _table["ctcf"] = _table.condition.str.split(".", expand=True)[0] + _suffix
exp_durations["ctcf"] = exp_durations["condition"]

# Stack all tables so they can be summarised and plotted together.
durations = pd.concat(
    [durations, durations_gt, exp_durations, durations_nocohesin, durations_nocohesin_gt]
)

_duration_by_sample = durations.groupby(["ctcf"]).contact_duration
print(f"{realistic_best}")
print("\n\n AVERAGE")
print(_duration_by_sample.mean())
print("\n\n Standard deviation")
print(_duration_by_sample.std())

fig = plt.figure()
box_plot = sns.barplot(x="ctcf", y="contact_duration", data=durations)

ax = box_plot.axes
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.xlabel("sample")
plt.ylabel("Contact duration (1/koff)")
plt.title(realistic_best)
plt.show()
pdf.savefig(box_plot.figure, bbox_inches="tight")

# Second passage time: same labelling scheme as for the contact durations --
# prefix of `condition` before the first "." plus an origin suffix.
for _table, _suffix in (
    (second_passage_times, "_cohesinon"),
    (second_passage_times_gt, "_cohesinon_gt"),
    (second_passage_times_nocohesin, "_nocohesin"),
    (second_passage_times_nocohesin_gt, "_nocohesin_gt"),
):
    _table["ctcf"] = _table.condition.str.split(".", expand=True)[0] + _suffix
exp_second_passage_times["ctcf"] = exp_second_passage_times["condition"]

second_passage_times = pd.concat(
    [
        second_passage_times,
        second_passage_times_gt,
        exp_second_passage_times,
        second_passage_times_nocohesin_gt,
        second_passage_times_nocohesin,
    ]
)

_spt_by_sample = second_passage_times.groupby(["ctcf"]).second_passage_time
print(f"\n\n  {realistic_best}")
print("\n\n  AVERAGE")
print(_spt_by_sample.mean())
print("\n\n  Standard deviation")
print(_spt_by_sample.std())

fig = plt.figure()
# Each bar shows the inverse of the mean second passage time of its sample.
box_plot = sns.barplot(
    x="ctcf",
    y="second_passage_time",
    data=second_passage_times,
    estimator=lambda x: 1 / np.mean(x),
)

ax = box_plot.axes
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.xlabel("sample")
plt.ylabel("1/second_passage_time (kon)")
plt.title(realistic_best)
plt.show()
pdf.savefig(box_plot.figure, bbox_inches="tight")

### P values

In [None]:
# Pairwise two-sided t-tests (scipy.stats.ttest_ind with default arguments)
# on the contact durations of every pair of samples in `durations.ctcf`.
from itertools import combinations

# Imported here so the cell does not depend on `scipy` being brought into the
# namespace by another import.
import scipy.stats

list_conditions = durations.ctcf.unique()
# Extract each sample's values once instead of re-filtering for every pair.
_values_by_condition = {
    name: durations.contact_duration[durations.ctcf == name].values
    for name in list_conditions
}
for first, second in combinations(list_conditions, 2):
    test = scipy.stats.ttest_ind(
        _values_by_condition[first],
        _values_by_condition[second],
    )
    print(f"{first} vs {second} p-value pfrom two sided t.test {test.pvalue}")

In [None]:
# Pairwise two-sided t-tests (scipy.stats.ttest_ind with default arguments)
# on the second passage times of every pair of samples in
# `second_passage_times.ctcf`.
from itertools import combinations

# Imported here so the cell does not depend on `scipy` being brought into the
# namespace by another import.
import scipy.stats

list_conditions = second_passage_times.ctcf.unique()
# Extract each sample's values once instead of re-filtering for every pair.
_values_by_condition = {
    name: second_passage_times.second_passage_time[
        second_passage_times.ctcf == name
    ].values
    for name in list_conditions
}
for first, second in combinations(list_conditions, 2):
    test = scipy.stats.ttest_ind(
        _values_by_condition[first],
        _values_by_condition[second],
    )
    print(f"{first} vs {second} p-value pfrom two sided t.test {test.pvalue}")

In [None]:
# average change in second passage time
# Select the column before averaging: `groupby(...).mean()` on the whole
# frame also tries to average the non-numeric columns, which raises in
# recent pandas versions.
sptimes = second_passage_times.groupby("ctcf")["second_passage_time"].mean()
# Ratios of mean second passage times; each experimental pair is followed by
# the corresponding simulated pair, in the same order as `conditions`.
changes = [
    sptimes.loc["1A2_0min"] / sptimes.loc["1B1_0min"],
    sptimes.loc["ctcfoff_cohesinon"] / sptimes.loc["ctcfon_cohesinon"],
    sptimes.loc["1A2_120min"] / sptimes.loc["1A2_0min"],
    sptimes.loc["ctcfoff_nocohesin"] / sptimes.loc["ctcfoff_cohesinon"],
    sptimes.loc["1A2_120min"] / sptimes.loc["1B1_0min"],
    sptimes.loc["ctcfoff_nocohesin"] / sptimes.loc["ctcfon_cohesinon"],
]
conditions = [
    "+cohesin +/-CTCF",
    "+cohesin +/-ctcf sim",
    "-CTCF +/-cohesin",
    "-CTCF +/-cohesin sim",
    "-CTCF-cohesin/+CTCF+cohesin",
    "-CTCF-cohesin/+CTCF+cohesin sim",
]

for x, y in zip(changes, conditions):
    print(x, y)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: raw ratios; right panel: log10 of the same ratios.
for _panel, _values, _scale in (
    (ax[0], changes, "linear"),
    (ax[1], np.log10(changes), "log"),
):
    df = pd.DataFrame(_values, columns=["Change in 1/second_passage_time"])
    df["condition"] = conditions
    box_plot = sns.barplot(
        data=df, x="condition", y="Change in 1/second_passage_time", ax=_panel
    )
    ax1 = box_plot.axes
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)
    ax1.set_title(f"{realistic_best} {_scale} scale")
pdf.savefig(box_plot.figure, bbox_inches="tight")

In [None]:
# average change in durations
# Select the column before averaging: `groupby(...).mean()` on the whole
# frame also tries to average the non-numeric columns, which raises in
# recent pandas versions.
cdtimes = durations.groupby("ctcf")["contact_duration"].mean()
# Ratios of mean contact durations; each experimental pair is followed by the
# corresponding simulated pair, in the same order as `conditions`.
changes = [
    cdtimes.loc["1B1_0min"] / cdtimes.loc["1A2_0min"],
    cdtimes.loc["ctcfon_cohesinon"] / cdtimes.loc["ctcfoff_cohesinon"],
    cdtimes.loc["1A2_0min"] / cdtimes.loc["1A2_120min"],
    cdtimes.loc["ctcfoff_cohesinon"] / cdtimes.loc["ctcfoff_nocohesin"],
    cdtimes.loc["1B1_0min"] / cdtimes.loc["1A2_120min"],
    cdtimes.loc["ctcfon_cohesinon"] / cdtimes.loc["ctcfoff_nocohesin"],
]
conditions = [
    "+cohesin +/-CTCF",
    "+cohesin +/-ctcf sim",
    "-CTCF +/-cohesin",
    "-CTCF +/-cohesin sim",
    "-CTCF-cohesin/+CTCF+cohesin",
    "-CTCF-cohesin/+CTCF+cohesin sim",
]

for x, y in zip(changes, conditions):
    print(x, y)

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: raw ratios; right panel: log10 of the same ratios.
for _panel, _values, _scale in (
    (ax[0], changes, "linear"),
    (ax[1], np.log10(changes), "log"),
):
    df = pd.DataFrame(_values, columns=["Change in contact duration"])
    df["condition"] = conditions
    box_plot = sns.barplot(
        data=df, x="condition", y="Change in contact duration", ax=_panel
    )
    ax1 = box_plot.axes
    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=90)
    ax1.set_title(f"{realistic_best} {_scale} scale")
pdf.savefig(box_plot.figure, bbox_inches="tight")

In [None]:
# simulation
# Accumulators, filled one parameter set at a time in the loop below.
durations_all, durations_all_gt = pd.DataFrame(), pd.DataFrame()
second_passage_times_all, second_passage_times_all_gt = pd.DataFrame(), pd.DataFrame()
fraction_of_time_all, fraction_of_time_all_gt = pd.DataFrame(), pd.DataFrame()

for cond in realistic_model_list:
    # Every `subsample_every`-th row of this parameter set, labelled by its
    # unique id instead of the shared condition name.
    _rows_for_cond = distances[distances.condition == cond]
    distances_selected = _rows_for_cond.iloc[::subsample_every].copy()
    distances_selected.condition = distances_selected.uniqueid

    naked = distances_control.copy()

    # Statistics with the default arguments of the helper.
    _stats = calculate_duration_second_passage_time(
        distances_selected, resolution=30, model=results[cond]
    )
    (
        durations,
        second_passage_times,
        frequencies,
        fraction_time,
        conditions,
        data_realistic,
    ) = _stats
    durations_all = pd.concat([durations_all, durations])
    second_passage_times_all = pd.concat(
        [second_passage_times, second_passage_times_all]
    )

    # Same statistics with `gt=True`.
    _stats_gt = calculate_duration_second_passage_time(
        distances_selected, resolution=30, model=results[cond], gt=True
    )
    (
        durations_gt,
        second_passage_times_gt,
        frequencies_gt,
        fraction_time_gt,
        conditions_gt,
        data_realistic_gt,
    ) = _stats_gt
    durations_all_gt = pd.concat([durations_all_gt, durations_gt])
    second_passage_times_all_gt = pd.concat(
        [second_passage_times_all_gt, second_passage_times_gt]
    )

# Contact duration
# Sample label: prefix of `condition` before the first "." plus an origin suffix.
durations_all["ctcf"] = (
    durations_all.condition.str.split(".", expand=True)[0] + "_cohesinon"
)
durations_all_gt["ctcf"] = (
    durations_all_gt.condition.str.split(".", expand=True)[0] + "_cohesinon_gt"
)
exp_durations["ctcf"] = exp_durations["condition"]

durations_all = pd.concat(
    [
        durations_all,
        durations_all_gt,
        exp_durations,
        durations_nocohesin,
        durations_nocohesin_gt,
    ]
)

_duration_by_sample = durations_all.groupby(["ctcf"]).contact_duration
print(f"\n\n{realistic_model_list}")
print("\n\n AVERAGE")
print(_duration_by_sample.mean())
print("\n\n Standard deviation")
print(_duration_by_sample.std())


In [None]:
# Per-group averages of the contact duration, used by the plot that follows.
tmp_df_cd = durations_all.copy()
# Grouping key: condition without its last two characters, followed by the
# sample label. Bracket assignment is required here: attribute assignment
# (`df.induction_time = ...`) only sets a column when one of that name already
# exists, otherwise it silently sets a plain attribute and the lookups below
# fail.
tmp_df_cd["induction_time"] = (
    tmp_df_cd.condition.map(lambda x: str(x)[:-2]) + tmp_df_cd.ctcf
)
# Mean contact duration within each `induction_time` group, repeated on every row.
tmp_df_cd["cont_dur_aver"] = (
    tmp_df_cd["contact_duration"].groupby(tmp_df_cd["induction_time"]).transform("mean")
)
# Spread of those row-wise means within each sample label.
tmp_df_cd["cont_dur_std"] = (
    tmp_df_cd["cont_dur_aver"].groupby(tmp_df_cd["ctcf"]).transform("std")
)

In [None]:
# Bar plot of the averaged contact duration per sample (error bars: standard
# deviation), with the individual averaged values overlaid as points.
box_plot = sns.barplot(x="ctcf", y="cont_dur_aver", ci="sd", data=tmp_df_cd)
ax = box_plot.axes
sns.scatterplot(
    x="ctcf",
    y="cont_dur_aver",
    data=tmp_df_cd,
    zorder=10,
    legend=False,
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.title(list(realistic_list))
plt.ylabel("Contact duration (1/koff)")
plt.xlabel("sample")
pdf.savefig(box_plot.figure, bbox_inches="tight")

In [None]:
# Label the pooled second passage times (prefix of `condition` before the
# first "." plus an origin suffix) and stack them with the experimental and
# no-cohesin tables.
for _table, _suffix in (
    (second_passage_times_all, "_cohesinon"),
    (second_passage_times_all_gt, "_cohesinon_gt"),
):
    _table["ctcf"] = _table.condition.str.split(".", expand=True)[0] + _suffix
exp_second_passage_times["ctcf"] = exp_second_passage_times["condition"]

_spt_tables = [
    second_passage_times_all,
    second_passage_times_all_gt,
    exp_second_passage_times,
    second_passage_times_nocohesin_gt,
    second_passage_times_nocohesin,
]
second_passage_times = pd.concat(_spt_tables)

In [None]:
# Per-group inverse mean second passage time, used by the plot that follows.
tmp_df_spt = second_passage_times.copy()
# Grouping key: condition without its last two characters, followed by the
# sample label. Bracket assignment is required here: attribute assignment
# (`df.induction_time = ...`) only sets a column when one of that name already
# exists, otherwise it silently sets a plain attribute and the lookups below
# fail.
tmp_df_spt["induction_time"] = (
    tmp_df_spt.condition.map(lambda x: str(x)[:-2]) + tmp_df_spt.ctcf
)
# Inverse of the mean second passage time within each `induction_time` group.
tmp_df_spt["spt_aver"] = 1 / (
    tmp_df_spt["second_passage_time"]
    .groupby(tmp_df_spt["induction_time"])
    .transform("mean")
)
# Spread of those row-wise values within each sample label.
tmp_df_spt["spt_std"] = (
    tmp_df_spt["spt_aver"].groupby(tmp_df_spt["ctcf"]).transform("std")
)

In [None]:
# Bar plot of the inverse mean second passage time per sample (error bars:
# standard deviation), with the individual values overlaid as points.
box_plot = sns.barplot(x="ctcf", y="spt_aver", ci="sd", data=tmp_df_spt)

ax = box_plot.axes
sns.scatterplot(
    x="ctcf",
    y="spt_aver",
    data=tmp_df_spt,
    zorder=10,
    legend=False,
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.title(list(realistic_list))
plt.ylabel("1/second_passage_time (kon)")
plt.xlabel("sample")
pdf.savefig(box_plot.figure, bbox_inches="tight")

# Distance distribution of best case and comparison with experimental data

In [None]:
# Compare distance distributions of the best-case simulation (rows with
# prediction 0, rows with prediction 1, all rows) with the experimental
# 1B1_0min distances that `exp_model` assigns to state 0.
best_subset = alldata[["rad21on_" + realistic_best in x for x in alldata.condition]].copy()
best_subset_on = best_subset[best_subset.ctcf == "on"].copy()
best_subset_off = best_subset[best_subset.ctcf == "off"].copy()
dist = exp_data_filtered[(exp_data_filtered.condition == "1B1_0min")].copy()

distance = dist.distance.values.reshape(-1, 1)
states = exp_model.predict(distance)
dist["states"] = states
dist["states_numeric"] = dist.states.values
# Assign the relabelled column back explicitly. The previous
# `dist.states.replace(..., inplace=True)` modifies an intermediate Series and
# is not guaranteed to update `dist` (it does not under pandas copy-on-write).
dist["states"] = dist["states"].replace({0: "looped", 1: "unlooped"})

# plt.hist is used only to compute the histograms; the scratch figure it
# draws on is closed right after.
distance_bins = np.arange(0, 5, 0.01)
hists = [
    plt.hist(values, density=True, alpha=0.3, bins=distance_bins)
    for values in (
        best_subset_on.distance[best_subset_on.prediction == 0],
        best_subset_on.distance[best_subset_on.prediction == 1],
        best_subset_on.distance,
        dist.distance[dist.states == "looped"],
    )
]
plt.close()

fig = plt.figure()
for hist in hists:
    # hist[0] are the bin heights, hist[1] the bin edges; heights are
    # rescaled so that they sum to 1.
    plt.plot(hist[1][:-1], hist[0] / np.sum(hist[0]))
plt.xlim(0, 1)
# NOTE(review): the track plots further down label distances in um -- confirm
# whether "nm" is the intended unit here.
plt.xlabel("Distance (nm)")
plt.ylabel("Fraction of distances")
plt.legend(["Sim looped", "Sim noloop", "all_sim", "exp 1B1_0min looped"])
plt.title(f"best case {realistic_best}")
pdf.savefig(fig)

In [None]:
# Distance distribution of the rows with prediction 0 in the best-case +CTCF
# simulation versus the experimental rows labelled "looped".
distance_bins = np.arange(0, 5, 0.01)
hists = [
    plt.hist(
        best_subset_on.distance[best_subset_on.prediction == 0],
        density=True,
        alpha=0.3,
        bins=distance_bins,
    ),
    plt.hist(
        dist.distance[dist.states == "looped"],
        density=True,
        alpha=0.3,
        bins=distance_bins,
    ),
]
# The histogram figure was only needed to obtain the counts.
plt.close()

fig = plt.figure()
for hist in hists:
    heights, edges = hist[0], hist[1]
    plt.plot(edges[:-1], heights / np.sum(heights))
plt.xlim(0, 1)
plt.xlabel("Distance (nm)")
plt.ylabel("Fraction of distances")
plt.legend(["Sim looped", "exp looped"])
plt.title(realistic_best)
pdf.savefig(fig)

In [None]:
# Distance distribution of the rows with prediction 0 for every parameter set
# in `realistic_list`, with the experimental "looped" distribution drawn on
# top as a thick black line.
conditions = realistic_list  # dist_together.sort_values('dist_gaussian_means').head(5).realistic
distance_bins = np.arange(0, 5, 0.01)

hists = []
for cond in conditions:
    # NOTE(review): this mask is not restricted to ctcf == "on" although the
    # variable is called `subset_on` -- confirm that is intended.
    matches_cond = ["rad21on_" + cond in label for label in alldata.condition]
    subset_on = alldata[matches_cond].copy()
    hist = plt.hist(
        subset_on.distance[subset_on.prediction == 0], alpha=0.3, bins=distance_bins
    )
    hists.append(hist)

hist = plt.hist(dist.distance[dist.states == "looped"], alpha=0.3, bins=distance_bins)
hists.append(hist)
conditions = np.append(conditions, "exp looped")
plt.close()

fig = plt.figure()
# Simulated curves first; the last entry of `hists` is the experimental one.
for hist in hists[:-1]:
    plt.plot(hist[1][:-1], hist[0] / np.sum(hist[0]))

exp_heights, exp_edges = hists[-1][0], hists[-1][1]
plt.plot(exp_edges[:-1], exp_heights / np.sum(exp_heights), color='black', linewidth=6)
plt.xlim(0, 1)
plt.xlabel("Distance (nm)")
plt.ylabel("Fraction of distances")
plt.legend(conditions)
plt.title("looped state")
pdf.savefig(fig)

In [None]:
# Overall distance distributions: best-case simulation with and without CTCF,
# the control simulation, and the four experimental conditions.
distance_bins = np.arange(0, 5, 0.01)
_distance_samples = [
    best_subset_on.distance,
    best_subset_off.distance,
    distances_control.distance,
]
_distance_samples += [
    exp.distance[exp.condition == name]
    for name in ("1B1_0min", "1B1_120min", "1A2_0min", "1A2_120min")
]

# plt.hist is used only to compute the histograms; the scratch figure is
# closed immediately afterwards.
hists = [
    plt.hist(values, density=True, alpha=0.3, bins=distance_bins)
    for values in _distance_samples
]
plt.close()

fig = plt.figure()
for hist in hists:
    heights, edges = hist[0], hist[1]
    plt.plot(edges[:-1], heights / np.sum(heights))
plt.xlim(0, 1)
plt.xlabel("Distance (nm)")
plt.ylabel("Fraction of distances")
plt.legend(["CTCF on", "CTCF off", "naked", "1B1_0min", "1B1_120min", "1A2_0min", "1A2_120min"])
plt.title(realistic_best)
pdf.savefig(fig)

In [None]:
# Print mean and variance (squared standard deviation) of the experimental
# distances, one line per condition.
lst_conds = ["1B1_0min", "1B1_120min", "1A2_0min", "1A2_120min"]
print("condition", '\t', 'mean', '\t','\t', '\t','variance (square of std)')
print("-------------------------------------------------")
for cond in lst_conds:
    cond_distances = exp.distance[exp.condition == cond]
    cond_mean = np.mean(cond_distances)
    cond_variance = np.std(cond_distances) ** 2
    print(cond, '\t', cond_mean, '\t', cond_variance)

# Example tracks with GT and prediction

In [None]:
# +/- ctcf tracks
# For each CTCF group of the best-case simulation, plot consecutive 250-row
# windows (rows 250 to 11999) showing the distance track together with the
# `bond` column (legend: "gt") and the `prediction` column (legend: "pred").
for ctcf, sub in best_subset.groupby("ctcf"):
    for limitstart in range(250, 12000, 250):
        limit = limitstart + 250
        # Frame index times 30 is plotted on the axis labelled in seconds.
        seconds = sub.frame.values[limitstart:limit] * 30
        fig = plt.figure()
        plt.plot(seconds, sub.distance.values[limitstart:limit])
        plt.plot(seconds, sub.bond.values[limitstart:limit], alpha=0.8)
        plt.plot(seconds, sub.prediction.values[limitstart:limit], alpha=0.5)
        plt.legend(['track', 'gt', 'pred'])
        plt.xlabel("time (s)")
        plt.ylabel("Distance (um)")
        plt.title("ctcf_" + ctcf + "_" + realistic_best)
        # Set the y-limits before showing, as the naked-polymer cell does;
        # applied after plt.show() they may no longer affect the displayed
        # or saved figure.
        plt.ylim(-0.2, 1.2)
        plt.show()
        pdf.savefig(fig)
        plt.close()

In [None]:
# naked polymer
# Re-predict the states of the control trajectories with the best-case model,
# then plot the same 250-row windows as for the +/- CTCF tracks.
m = results[realistic_model_best]
traj = distances_control.distance.values.reshape(-1, 1)
distances_control["prediction"] = m.predict(traj)

sub = distances_control.copy()
for ctcf, sub in distances_control.groupby("ctcf"):
    for limitstart in range(250, 12000, 250):
        limit = limitstart + 250
        window = slice(limitstart, limit)
        fig = plt.figure()

        # Three curves: measured distance, `bond` column and model prediction.
        for column_values, opacity in (
            (sub.distance.values, None),
            (sub.bond.values, 0.8),
            (sub.prediction.values, 0.5),
        ):
            plt.plot(sub.frame.values[window] * 30, column_values[window], alpha=opacity)
        plt.legend(['track', 'gt', 'pred'])
        plt.xlabel("time (s)")
        plt.ylabel("Distance (um)")
        plt.title("-cohesin " + realistic_best + "ctcf" + ctcf)
        plt.ylim(-0.2, 1.2)
        plt.show()
        pdf.savefig(fig)
        plt.close()

In [None]:
# Overlay the distance tracks of `distances` and `distances_nonoise`
# (legend: "+noise" / "-noise") in consecutive 250-row windows.
for limitstart in range(250, 12000, 250):
    limit = limitstart + 250
    window = slice(limitstart, limit)
    seconds = distances.frame.values[window] * 30
    fig = plt.figure()

    for track in (distances.distance.values, distances_nonoise.distance.values):
        plt.plot(seconds, track[window], alpha=0.8)
    plt.legend(['+noise', '-noise'])
    plt.xlabel("time (s)")
    plt.ylabel("Distance (um)")
    plt.ylim(-0.2, 1.2)
    plt.show()
    pdf.savefig(fig)
    plt.close()

In [None]:
# Close the PDF that the cells above saved their figures to via pdf.savefig
# (presumably a matplotlib PdfPages object -- it is created outside this part
# of the notebook).
pdf.close()