# Ground truth 1s and 30s data

In [None]:
# Load IPython's `autoreload` extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2
# NOTE(review): the two magics below repeat the two above; they look
# redundant — confirm before removing.
%reload_ext autoreload
%autoreload 2

In [None]:
from hmmlearn import hmm
from matplotlib.backends.backend_pdf import PdfPages
from utils import *
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import glob
import pathlib
import zipfile
import gdown
import os
import matplotlib
import matplotlib.colors as mcolors
# Set matplotlib's 'pdf.fonttype' option to 42 for every PDF written by this
# notebook (per the matplotlib rcParams docs, 42 selects TrueType embedding).
matplotlib.rcParams['pdf.fonttype'] = 42

In [None]:
# --- Global analysis settings ---
# Factor the raw `distance` column is multiplied by when tracks are loaded.
distance_conversion = 0.012 * np.sqrt(40)  # 1 a.u. = 0.08 um
rolling = 1

# Seed NumPy's global random generator when a reproducible run is requested.
reproducible = True
if reproducible:
    np.random.seed(42)

# When enabled, each loaded track is cut to its first `number_subsample` rows.
subsample = False
if subsample:
    number_subsample = 200000

In [None]:
import re  # local import: only needed here, to escape the dataset name

# Download (if needed) and unpack the simulation archive, then build a table
# with one row per `dst_bnd.dat` file and the parameters encoded in its path.
dataset_name = "langevin_release_on-off_220125_1s"
basedir = f"./{dataset_name}/"
# create the dataset folder
pathlib.Path(basedir).mkdir(parents=True, exist_ok=True)
# polymer simulations, single loop dataset
sample_link = "https://drive.google.com/uc?export=download&id=1nBcuQOslr-yaq4_Xsapu2safUpx6CQV7"
zip_path = basedir + f"{dataset_name}.zip"
# download data if not already present
if not os.path.isfile(zip_path):
    gdown.download(sample_link, zip_path)
# unzip the data
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(basedir)

# Simulation outputs sit five directory levels below the dataset folder.
list_files = glob.glob(f"{basedir}/*/*/*/*/*/dst_bnd.dat")
if not list_files:
    # Fail here with a clear message instead of several cells later.
    raise FileNotFoundError(
        f"No dst_bnd.dat files found under {basedir!r}; "
        "check that the archive was downloaded and extracted."
    )

# The five path components after the dataset folder are captured as
# ctcf / speed / loading / unloading / rep. The folder name is taken from
# `dataset_name` so the pattern cannot drift from the path built above.
path_pattern = (
    re.escape(dataset_name)
    + r"\/([\w+]*)\/([\d+]*)\/([\d]\.[\d+]*)\/([\d]\.[\d+]*)\/([\d+]*)\/"
)
filenames = pd.DataFrame(list_files, columns=["filename"])
filenames[["ctcf", "speed", "loading", "unloading", "rep"]] = filenames["filename"].str.extract(
    path_pattern,
    expand=True,
)
# `condition` identifies a parameter set; `uniqueid` identifies a single run.
filenames["condition"] = filenames["speed"] + "." + filenames["loading"] + "_" + filenames["unloading"]
filenames["uniqueid"] = (
    "ctcf"
    + filenames["ctcf"]
    + "."
    + "rad21"
    + "on"
    + "_"
    + filenames["speed"]
    + "."
    + filenames["loading"]
    + "."
    + filenames["unloading"]
    + "."
    + filenames["rep"]
)
# Keep only the files that belong to the five selected conditions.
top_conditions = ['17500.0.05_0.01', '17500.0.1_0.02', '17500.0.01_0.005', '17500.0.2_0.02', '17500.0.02_0.01']
filenames = filenames[filenames['condition'].isin(top_conditions)]

In [None]:
# Read every selected simulation file and stack them into one long table.
# Each file contributes its rows plus the metadata parsed from its path.
coordinate_columns = ["x1", "y1", "z1", "x2", "y2", "z2"]
alldf = []
# Grouping on the three keys at once visits the files in the same sorted
# order as nesting one groupby per key.
for (condition, uniqueid, ct), sub in filenames.groupby(["condition", "uniqueid", "ctcf"]):
    for file in sub.filename.unique():
        tmp = pd.read_csv(file, sep=" ")
        # Rescale the raw distance with the global conversion factor.
        tmp["distance"] = tmp["distance"] * distance_conversion
        # Coordinate columns are not used later; drop whichever are present.
        # `errors="ignore"` tolerates files without them while still letting
        # any other failure surface.
        tmp = tmp.drop(columns=coordinate_columns, errors="ignore")
        tmp['condition'] = condition
        tmp['frame'] = np.arange(len(tmp))
        tmp['uniqueid'] = uniqueid
        tmp['ctcf'] = ct
        tmp['rad21'] = "on"
        if subsample:
            # Keep only the first `number_subsample` rows of each track.
            tmp = tmp.head(number_subsample)
        alldf.append(tmp)
distances_original = pd.concat(alldf)

In [None]:
# Sampling / neighbourhood settings for the 1 s analysis.
nneighbor = 2
subsample_every = 1


# The columns between the first one and the last five carry names such as
# "491-509"; evaluating a name gives the difference of its two numbers.
# NOTE(review): `eval` is applied to column names read from the downloaded
# files — acceptable only while that data is trusted.
pair_columns = distances_original.columns[1:][:-5]
pair_gaps = np.array([abs(eval(name)) for name in pair_columns])
idx = np.where(pair_gaps < 18 - nneighbor)[0]
# +1 maps positions within `pair_columns` back to positions in the full table.
names_remove = distances_original.columns[idx + 1]

# Take every `subsample_every`-th row and drop the pair columns whose
# separation is below 18 - nneighbor.
distances = distances_original.iloc[::subsample_every].copy()
distances = distances.drop(names_remove, axis=1)

# Ground-truth bond: 1 when any remaining pair column is set, else 0.
kept_gaps = np.array([abs(eval(name)) for name in distances.columns[1:][:-5]])
idx = np.where(kept_gaps >= 18 - nneighbor)[0] + 1
bond_total = distances.iloc[:, idx].sum(axis=1)
distances["bond"] = bond_total.clip(upper=1)

In [None]:
# Remove the six listed pair columns from the table, in place.
pair_columns_to_drop = ['491-509', '491-508', '491-507', '492-509', '492-508', '493-509']
distances.drop(columns=pair_columns_to_drop, inplace=True)

In [None]:
# Split the table into the single "best" condition and the five-condition set.
best_condition = '17500.0.05_0.01'
best5_conditions = [
    '17500.0.05_0.01',
    '17500.0.1_0.02',
    '17500.0.01_0.005',
    '17500.0.2_0.02',
    '17500.0.02_0.01',
]
best = distances[distances.condition == best_condition].copy()
best5 = distances[distances.condition.isin(best5_conditions)].copy()

# Numeric summary columns start empty and are filled in later.
for i in ['duration', 'spt', 'ft']:
    best5[i] = np.nan
    best[i] = np.nan
# `rate` later receives text labels, so create it as an object column rather
# than float NaN; writing strings into a float column is an incompatible-dtype
# assignment in pandas.
best5['rate'] = None
best['rate'] = None

In [None]:
# Open the multi-page PDF that receives the figures saved via `pdf.savefig`.
pdf = PdfPages('gt_only.pdf')
dict_durations = {}
dict_spt = {}
fig = plt.figure()

# --- single best condition: one pass per CTCF setting ---
for ctcf in best.ctcf.unique():
    in_ctcf = best.ctcf == ctcf
    # `calc_loops_gaps_total_len_gt` comes from `utils`; presumably it returns
    # the lengths of the looped and unlooped stretches — confirm there.
    best_loops, best_gaps = calc_loops_gaps_total_len_gt(best[in_ctcf], rev=True)
    # NOTE(review): keyed by condition only, so each CTCF setting overwrites
    # the entry written by the previous one.
    dict_durations[best.condition.unique()[0]] = best_loops
    dict_spt[best.condition.unique()[0]] = best_gaps
    plt.hist(best_loops, alpha=0.5)
    plt.title("Distribution of contact durations, 1s")
    best.loc[in_ctcf, 'duration'] = np.mean(best_loops)
    best.loc[in_ctcf, 'spt'] = np.mean(1/np.array(best_gaps))
    best.loc[in_ctcf, 'ft'] = np.sum(best_loops) / (np.sum(best_loops) + np.sum(best_gaps))
# Label the rows the same way as for `best5` below; replacing the whole column
# avoids writing strings into a column that was created as float NaN.
best = best.assign(rate = 'best')

# --- five-condition set: one pass per (CTCF setting, condition) ---
for ctcf in best5.ctcf.unique():
    for cond in best5.condition.unique():
        selected = (best5.ctcf==ctcf)&(best5.condition==cond)
        loops, gaps = calc_loops_gaps_total_len_gt(best5[selected], rev=True)
        best5.loc[selected, "duration"] = np.mean(loops)
        best5.loc[selected, "spt"] = np.mean(1/np.array(gaps))
        best5.loc[selected, "ft"] = np.sum(loops) / (np.sum(loops) + np.sum(gaps))
best5 = best5.assign(rate = 'best_five')

# Stack both selections; `mix` identifies one (ctcf, rate, condition) group.
entire_dataset=pd.concat([best, best5])
entire_dataset['mix']=entire_dataset["ctcf"] +"_"+ entire_dataset["rate"]+"_"+ entire_dataset["condition"]
pdf.savefig(fig)

In [None]:
# Bar chart of contact duration per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
shared_axes = dict(hue="ctcf", x="rate", y="duration")
ax = sns.barplot(data=entire_dataset, ci="sd", capsize=0.1, **shared_axes)
per_mix = entire_dataset.groupby("mix").max()
sns.scatterplot(data=per_mix, ax=ax, legend=False, zorder=10, **shared_axes)
# Write each bar's value inside the bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color='gray', fontsize=14)
plt.title('contact duration, 1s')
pdf.savefig(fig)

In [None]:
# Bar chart of the `spt` column per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
shared_axes = dict(hue="ctcf", x="rate", y="spt")
ax = sns.barplot(data=entire_dataset, ci="sd", capsize=0.1, **shared_axes)
per_mix = entire_dataset.groupby("mix").max()
sns.scatterplot(data=per_mix, ax=ax, legend=False, zorder=10, **shared_axes)
# Write each bar's value inside the bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color='gray', fontsize=14)
plt.title('1 / second passage time, 1s')
pdf.savefig(fig)

In [None]:
# Bar chart of the `ft` column per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
shared_axes = dict(hue="ctcf", x="rate", y="ft")
ax = sns.barplot(data=entire_dataset, ci="sd", capsize=0.1, **shared_axes)
per_mix = entire_dataset.groupby("mix").max()
sns.scatterplot(data=per_mix, ax=ax, legend=False, zorder=10, **shared_axes)
# Write each bar's value inside the bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color='gray', fontsize=14)
plt.title('fraction of time in a looped state, 1s')
# `ft` is a ratio of a part to its total, so show the full 0-1 range.
plt.ylim(0,1)
pdf.savefig(fig)

In [None]:
# For every `mix` group, page through rows 250 up to 12000 in windows of 250
# rows and save one figure per window: the distance trace together with the
# inverted ground-truth bond signal (1 - bond).
window = 250
for cond, sub in entire_dataset.groupby("mix"):
    print(cond)
    frames = sub.frame.values
    track = sub.distance.values
    inverted_bond = 1 - sub.bond.values
    for start in range(window, 12000, window):
        rows = slice(start, start + window)
        fig = plt.figure()

        plt.plot(frames[rows], track[rows])
        plt.plot(frames[rows], inverted_bond[rows], alpha=0.8)
        plt.legend(['track', 'gt'])
        plt.xlabel("time (s)")
        plt.ylabel("Distance (um)")
        plt.title(f"Condition {cond}")
        plt.ylim(-0.2,1.2)
        plt.show()
        pdf.savefig(fig)
        plt.close()

### Repeat everything, but with the data subsampled to one frame every 30 s

In [None]:
# Sampling / neighbourhood settings for the 30 s analysis.
nneighbor = 2
subsample_every = 30

# The columns between the first one and the last five carry names such as
# "491-509"; evaluating a name gives the difference of its two numbers.
# NOTE(review): `eval` is applied to column names read from the downloaded
# files — acceptable only while that data is trusted.
idx = np.where(
    np.array([abs(eval(x)) for x in distances_original.columns[1:][:-5]])
    < 18 - nneighbor
)[0]
# +1 maps positions within the sliced column list back to the full table.
names_remove = distances_original.columns[idx + 1]

# Take every `subsample_every`-th row and drop the pair columns whose
# separation is below 18 - nneighbor.
distances = distances_original.iloc[::subsample_every].copy().drop(names_remove, axis=1)

# Ground-truth bond: 1 when any remaining pair column is set, else 0.
idx = (
    np.where(
        np.array([abs(eval(x)) for x in distances.columns[1:][:-5]]) >= 18 - nneighbor
    )[0]
    + 1
)
distances["bond"] = distances.iloc[:, idx].sum(axis=1)
distances.loc[distances.bond > 1, "bond"] = 1
distances.drop(['491-509', '491-508', '491-507', '492-509', '492-508', '493-509'], axis=1, inplace=True)

# Split the table into the single "best" condition and the five-condition set.
best_condition = '17500.0.05_0.01'
best5_conditions = [
    '17500.0.05_0.01',
    '17500.0.1_0.02',
    '17500.0.01_0.005',
    '17500.0.2_0.02',
    '17500.0.02_0.01',
]
best = distances[distances.condition == best_condition].copy()
best5 = distances[distances.condition.isin(best5_conditions)].copy()

# Numeric summary columns start empty and are filled in later.
for i in ['duration', 'spt', 'ft']:
    best5[i] = np.nan
    best[i] = np.nan
# `rate` later receives text labels, so create it as an object column rather
# than float NaN; writing strings into a float column is an incompatible-dtype
# assignment in pandas.
best5['rate'] = None
best['rate'] = None

In [None]:
# Same statistics as for the 1 s data, computed on the subsampled tracks.
# Lengths come back in frames; one frame now covers `subsample_every` rows of
# the original data (assumed to be 1 s each, as the figure titles state), so
# durations are multiplied and rates divided by that factor.
seconds_per_frame = subsample_every
dict_durations = {}
dict_spt = {}
fig = plt.figure()

# --- single best condition: one pass per CTCF setting ---
for ctcf in best.ctcf.unique():
    in_ctcf = best.ctcf == ctcf
    # `calc_loops_gaps_total_len_gt` comes from `utils`; presumably it returns
    # the lengths of the looped and unlooped stretches — confirm there.
    best_loops, best_gaps = calc_loops_gaps_total_len_gt(best[in_ctcf], rev=True)
    # NOTE(review): keyed by condition only, so each CTCF setting overwrites
    # the entry written by the previous one.
    dict_durations[best.condition.unique()[0]] = best_loops
    dict_spt[best.condition.unique()[0]] = best_gaps
    plt.hist(best_loops, alpha=0.5)
    plt.title("Distribution of contact durations, 30s")
    best.loc[in_ctcf, 'duration'] = np.mean(best_loops)*seconds_per_frame
    best.loc[in_ctcf, 'spt'] = np.mean(1/np.array(best_gaps)/seconds_per_frame)
    best.loc[in_ctcf, 'ft'] = np.sum(best_loops) / (np.sum(best_loops) + np.sum(best_gaps))
# Label the rows the same way as for `best5` below; replacing the whole column
# avoids writing strings into a column that was created as float NaN.
best = best.assign(rate = 'best')

# --- five-condition set: one pass per (CTCF setting, condition) ---
for ctcf in best5.ctcf.unique():
    for cond in best5.condition.unique():
        selected = (best5.ctcf==ctcf)&(best5.condition==cond)
        loops, gaps = calc_loops_gaps_total_len_gt(best5[selected], rev=True)
        # NOTE(review): keyed by condition only, so later CTCF settings
        # overwrite earlier ones (including the entries written above).
        dict_durations[cond] = loops
        dict_spt[cond] = gaps
        best5.loc[selected, "duration"] = np.mean(loops)*seconds_per_frame
        best5.loc[selected, "spt"] = np.mean(1/np.array(gaps)/seconds_per_frame)
        best5.loc[selected, "ft"] = np.sum(loops) / (np.sum(loops) + np.sum(gaps))
best5 = best5.assign(rate = 'best_five')

# Stack both selections; `mix` identifies one (ctcf, rate, condition) group.
entire_dataset=pd.concat([best, best5])
entire_dataset['mix']=entire_dataset["ctcf"] +"_"+ entire_dataset["rate"]+"_"+ entire_dataset["condition"]
pdf.savefig(fig)

In [None]:
# Bar chart of contact duration per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
shared_axes = dict(hue="ctcf", x="rate", y="duration")
ax = sns.barplot(data=entire_dataset, ci="sd", capsize=0.1, **shared_axes)
per_mix = entire_dataset.groupby("mix").max()
sns.scatterplot(data=per_mix, ax=ax, legend=False, zorder=10, **shared_axes)
# Write each bar's value inside the bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color='gray', fontsize=14)
plt.title('contact duration, 30s data')
pdf.savefig(fig)

In [None]:
# Bar chart of the `spt` column per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
ax = sns.barplot(
    data=entire_dataset,
    hue="ctcf",
    x="rate",
    y="spt",
    ci="sd",
    capsize=0.1,
)
# Overlay one marker per `mix` group, as the other summary plots do. Every
# row of a group carries the same `spt`, so plotting all rows only stacked
# identical markers on top of each other.
sns.scatterplot(
    data=entire_dataset.groupby("mix").max(),
    hue="ctcf",
    x="rate",
    y="spt",
    ax=ax,
    legend=False,
    zorder=10)
# Write each bar's value inside the bar.
for container in ax.containers:
    ax.bar_label(container, label_type='center', color='gray', fontsize=14)
plt.title('1 / second passage time, 30s data')
pdf.savefig(fig)

In [None]:
# Bar chart of the `ft` column per rate group, split by CTCF setting
# (error bars: "sd"), with one marker per `mix` group drawn on top.
fig = plt.figure()
shared_axes = dict(hue="ctcf", x="rate", y="ft")
ax = sns.barplot(data=entire_dataset, ci="sd", capsize=0.1, **shared_axes)
per_mix = entire_dataset.groupby("mix").max()
sns.scatterplot(data=per_mix, ax=ax, legend=False, zorder=10, **shared_axes)
# Write each bar's value inside the bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='center', color='gray', fontsize=14)
plt.title('fraction of time in a looped state, 30s data')
# `ft` is a ratio of a part to its total, so show the full 0-1 range.
plt.ylim(0,1)
pdf.savefig(fig)

In [None]:
# Close the PdfPages object now that all figures have been saved into it.
pdf.close()