In [None]:
from pathlib import Path

from multi_padlock_design.backbone_design import hybprobe

# Generates sequences that can be used as hybridisation binding sites for new padlocks

This small pipeline first generates many random sequences that satisfy a set of common primer-binding constraints, such as GC content, melting temperature, and sequence length.
It then filters these random sequences down to a final list of candidate sequences that have no off-target binding anywhere in the mouse transcriptome above a given melting-temperature cutoff.

In [None]:
# Tunable settings for the whole run; every value is looked up by key.
params = dict(
    # --- candidate sequence constraints ---
    n_sequences=2000,  # number of random candidates to generate
    min_len=20,
    max_len=30,
    gc_min=0.40,
    gc_max=0.60,
    require_gc_clamps=True,  # GC at both ends
    # --- nearest-neighbor Tm window (readblast.calc_tm_NN) ---
    tm_min=50.0,
    tm_max=70.0,
    # --- BLAST filtering ---
    blast_tm_cutoff=21.0,  # reject if any BLAST hit has cseq-based Tm > cutoff
    # --- IO / compute ---
    # NOTE(review): absolute, site-specific path — adjust when running elsewhere.
    work_dir="/nemo/lab/znamenskiyp/scratch/probe_queries",
    job_name_tag="rand_probes",
    sbatch=True,
    conda_env="iss-preprocess",
    seed=42,
)

In [None]:
# 1) Generate random candidate probes
# The seed is set before anything else; a missing "seed" key yields None.
# Presumably this makes the random draw repeatable — confirm in hybprobe.set_seed.
hybprobe.set_seed(params.get("seed"))

# Normalise the working directory to an absolute path and make sure it exists.
work_dir = Path(params["work_dir"]).expanduser().resolve()
work_dir.mkdir(parents=True, exist_ok=True)

gene_tag = hybprobe.make_job_tag(params["job_name_tag"])

# Apart from `n`, every generator argument has the same name as its params
# key, so those are forwarded by name in one go.
df_candidates = hybprobe.generate_candidates(
    n=params["n_sequences"],
    **{
        key: params[key]
        for key in (
            "min_len",
            "max_len",
            "gc_min",
            "gc_max",
            "require_gc_clamps",
            "tm_min",
            "tm_max",
        )
    },
)
print(f"Generated {len(df_candidates)} candidates -> {gene_tag}")
df_candidates.head()

In [None]:
# 2) Write CSV and submit parallel BLAST jobs
csv_path = hybprobe.write_probe_csv_for_blast(
    df_candidates,
    work_dir,
    gene_tag,
)
print("Wrote:", csv_path)

# The value returned by the submission call is deliberately discarded.
_ = hybprobe.submit_parallel_blast(
    csv_path,
    work_dir,
    sbatch=params["sbatch"],
)

# Presumably blocks until the BLAST outputs are present — confirm in hybprobe.
outs = hybprobe.wait_for_blast_outputs(work_dir)
len(outs)  # shown as the cell result

In [None]:
# 3) Parse BLAST outputs and compute Tm (cseq) for hits
# The parsed outputs from work_dir are fed straight into the Tm computation;
# only the result of that second step is kept under the name `parsed`.
parsed = hybprobe.compute_tm_hit_cseq(
    hybprobe.parse_blast_outputs(work_dir)
)
parsed.head()

# Final probes with BLAST results

In [None]:
# 4) Summarize keep/reject and export results
df_out, out_table = hybprobe.summarize_candidates(
    df_candidates,
    parsed,
    params["blast_tm_cutoff"],
    work_dir,
    gene_tag,
)
print("Wrote:", out_table)

# Preview the first 20 rows: descending on `kept` (True first if it is
# boolean), then ascending on `max_blast_tm_cseq` within each group.
df_out.sort_values(
    by=["kept", "max_blast_tm_cseq"],
    ascending=[False, True],
).head(20)