In [1]:
from dnallm import load_config, load_model_and_tokenizer
from dnallm import DNAInterpret, Mutagenesis

In [2]:
# Load configurations
# The path is relative to the kernel's working directory; the returned object
# is indexed by section further down (e.g. configs['task']).
configs = load_config("./inference_config.yaml")

In [3]:
# Load the model and its tokenizer for the named checkpoint.
# source="modelscope" matches the ModelScope download reported in the cell output.
model_name = "zhangtaolab/plant-dnabert-BPE-promoter_strength_leaf"
model, tokenizer = load_model_and_tokenizer(
    model_name,
    task_config=configs['task'],
    source="modelscope",
)

Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE-promoter_strength_leaf
00:41:18 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE-promoter_strength_leaf


In [4]:
# Build the attribution helper around the loaded model and tokenizer, passing
# it the same configuration object that was loaded above.
interpreter = DNAInterpret(model, tokenizer, config=configs)

In [5]:
# Single input sequence used by the one-off interpret() and mutagenesis cells.
# It is stored as fixed-width fragments and joined into one string.
sequence = "".join([
    "AATATATTTAATCGGTGTATAATTTCTGTGAAGATCCTCGATACTTCATATAAGAGATTTTGAGAGAGAGAGAGA",
    "ACCAATTTTCGAATGGGTGAGTTGGCAAAGTATTCACTTTTCAGAACATAATTGGGAAACTAGTCACTTTACTAT",
    "TCAAAATTTGCAAAGTAGTC",
])

# Batch of three input sequences used by batch_interpret(); each tuple of
# fragments below is joined into one string, keeping the original order.
sequences = [
    "".join(fragments)
    for fragments in (
        (
            "AACACTCTATTTCGGGTATTGTCTCTGTGTTCCTTTAGCGGCGGCTTTACTTTAGATTCTTCTAGGGTTTCTAGA",
            "TTGTATACCCTAGATAAGCATCCTATAAAGTAAACACAAGTACTTGCAGAGACTTTAGATTAGAGGGCTAGCGAC",
            "TGCAGAAGAAGAGTAACACG",
        ),
        (
            "TAAAGGAACATATTCCCGTCATAAAGAAAAGTTGACTATATTTAGCCCATGCAAAAAGAAAATAGATAAATTTAG",
            "AAATCTATATGCATATATTCCTTCTCAAGGGTTATAAAAAGAGAGCACATCCATGTGAGGAATGAGGCAACACAT",
            "ATTGAGAGTAATAAAGAGTA",
        ),
        (
            "TTCACCAGCTAGGCCATAGTGCCGGCCCTTGCACAATGTTGTATCTGATCACCTAGCTAGTGTGAAGTGTTTGGA",
            "GGAACTCTAGGTGTTATCCAGCAATGTTTCATAGTTTGTGAAACTGTAAAAGGTTTTGGTAAGACGATGATCAGA",
            "TTTGGTGTTATCATGAGTTC",
        ),
    )
]

In [11]:
# Run attribution on the single `sequence` with method="deeplift" against
# target index 0; the call returns a (tokens, scores) pair.
# NOTE(review): the name `lig_scores` hints at a different attribution method
# than the "deeplift" requested here — confirm the name is intentional.
tokens, lig_scores = interpreter.interpret(sequence, method="deeplift", target=0)

In [12]:
# Token-style attribution plot. No data is passed in, so this presumably draws
# the result stored by the most recent interpret call — TODO confirm.
interpreter.plot_attributions(plot_type="token")

In [13]:
# Same attributions rendered with plot_type="line" instead of "token".
# Presumably reads the result stored on `interpreter` — TODO confirm.
interpreter.plot_attributions(plot_type="line")

In [9]:
# Batch attribution over all entries of `sequences` with method="deeplift";
# `targets` supplies one target index (0) per input sequence.
attributions = interpreter.batch_interpret(sequences, method="deeplift", targets=[0] * len(sequences))

In [10]:
# Plot with the default plot_type. Presumably this shows the batch result
# produced by batch_interpret() above — TODO confirm.
interpreter.plot_attributions()



In [None]:
# Set up the in-silico mutagenesis analyzer on the same model, tokenizer and
# configuration used for the attribution cells.
mutagenesis = Mutagenesis(
    model=model,
    tokenizer=tokenizer,
    config=configs,
)

In [None]:
# Register `sequence` for mutagenesis. replace_mut=True presumably enables
# substitution-type mutations — TODO confirm against the Mutagenesis API.
mutagenesis.mutate_sequence(sequence, replace_mut=True)

In [None]:
# Evaluate the analyzer; evaluate() takes no arguments here, so it presumably
# scores the mutations prepared by mutate_sequence() above — TODO confirm.
preds = mutagenesis.evaluate()

In [None]:
# Display the raw evaluation result for inspection.
preds

In [None]:
import pandas as pd

# Derive per-base scores and hotspot windows from the mutagenesis predictions.
base_scores = mutagenesis.process_ism_data(preds)
hotspots = mutagenesis.find_hotspots(preds, window_size=10, percentile_threshold=90.0)

# prepare_tfmodisco_inputs() is given a one-element list and returns three
# values; only the middle one is used, and its first entry belongs to `preds`.
_, hyp_scores, _ = mutagenesis.prepare_tfmodisco_inputs([preds])
hyp_scores = hyp_scores[0]

# Map "hotspot_<start>-<end>" to the rows start:end of hyp_scores, labelled
# with the four nucleotide columns.
hotspot_motifs = {}
for start, end in hotspots:
    hotspot_motifs[f"hotspot_{start}-{end}"] = pd.DataFrame(
        hyp_scores[start:end, :],
        columns=['A', 'C', 'G', 'T'],
    )

In [None]:
def plot_motif_logo(
    motif_df: pd.DataFrame,
    logo_type: str = 'bits',
    title: str = "Discovered Motif",
    font_name: str = 'Arial Rounded MT Bold',
):
    """Plot a sequence logo from a motif matrix using Logomaker.

    Args:
        motif_df: Matrix handed to ``logomaker.Logo`` (one row per position,
            one column per character). For ``logo_type='bits'`` it is treated
            as a probability matrix.
        logo_type: ``'bits'`` converts ``motif_df`` from probability to
            information with ``logomaker.transform_matrix`` before plotting;
            ``'weights'`` plots ``motif_df`` unchanged.
        title: Title set on the logo's axes.
        font_name: Font passed to ``logomaker.Logo``. The default keeps the
            font that used to be hard-coded; override it on machines where
            that font is not installed.

    Raises:
        ValueError: If ``logo_type`` is neither ``'bits'`` nor ``'weights'``.
            Raised before anything is imported or printed.
    """
    # Validate up front so a bad logo_type fails fast with the intended error,
    # instead of after the optional plotting imports and the progress message.
    if logo_type not in ('bits', 'weights'):
        raise ValueError("logo_type must be 'bits' or 'weights'")

    # Imported lazily so the plotting stack is only needed when a logo is drawn.
    import logomaker
    import matplotlib.pyplot as plt

    print(f"Generating '{logo_type}' logo plot for: {title}")

    if logo_type == 'bits':
        logo_df = logomaker.transform_matrix(motif_df, from_type='probability', to_type='information')
        y_label = 'Bits'
    else:
        logo_df = motif_df
        y_label = 'Contribution Score'

    logo = logomaker.Logo(logo_df, font_name=font_name)
    # Hide every spine first, then re-enable only the left and bottom axes.
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True)
    logo.ax.set_ylabel(y_label)
    logo.ax.set_title(title)
    plt.show()

In [None]:
# Plot the first discovered hotspot as a contribution-score logo.
regions = list(hotspot_motifs.keys())
if regions:
    plot_motif_logo(hotspot_motifs[regions[0]], logo_type='weights', title="Hotspot Motif")
else:
    # hotspot_motifs is empty when no hotspots were found; say so explicitly
    # rather than failing with a bare IndexError on regions[0].
    print("No hotspots were found; there is no motif to plot.")

In [None]:
# Build the mutagenesis plot from the predictions and keep a handle to it;
# the bare name on the last line makes the notebook display the returned object.
pmut = mutagenesis.plot(preds)
pmut