In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.3 MB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.3 MB[0m [31m25.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [3]:
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

fasta_path = "/content/drive/MyDrive/genome.fna"
output_dir = "chunks"
chunk_size = 20_000_000
overlap = 30                   # перекрытие между чанками (в нуклеотидах), чтобы не терять Z-днк на концах

os.makedirs(output_dir, exist_ok=True)

def split_fasta_with_overlap():
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = record.seq
        total_len = len(seq)
        step = chunk_size - overlap
        for i in range(0, total_len, step):
            chunk_seq = seq[i:i + chunk_size]
            chunk_id = f"{record.id}_chunk_{i//step + 1}"
            chunk_record = SeqRecord(chunk_seq, id=chunk_id, description="")
            chunk_path = os.path.join(output_dir, f"{chunk_id}.fasta")
            SeqIO.write(chunk_record, chunk_path, "fasta")
            print(f"Сохранили {chunk_path} ({len(chunk_seq)} bp)")

split_fasta_with_overlap()

Сохранили chunks/NC_081552.1_chunk_1.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_2.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_3.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_4.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_5.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_6.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_7.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_8.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_9.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_10.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_11.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_12.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_13.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_14.fasta (11965019 bp)
Сохранили chunks/NC_081553.1_chunk_1.fasta (20000000 bp)
Сохранили chunks/NC_081553.1_chunk_2.fasta (20000000 bp)
Сохранили chunks/NC_081553.1_chunk_3.fasta (20000000 bp)
Сохранили chunks/NC_081553

In [4]:
import os

chunks_dir = "chunks"
zhunt_output_dir = "drive/MyDrive/zhunt_output"

# Получаем список всех чанков, которые ты генерировала
all_chunks = [f for f in os.listdir(chunks_dir) if f.endswith(".fasta")]
all_chunk_ids = {os.path.splitext(f)[0] for f in all_chunks}

# Получаем список всех файлов zhunt, которые реально отработали
zhunt_results = [f for f in os.listdir(zhunt_output_dir) if f.endswith(".Z-SCORE")]
zhunt_chunk_ids = {os.path.splitext(f)[0] for f in zhunt_results}

# Считаем
total_chunks = len(all_chunk_ids)
processed_chunks = len(zhunt_chunk_ids)
coverage_fraction = round(processed_chunks / total_chunks, 3)

print(f"Обработано чанков: {processed_chunks} из {total_chunks}")
print(f"Покрытие по zhunt: {coverage_fraction * 100}%")

Обработано чанков: 230 из 347
Покрытие по zhunt: 66.3%


In [1]:
import os
import pandas as pd
from tqdm import tqdm

zhunt_dir = "/content/drive/MyDrive/zhunt_output"
output_csv = "zdna_filtered.csv"

pd.DataFrame(columns=["chunk", "start", "end", "score"]).to_csv(output_csv, index=False)

for file in tqdm(os.listdir(zhunt_dir)):
    if file.endswith(".Z-SCORE"):
        chunk_id = file.replace(".Z-SCORE", "")
        file_path = os.path.join(zhunt_dir, file)

        with open(file_path, 'r') as f:
            lines = f.readlines()
            if len(lines) <= 1:
                continue

        try:

            df = pd.read_csv(
                file_path,
                sep='\s+',
                skiprows=1,
                header=None,
                usecols=[0, 1, 2, 5],
                names=["start", "end", "length", "score"]
            )
            df = df[df["score"] > 400]
            df["chunk"] = chunk_id

            df[["chunk", "start", "end", "score"]].to_csv(
                output_csv,
                mode='a',
                header=False,
                index=False
            )
        except Exception as e:
            print(f"Ошибка в {file}: {e}")

print("Готово! Результаты в zdna_filtered.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["chunk"] = chunk_id  # Добавляем имя чанка
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["chunk"] = chunk_id  # Добавляем имя чанка
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["chunk"] = chunk_id  # Добавляем имя чанка
A value is trying to be set on a copy of a slice from a DataFrame.


Готово! Результаты в zdna_filtered.csv





In [19]:
import pandas as pd

zdna = pd.read_csv("zdna_filtered.csv")  # тот csv, который у тебя получился
zdna.columns = ["seqid", "start", "end", "score"]

gff = pd.read_csv("genomic.gff", sep="\t", comment='#', header=None)
gff.columns = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]

In [20]:
zdna

Unnamed: 0,seqid,start,end,score
0,NW_026739343.1_chunk_1,7598,7620,476.5783
1,NW_026739343.1_chunk_1,7600,7620,569.3077
2,NW_026739343.1_chunk_1,7602,7620,670.6803
3,NW_026739343.1_chunk_1,7604,7620,801.4424
4,NW_026739343.1_chunk_1,7606,7622,495.8953
...,...,...,...,...
633577,NW_026739213.1_chunk_1,32995,33011,828.6824
633578,NW_026739213.1_chunk_1,32996,33012,3308.6160
633579,NW_026739213.1_chunk_1,32997,33013,731.2843
633580,NW_026739213.1_chunk_1,42833,42849,559.9668


In [21]:
gff

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,NC_081552.1,RefSeq,region,1,271964629,.,+,.,ID=NC_081552.1:1..271964629;Dbxref=taxon:35570...
1,NC_081552.1,Gnomon,gene,966,22098,.,+,.,ID=gene-LOC131997755;Dbxref=GeneID:131997755;N...
2,NC_081552.1,Gnomon,lnc_RNA,966,22098,.,+,.,ID=rna-XR_009398415.1;Parent=gene-LOC131997755...
3,NC_081552.1,Gnomon,exon,966,1124,.,+,.,ID=exon-XR_009398415.1-1;Parent=rna-XR_0093984...
4,NC_081552.1,Gnomon,exon,1185,3100,.,+,.,ID=exon-XR_009398415.1-2;Parent=rna-XR_0093984...
...,...,...,...,...,...,...,...,...,...
378417,NW_026739412.1,RefSeq,region,1,29435,.,+,.,ID=NW_026739412.1:1..29435;Dbxref=taxon:35570;...
378418,NW_026739413.1,RefSeq,region,1,29751,.,+,.,ID=NW_026739413.1:1..29751;Dbxref=taxon:35570;...
378419,NW_026739414.1,RefSeq,region,1,38434,.,+,.,ID=NW_026739414.1:1..38434;Dbxref=taxon:35570;...
378420,NW_026739415.1,RefSeq,region,1,26270,.,+,.,ID=NW_026739415.1:1..26270;Dbxref=taxon:35570;...


In [22]:
def annotate_feature(df, features, feature_name):
    annotations = []
    for i, row in df.iterrows():
        matched = features[
            (features["seqid"] == row["seqid"]) &
            (features["start"] <= row["end"]) &
            (features["end"] >= row["start"])
        ]
        if not matched.empty:
            annotations.append(feature_name)
        else:
            annotations.append(None)
    return annotations

In [23]:
zdna["Exon"] = annotate_feature(zdna, gff[gff["type"] == "exon"], "exon")
zdna["Intron"] = annotate_feature(zdna, gff[gff["type"] == "intron"], "intron")
zdna["Gene"] = annotate_feature(zdna, gff[gff["type"] == "gene"], "gene")

genes = gff[gff["type"] == "gene"].copy()
promoters = genes.copy()
promoters["start"], promoters["end"] = promoters["start"] - 1000, promoters["start"]
zdna["Promoter"] = annotate_feature(zdna, promoters, "promoter")

downstream = genes.copy()
downstream["start"], downstream["end"] = downstream["end"], downstream["end"] + 200
zdna["Downstream"] = annotate_feature(zdna, downstream, "downstream")


In [None]:
import pandas as pd

zdna['chrom'] = zdna['chunk'].str.split('_chunk_').str[0]

def intersects(row1, row2):
    if row1['chrom'] != row2['seqid']:
        return False
    return not (row1['end'] < row2['start'] or row1['start'] > row2['end'])

def annotate_feature(zdna_df, gff_df, feature_type):
    features = gff_df[gff_df['type'] == feature_type]
    hits = []
    for _, zdna_row in zdna_df.iterrows():
        hit = features.apply(lambda gff_row: intersects(zdna_row, gff_row), axis=1).any()
        hits.append(hit)
    return hits

genes = gff[gff['type'] == 'gene'].copy()

promoters = genes.copy()
promoters['start'] = promoters['start'] - 1000
promoters['end'] = promoters['start'] + 1000
promoters['type'] = 'promoter'

downstream = genes.copy()
downstream['start'] = downstream['end']
downstream['end'] = downstream['end'] + 200
downstream['type'] = 'downstream'

zdna['Exons'] = annotate_feature(zdna, gff, 'exon')
zdna['Introns'] = annotate_feature(zdna, gff, 'intron')
zdna['Genes'] = annotate_feature(zdna, gff, 'gene')
zdna['Promoters (1000 up from TSS)'] = annotate_feature(zdna, promoters, 'promoter')
zdna['Downstream (200 bp)'] = annotate_feature(zdna, downstream, 'downstream')

total_regions = {
    'Exons': len(gff[gff['type'] == 'exon']),
    'Introns': len(gff[gff['type'] == 'intron']),
    'Promoters (1000 up from TSS)': len(promoters),
    'Downstream (200 bp)': len(downstream),
    'Genes': len(gff[gff['type'] == 'gene']),
}

def count_fraction(col):
    c = zdna[col].sum()
    f = c / len(zdna)
    return c, round(f, 3)

stats = {}
for region in ['Exons', 'Introns', 'Promoters (1000 up from TSS)', 'Downstream (200 bp)', 'Genes']:
    c, f = count_fraction(region)
    stats[region] = {
        'Числоучастков с квадруплексом': c,
        'Доля участков с предсказанным квадруплексом': f,
        'Число участков предсказаний Zhun': total_regions.get(region, 0)
    }

zdna['Intergenic'] = ~(zdna[['Exons', 'Introns', 'Genes']].any(axis=1))
c = zdna['Intergenic'].sum()
f = c / len(zdna)
total_intergenic = 'N/A'

stats['Intergenic'] = {
    'Числоучастков с квадруплексом': c,
    'Доля участков с предсказанным квадруплексом': round(f, 3),
    'Число участков предсказаний Zhun': total_intergenic
}

result_table = pd.DataFrame(stats).T

print(result_table)
