In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive: the input genome is read from it and
# the zhunt results are written back to it by later cells.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Input genome (multi-record FASTA) on the mounted Google Drive.
fasta_path = "/content/drive/MyDrive/genome.fna"
# Directory that receives one FASTA file per chunk (relative to the working directory).
output_dir = "chunks"
# Maximum length of a single chunk, in nucleotides.
chunk_size = 20_000_000
overlap = 30                   # overlap between consecutive chunks (in nucleotides), so Z-DNA at chunk edges is not lost

# Create the chunk directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(output_dir, exist_ok=True)

def split_fasta_with_overlap():
    """Split every record of the input FASTA into overlapping chunk files.

    Uses the module-level settings ``fasta_path``, ``output_dir``,
    ``chunk_size`` and ``overlap``. Each record is cut into windows of at most
    ``chunk_size`` bases whose start positions are ``chunk_size - overlap``
    apart, so neighbouring windows share ``overlap`` bases. Every window is
    written to ``<output_dir>/<record.id>_chunk_<n>.fasta`` (``n`` counts from
    1 within each record) and one progress line is printed per file.

    A trailing window is not written when the previous window already reached
    the end of the record: such a window would be at most ``overlap`` bases
    long and would contain nothing that the previous file does not.

    Raises:
        ValueError: if ``overlap`` is negative or not smaller than
            ``chunk_size`` (the windows would leave gaps or never advance).
    """
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size "
            f"(got overlap={overlap}, chunk_size={chunk_size})"
        )
    # Distance between the start positions of two consecutive windows.
    step = chunk_size - overlap

    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = record.seq
        total_len = len(seq)
        for chunk_no, start in enumerate(range(0, total_len, step), start=1):
            # The previous window ended at start + overlap; if that already
            # covers the end of the record, this window is fully redundant.
            if start > 0 and start + overlap >= total_len:
                break
            chunk_seq = seq[start:start + chunk_size]
            chunk_id = f"{record.id}_chunk_{chunk_no}"
            # Empty description keeps the FASTA header down to the chunk id.
            chunk_record = SeqRecord(chunk_seq, id=chunk_id, description="")
            chunk_path = os.path.join(output_dir, f"{chunk_id}.fasta")
            SeqIO.write(chunk_record, chunk_path, "fasta")
            print(f"Сохранили {chunk_path} ({len(chunk_seq)} bp)")

# Run the split; one progress line is printed for every chunk file written.
split_fasta_with_overlap()

Сохранили chunks/NC_081552.1_chunk_1.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_2.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_3.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_4.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_5.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_6.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_7.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_8.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_9.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_10.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_11.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_12.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_13.fasta (20000000 bp)
Сохранили chunks/NC_081552.1_chunk_14.fasta (11965019 bp)
Сохранили chunks/NC_081553.1_chunk_1.fasta (20000000 bp)
Сохранили chunks/NC_081553.1_chunk_2.fasta (20000000 bp)
Сохранили chunks/NC_081553.1_chunk_3.fasta (20000000 bp)
Сохранили chunks/NC_081553

In [17]:
import glob
import shutil
# Directories that each receive a share of the chunk files.
part_dirs = ["chunks_part1", "chunks_part2", "chunks_part3"]
for d in part_dirs:
    os.makedirs(d, exist_ok=True)

# Collect every chunk file; sorting makes the assignment below reproducible.
# NOTE: "chunks" must match the directory the splitting cell wrote into.
all_chunks = sorted(glob.glob("chunks/*.fasta"))
print(f"Всего чанков: {len(all_chunks)}")

# Deal the chunks out round-robin so every part gets (almost) the same count.
# The modulus follows len(part_dirs), so adding or removing a directory above
# is enough to change the number of parts.
for idx, chunk in enumerate(all_chunks):
    part = idx % len(part_dirs)  # 0, 1, 2, ... in turn
    shutil.copy(chunk, os.path.join(part_dirs[part], os.path.basename(chunk)))

Всего чанков: 347


In [18]:
# Build the zhunt3 executable from zhunt3-alan.c, linking the math library (-lm).
# NOTE(review): assumes zhunt3-alan.c is already in the working directory; no earlier cell fetches it.
# NOTE(review): no -O level is passed, so gcc uses its default (unoptimized) setting;
# consider -O2 if run time matters, after confirming the scores are unchanged.
!gcc zhunt3-alan.c -lm -o zhunt3

[01m[Kzhunt3-alan.c:[m[K In function ‘[01m[Kuser_regret[m[K’:
  303 |       [01;35m[Kgets[m[K( tempstr );
      |       [01;35m[K^~~~[m[K
      |       [32m[Kfgets[m[K
/usr/bin/ld: /tmp/ccW6GhVL.o: in function `user_regret':


In [19]:
%%bash
# Run zhunt3 (arguments 12 8 12) on every chunk in chunks_part1, four at a
# time, and collect the resulting .Z-SCORE files on Google Drive. Chunks whose
# result is already on Drive are skipped, so the cell can be re-run after an
# interruption.
out_dir=/content/drive/MyDrive/zhunt_output
mkdir -p "$out_dir"

# -print0 / -0 keep unusual file names intact, and the path is handed to the
# inner shell as a positional argument instead of being pasted into its script.
find chunks_part1 -name "*.fasta" -print0 | \
xargs -0 -P 4 -I{} bash -c '
    fasta="$1"
    out_dir="$2"
    base=$(basename "$fasta" .fasta)
    zscore="${fasta}.Z-SCORE"
    out="${out_dir}/${base}.Z-SCORE"
    if [ ! -f "$out" ]; then
        ./zhunt3 12 8 12 "$fasta"
        # Success is judged by the presence of the output file, as before;
        # the exit status of zhunt3 is not relied upon.
        if [ ! -f "$zscore" ]; then
            echo "Ошибка: zhunt3 не создал результат для $base" >&2
        # Stage under a temporary name first: moving onto Drive is a copy, and
        # an interrupted copy must not leave a truncated file under the final
        # name, because the existence check above would then skip it forever.
        elif mv "$zscore" "${out}.part" && mv "${out}.part" "$out"; then
            echo "Обработан $base"
        else
            echo "Ошибка: не удалось сохранить результат для $base" >&2
        fi
    fi
' _ {} "$out_dir"

Process is interrupted.


In [15]:
!rm -rf "chunks"
!rm -rf "chunks_part1"
!rm -rf "chunks_part2"
!rm -rf "chunks_part3"