## Prepare the customized input files required by PropagAtE



In [1]:
import os
import glob
import pandas as pd
import re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# All relative paths used below are resolved against this results directory.
results_dir = "/fs/scratch/PAS0439/Ming/virome_ecology_core_prkaryotes/results/04_prophage_rumen_mags"
os.chdir(results_dir)

In [3]:
# Load the prophage summary table from the current working directory.
summary_csv = "rumen_mags_prophage_summary.csv"
df = pd.read_csv(summary_csv)

*PropagAtE input FASTA definition lines cannot contain the pipe symbol (`|`), so the original VIBRANT output is modified.*

In [4]:
# Read the prophage identifiers, one per line, with surrounding whitespace
# removed from each line.
with open("prophage_rumen_mags_category3.txt", 'r') as handle:
    category3 = [entry.strip() for entry in handle]

In [5]:
# Collect every proviruses.fna found at any depth below the checkv/ directory.
provirus_pattern = "checkv/**/proviruses.fna"
checkv_prophage = glob.glob(provirus_pattern, recursive=True)

In [6]:
# Map each category-3 prophage name to its coordinates, taken from the
# provirus FASTA headers. The coordinates are read from the second
# space-separated token of the description: the text before the first '/'
# is split on '-' into start and end. Values are kept as strings.
prophage_checkv = {}
category3_ids = set(category3)  # constant-time membership tests in the loop
for fasta_path in checkv_prophage:
    for record in SeqIO.parse(fasta_path, "fasta"):
        if record.name not in category3_ids:
            continue
        span = record.description.split(' ')[1].split('/')[0]
        bounds = span.split('-')
        # A dedicated dict name is used here so the builtin `range` is not
        # shadowed for the rest of the session.
        prophage_checkv[record.name] = {"start": bounds[0], "end": bounds[1]}

In [7]:
# One row per prophage; the dict keys (prophage names) become the row index
# and are then moved into a regular column named "index".
coords_by_prophage = pd.DataFrame.from_dict(prophage_checkv, orient="index")
propagate_df = coords_by_prophage.reset_index()

In [8]:
# The scaffold name is the part of the prophage name before the first '|'.
# Assigning the whole column at once avoids row-by-row .loc writes and
# defines the column even when the frame has no rows.
propagate_df["scaffold"] = [
    prophage_name.split('|')[0] for prophage_name in propagate_df["index"]
]

In [9]:
# Rename the prophage-name column to "fragment" and "end" to "stop".
column_names = {"index": "fragment", "end": "stop"}
propagate_df = propagate_df.rename(columns=column_names)

In [10]:
# Keep only the four coordinate columns, in this fixed order.
coordinate_columns = ["scaffold", "fragment", "start", "stop"]
propagate_df = propagate_df[coordinate_columns]

In [11]:
# Summary table that is merged onto the coordinate table via its "prophage"
# column.
host_summary_csv = "rumen_mags_prophage_summary.csv"
host_genome = pd.read_csv(host_summary_csv)

In [12]:
# Left join keeps every prophage fragment, including any without a match in
# the summary table; the duplicated join key "prophage" is dropped afterwards.
merged_hosts = propagate_df.merge(
    host_genome,
    how="left",
    left_on="fragment",
    right_on="prophage",
)
host_df = merged_hosts.drop(columns="prophage")

In [13]:
# Build the FASTA path of the host genome for every row of host_df
# (one entry per row, so a genome can appear more than once).
genome_dir = '../../../databases/rumen_mags_high_quality/dereplicated_genomes_with_taxid/'
host_genomes = [genome_dir + genome_name + '.fasta' for genome_name in host_df["host_genome"]]

In [14]:
# Write every host scaffold that carries a prophage to a single FASTA file,
# using the bare scaffold name as the record id and an empty description.
#
# Each host genome is parsed once and every scaffold listed for it in host_df
# is written exactly once. Selecting only the first scaffold per genome would
# miss prophages on other scaffolds and would repeat a record whenever a
# genome appears more than once in host_genomes.
with open("rumen_mags_prophage_scaffolds_for_propogate.fasta", 'w') as outfile:
    # dict.fromkeys removes repeated genome paths while keeping their order.
    for genome in dict.fromkeys(host_genomes):
        genome_name = genome.split('/')[-1].split('.fasta')[0]
        wanted_scaffolds = set(
            host_df.loc[host_df["host_genome"] == genome_name, "scaffold"]
        )
        for record in SeqIO.parse(genome, "fasta"):
            # The scaffold name is matched against the second space-separated
            # token of the FASTA description; headers without one cannot match.
            header_fields = record.description.split(" ")
            if len(header_fields) < 2 or header_fields[1] not in wanted_scaffolds:
                continue
            scaffold_name = header_fields[1]
            newrecord = SeqRecord(record.seq, id=scaffold_name, description="")
            SeqIO.write(newrecord, outfile, "fasta")
            wanted_scaffolds.discard(scaffold_name)
            if not wanted_scaffolds:
                # Everything needed from this genome has been written.
                break
    

In [15]:
# Write the coordinate table as tab-separated text without the row index.
# index=False is the documented way to omit it (index=None only works because
# None is falsy).
propagate_df.to_csv("prophage_scaffold_coordinates_for_propogate.tsv", sep="\t", index=False)