In [1]:
import os 
import pandas as pd
from Bio import SeqIO

### Extract the start and end coordinates of the T3SS1 region on chromosome 1 for 128 strains

In [11]:
# Build, from the vp1656 BLAST tabular outputs:
#   start_list  : query accession -> largest qstart among the hits
#   strain_name : file stem       -> query accession
vp1656_dir = "./NCBI BLAST_vp1656_300/"
blast_suffix = ".fasta.tabular"
blast_columns = ["qaccver","saccver","pident","alignment length","mismatch number","gapopen","qstart","qend","sstart","send","evalue","alignment score", "bit score"]

# Keep only real BLAST outputs: any other directory entry (e.g. .DS_Store)
# would be turned into a path that does not exist and crash read_csv.
vp1656 = sorted({
    file.split(blast_suffix)[0]
    for file in os.listdir(vp1656_dir)
    if file.endswith(blast_suffix)
})

start_list = {}
strain_name = {}
for file_stem in vp1656:
    df = pd.read_csv(os.path.join(vp1656_dir, f"{file_stem}{blast_suffix}"), sep="\t", header=None)
    df.columns = blast_columns
    # presumably every hit row carries the same query accession, so max()
    # just selects it — TODO confirm
    accession = max(df["qaccver"])
    start_list[accession] = max(df["qstart"])
    strain_name[file_stem] = accession

In [3]:
# Build end_list (query accession -> largest qend among the hits) from the
# vp1702 BLAST tabular outputs.
vp1702_dir = "./NCBI BLAST_300_vp1702/"
blast_suffix = ".fasta.tabular"
blast_columns = ["qaccver","saccver","pident","alignment length","mismatch number","gapopen","qstart","qend","sstart","send","evalue","alignment score", "bit score"]

# Keep only real BLAST outputs: any other directory entry (e.g. .DS_Store)
# would be turned into a path that does not exist and crash read_csv.
vp1702 = sorted({
    file.split(blast_suffix)[0]
    for file in os.listdir(vp1702_dir)
    if file.endswith(blast_suffix)
})

end_list = {}
for file_stem in vp1702:
    df = pd.read_csv(os.path.join(vp1702_dir, f"{file_stem}{blast_suffix}"), sep="\t", header=None)
    df.columns = blast_columns
    # presumably every hit row carries the same query accession, so max()
    # just selects it — TODO confirm
    end_list[max(df["qaccver"])] = max(df["qend"])

In [4]:
# Pair the start and end coordinate of every accession present in both
# start_list and end_list; accessions missing from end_list are dropped.
coordinate = {
    accession: [start_list[accession], end_list[accession]]
    for accession in start_list
    if accession in end_list
}

### Create bed file

In [5]:
# Signed difference (end - start) of the stored coordinates for each key of
# `coordinate`.
new_coordinate = {}
for accession, bounds in coordinate.items():
    new_coordinate[accession] = bounds[1] - bounds[0]

In [7]:
bed_file = 'regions.bed'
strain_names_in_bed = set()

# Collect the first column of every BED record.
# The per-line value is deliberately NOT called `strain_name`: that name holds
# the file-stem -> accession dict built from the BLAST hits and is still
# needed later in the notebook.
with open(bed_file, 'r') as infile:
    for line in infile:
        fields = line.split()
        if not fields:  # blank line: nothing to record
            continue
        strain_names_in_bed.add(fields[0])

# Save the strain names to a file for later use (sorted so the file content
# is the same on every run).
with open('strain_names_in_bed.txt', 'w') as outfile:
    for strain in sorted(strain_names_in_bed):
        outfile.write(strain + "\n")


In [8]:
# Load the identifiers that are allowed through the filter.
with open('strain_names_in_bed.txt', 'r') as infile:
    valid_strains = set(line.strip() for line in infile)

# Filter sequences in the 128_strains.fasta file
fasta_file = '128_strains.fasta'
output_file = 'filtered_128_strains.fasta'

with open(fasta_file, 'r') as infile, open(output_file, 'w') as outfile:
    write_sequence = False
    for line in infile:
        if line.startswith(">"):  # Header line
            # First whitespace-delimited token without the leading '>'.
            # Not stored in `strain_name`: that name holds the
            # file-stem -> accession dict needed later in the notebook.
            header_id = line.strip().split()[0][1:]
            write_sequence = header_id in valid_strains
        # The flag stays set for the sequence lines that follow a kept header.
        if write_sequence:
            outfile.write(line)

print(f"Filtered FASTA file saved as {output_file}")

Filtered FASTA file saved as filtered_128_strains.fasta


### Find strains whose T3SS1 region is reversed (start > end) and store them in `reversed_strain`

In [9]:
# Write one BED line per strain with the smaller coordinate first, and record
# in reversed_strain the strains whose stored start is greater than their end.
# NOTE(review): BED starts are 0-based while BLAST coordinates are 1-based —
# confirm whether the start should be shifted by one before extraction.
reversed_strain = {}
with open('coordinate.bed', 'w') as bedfile:
    for strain, (start, end) in coordinate.items():
        if start == end:
            # Neither orientation applies; nothing is written for this strain.
            continue
        if start > end:
            reversed_strain[strain] = [start, end]
        low, high = min(start, end), max(start, end)
        bedfile.write(f"{strain}\t{low}\t{high}\n")

In [12]:
# Trim the file-stem keys of strain_name in two passes: first everything from
# "_ahpnd" onwards, then everything from "_clinical" onwards.
newer = {
    stem.split("_ahpnd")[0]: accession
    for stem, accession in strain_name.items()
}
new = {
    stem.split("_clinical")[0]: accession
    for stem, accession in newer.items()
}

In [31]:
# Invert `new` so that each value becomes a key pointing back to its
# original key, then display the result.
plasmid_to_strain = dict(zip(new.values(), new.keys()))
plasmid_to_strain

{'CP045794.1': '10329',
 'CP097355.1': '16-VB00198',
 'CP033141.1': '160807',
 'CP062153.1': '17-VB00214',
 'CP046411.1': '19-021-D1',
 'CP062150.1': '19-VB00998',
 'CP065369.1': '20-082A3',
 'CP083361.1': '20-082E4',
 'CP046828.1': '2010V-1106',
 'CP020034.1': '20130629002S01',
 'CP046808.1': '2013V-1146',
 'CP046787.1': '2013V-1174',
 'CP046783.1': '2013V-1181',
 'CP046782.1': '2013V-1244',
 'CP034285.1': '20140624012-1',
 'CP034289.1': '20140722001-1',
 'CP047990.1': '20140723005',
 'CP034294.1': '20140829008-1',
 'CP046779.1': '2014V-1066',
 'CP046778.1': '2014V-1125',
 'CP047995.1': '20150710009',
 'CP034305.1': '20151116002-3',
 'CP046754.1': '2015AW-0174',
 'CP034298.1': '20160303005-1',
 'CP128370.1': '23EBVib0155',
 'CP074415.1': '64',
 'CP176357.1': 'AG1',
 'CP046761.1': 'AM46865',
 'CP192147.1': 'ANHC.C2L',
 'CP014046.2': 'ATCC17802',
 'CP003972.1': 'BB220P',
 'CP099939.1': 'BM17A_1',
 'CP129924.1': 'BM17A_2',
 'CP129932.1': 'BM18B',
 'CP099937.1': 'BM19BA_1',
 'CP129938.1':

### Reverse-complement the T3SS1 sequence of the reversed strains

In [14]:
# Extract the regions listed in coordinate.bed from filtered_128_strains.fasta
# and write them to output_directory/strain_name.fasta.
# NOTE(review): presumably output_directory must already exist — confirm.
!bedtools getfasta -fi filtered_128_strains.fasta -fo output_directory/strain_name.fasta -bed coordinate.bed



In [15]:
# Copy every extracted region to T3SS1_normalized.fasta, reverse-complementing
# the records that belong to a strain listed in reversed_strain.
input_fasta = "./output_directory/strain_name.fasta"
output_fasta = "./output_directory/T3SS1_normalized.fasta"
with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
    for record in SeqIO.parse(infile, "fasta"):
        # The part of the record id before ':' is used as the lookup key
        # (presumably ids look like "<accession>:<start>-<end>" — confirm).
        accession = record.id.split(":")[0]
        # `reversed_strain` is the dict filled while writing coordinate.bed;
        # the previous spelling `reverse_strain` is never defined.
        if accession in reversed_strain:
            record.seq = record.seq.reverse_complement()
        # Write the (possibly modified) record to the output file
        SeqIO.write(record, outfile, "fasta")

In [16]:
# Number of strains recorded as reversed (the dict is named `reversed_strain`
# where it is built; `reverse_strain` is never defined).
len(reversed_strain)

67

### Replace accession labels with strain names in the IQ-TREE tree file

In [18]:
import re


def replace_label(match):
    """Return the replacement text for one matched tree label.

    The text before the first underscore of the label is looked up in
    ``plasmid_to_strain``; when there is no entry the label is returned
    unchanged.
    """
    label = match.group()
    accession = label.partition("_")[0]
    if accession in plasmid_to_strain:
        return plasmid_to_strain[accession]
    return label


# Labels targeted for replacement in the tree text.
label_pattern = re.compile(r'([A-Z]{2}\d+\.\d+_\d+-\d+)')

with open("./[PMinh]T3SS1.nhx", "r") as f:
    tree_string = f.read()

new_tree = label_pattern.sub(replace_label, tree_string)

with open("./MAFFT_T3SS1.fasta.treefile", "w") as f:
    f.write(new_tree)

In [29]:
# start - end for every strain in reversed_strain.
# Uses the name the dict is actually built under (`reversed_strain`) and does
# not bind a loop variable called `coordinate`, which would overwrite the
# accession -> [start, end] dict of the same name.
list_coord = {
    strain: start - end
    for strain, (start, end) in reversed_strain.items()
}

In [30]:
# Display the per-strain start - end differences stored in list_coord.
list_coord

{'CP045794.1': 35862,
 'CP097355.1': 32945,
 'CP033141.1': 35964,
 'CP062153.1': 38503,
 'CP046411.1': 74230,
 'CP062150.1': 38503,
 'CP065369.1': 74230,
 'CP046828.1': 35861,
 'CP020034.1': 35965,
 'CP046808.1': 35861,
 'CP034285.1': 35964,
 'CP034289.1': 35964,
 'CP034294.1': 35964,
 'CP046778.1': 35862,
 'CP034305.1': 35963,
 'CP046754.1': 35861,
 'CP176357.1': 35965,
 'CP046761.1': 35966,
 'CP014046.2': 38490,
 'CP099937.1': 35632,
 'CP099941.1': 35822,
 'CP099928.1': 35860,
 'CP099921.1': 35860,
 'CP099934.1': 35949,
 'CP063525.1': 35976,
 'CP006008.1': 35962,
 'CP034565.1': 38327,
 'CP064035.1': 35831,
 'CP064033.1': 35960,
 'CP187429.1': 35964,
 'CP073068.1': 35976,
 'CP020427.2': 35959,
 'CP026041.1': 35862,
 'CP044071.1': 35861,
 'CP006004.1': 32574,
 'CP009765.1': 36338,
 'CP009982.1': 35965,
 'CP013826.1': 35965,
 'CP012950.1': 35962,
 'CP125856.1': 35964,
 'CP097873.1': 35963,
 'CP099919.1': 35799,
 'CP068627.1': 35959,
 'CP040100.1': 36328,
 'CP040101.1': 35966,
 'CP133891