In [2]:
import os 
import pandas as pd
import subprocess
from Bio import SeqIO

### Extract the coordinates of the start and end regions of T3SS1 on chromosome 1 from 128 strains

In [14]:
# Build two lookups from the per-strain BLAST tabular files:
#   coordinate:  best-hit query accession -> [start, end] on the query
#   strain_name: file stem (strain label) -> best-hit query accession
blast_dir = "./NCBI BLAST_vpa0450-vpa0451"
blast_suffix = ".fasta.tabular"
blast_columns = ["qaccver","saccver","pident","alignment length","mismatch number","gapopen","qstart","qend","sstart","send","evalue","alignment score", "bit score"]

# Only pick up BLAST result files; any other directory entry (hidden files,
# checkpoints, ...) would otherwise be turned into a path that does not exist.
chr2 = sorted({
    file[: -len(blast_suffix)]
    for file in os.listdir(blast_dir)
    if file.endswith(blast_suffix)
})

coordinate = {}
strain_name = {}
for region in chr2:
    df = pd.read_csv(os.path.join(blast_dir, region + blast_suffix), delimiter = "\t", header = None)
    # Assigning (rather than passing names=) keeps the strict column-count check.
    df.columns = blast_columns

    # Keep only the highest-scoring alignment of the file.
    best_row = df.loc[df["bit score"].idxmax()]
    query_id = best_row["qaccver"]
    # Plain ints keep the dict repr and the BED text independent of numpy's scalar repr.
    qstart, qend = int(best_row["qstart"]), int(best_row["qend"])

    # Orientation is encoded in the order of the pair: when the subject runs
    # backwards (sstart >= send) the pair is swapped so that start > end.
    if best_row["sstart"] < best_row["send"]:
        coordinate[query_id] = [qstart, qend]
    else:
        coordinate[query_id] = [qend, qstart]

    # Use the accession of the best hit, i.e. the same key as in `coordinate`.
    # max(df["qaccver"]) is the lexicographically largest accession in the file,
    # which is not necessarily the sequence carrying the best hit.
    strain_name[region] = query_id

In [None]:
# Inspect the accession -> [start, end] mapping built in the previous cell.
coordinate

### Collect strain names from the BED file and filter the FASTA file

In [7]:
# Collect the sequence names (first BED column) listed in regions.bed and save
# them, one per line, to strain_names_in_bed.txt.
# NOTE: regions.bed is written by the cell that iterates over `coordinate`;
# that cell has to be executed before this one.
bed_file = 'regions.bed'
strain_names_in_bed = set()

with open(bed_file, 'r') as infile:
    for line in infile:
        fields = line.split()
        if not fields:  # tolerate blank lines instead of raising IndexError
            continue
        # Do not reuse the name `strain_name` here: it is the strain -> accession
        # dict that the renaming cell still needs.
        strain_names_in_bed.add(fields[0])

# Save the strain names to a file for later use (sorted for a reproducible file).
with open('strain_names_in_bed.txt', 'w') as outfile:
    for strain in sorted(strain_names_in_bed):
        outfile.write(strain + "\n")


In [8]:
# Copy to filtered_128_strains.fasta only those records of 128_strains.fasta
# whose identifier (header text up to the first whitespace, without '>') is
# listed in strain_names_in_bed.txt.
with open('strain_names_in_bed.txt', 'r') as infile:
    # Skip blank lines so that the empty string never counts as a valid name.
    valid_strains = {line.strip() for line in infile if line.strip()}

# Filter sequences in the 128_strains.fasta file
fasta_file = '128_strains.fasta'
output_file = 'filtered_128_strains.fasta'

with open(fasta_file, 'r') as infile, open(output_file, 'w') as outfile:
    write_sequence = False  # whether the record currently being read is kept
    for line in infile:
        if line.startswith(">"):  # Header line starts a new record
            # Named `record_id`, not `strain_name`, so the strain -> accession
            # dict used by the renaming cell is not overwritten.
            record_id = line.strip().split()[0][1:]  # Remove '>' and get the identifier
            write_sequence = record_id in valid_strains
        if write_sequence:  # Header and sequence lines of kept records only
            outfile.write(line)

print(f"Filtered FASTA file saved as {output_file}")

Filtered FASTA file saved as filtered_128_strains.fasta


### Find strains whose T3SS1 region is reversed and store them in reverse_strain

In [20]:
# Write one BED interval per accession in `coordinate` and remember which
# accessions carry the region in reverse orientation (start > end).
reverse_strain = {}
with open('regions.bed', 'w') as bedfile:
    for strain, coords in coordinate.items():
        start, end = coords
        if start > end:
            # Reversed hit: keep the original (start, end) pair for later use.
            reverse_strain[strain] = [start, end]
        low, high = min(start, end), max(start, end)
        # BLAST positions are 1-based and inclusive, BED intervals are 0-based
        # and half-open, so the BED start is low - 1. Writing `low` unchanged
        # would drop the first base of every region.
        # Using min/max also covers start == end, which was previously skipped.
        bedfile.write(f"{strain}\t{low - 1}\t{high}\n")

### Reverse-complement the T3SS1 region in the reversed strains

In [10]:
# Extract the intervals listed in regions.bed from filtered_128_strains.fasta and
# write them to output_directory/strain_name.fasta (a literal file name, not a variable).
# NOTE(review): output_directory presumably has to exist beforehand — confirm.
!bedtools getfasta -fi filtered_128_strains.fasta -fo output_directory/strain_name.fasta -bed regions.bed

In [21]:
# Bring every extracted region into the same orientation: records whose
# accession is listed in `reverse_strain` are reverse-complemented, all others
# are copied unchanged.
input_fasta = "./output_directory/strain_name.fasta"
output_fasta = "./output_directory/T3SS1_chr2_normalized.fasta"


def _to_forward_orientation(record):
    """Return `record`, reverse-complementing its sequence if it is a reversed hit."""
    # The accession is the part of the record id before the first ':'.
    accession = record.id.split(":")[0]
    if accession in reverse_strain:
        record.seq = record.seq.reverse_complement()
    return record


with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
    SeqIO.write(
        map(_to_forward_orientation, SeqIO.parse(infile, "fasta")),
        outfile,
        "fasta",
    )

### Rename plasmid names to strain names in the IQ-TREE file

In [26]:
# Shorten the strain labels in two passes: first cut everything from "_ahpnd"
# onwards, then everything from "_clinical" onwards. The passes are kept
# separate so that colliding labels resolve exactly as before (last one wins
# within each pass).
newer = {
    label.partition("_ahpnd")[0]: accession
    for label, accession in strain_name.items()
}
new = {
    label.partition("_clinical")[0]: accession
    for label, accession in newer.items()
}

In [27]:
# Invert `new` (strain label -> accession) into accession -> strain label.
# If several labels share an accession, the last one in `new` is kept.
plasmid_to_strain = dict(zip(new.values(), new.keys()))
print(plasmid_to_strain)

{'CP045795.1': '10329', 'CP097356.1': '16-VB00198', 'CP033142.1': '160807', 'CP062154.1': '17-VB00214', 'CP046412.1': '19-021-D1', 'CP062151.1': '19-VB00998', 'CP065370.1': '20-082A3', 'CP083362.1': '20-082E4', 'CP046827.1': '2010V-1106', 'CP020035.1': '20130629002S01', 'CP046809.1': '2013V-1146', 'CP046788.1': '2013V-1174', 'CP046784.1': '2013V-1181', 'CP046781.1': '2013V-1244', 'CP034286.1': '20140624012-1', 'CP034290.1': '20140722001-1', 'CP047991.1': '20140723005', 'CP034295.1': '20140829008-1', 'CP046780.1': '2014V-1066', 'CP046777.1': '2014V-1125', 'CP047996.1': '20150710009', 'CP034306.1': '20151116002-3', 'CP046753.1': '2015AW-0174', 'CP034299.1': '20160303005-1', 'CP128371.1': '23EBVib0155', 'CP074416.1': '64', 'CP176358.1': 'AG1', 'CP046762.1': 'AM46865', 'CP192148.1': 'ANHC.C2L', 'CP014047.2': 'ATCC17802', 'CP003973.1': 'BB220P', 'CP099940.1': 'BM17A_1', 'CP129925.1': 'BM17A_2', 'CP129933.1': 'BM18B', 'CP099938.1': 'BM19BA_1', 'CP129939.1': 'BM21A_2', 'CP099942.1': 'BM23A_1'

In [28]:
# Display the accession -> strain label mapping (same content as the print above).
plasmid_to_strain

{'CP045795.1': '10329',
 'CP097356.1': '16-VB00198',
 'CP033142.1': '160807',
 'CP062154.1': '17-VB00214',
 'CP046412.1': '19-021-D1',
 'CP062151.1': '19-VB00998',
 'CP065370.1': '20-082A3',
 'CP083362.1': '20-082E4',
 'CP046827.1': '2010V-1106',
 'CP020035.1': '20130629002S01',
 'CP046809.1': '2013V-1146',
 'CP046788.1': '2013V-1174',
 'CP046784.1': '2013V-1181',
 'CP046781.1': '2013V-1244',
 'CP034286.1': '20140624012-1',
 'CP034290.1': '20140722001-1',
 'CP047991.1': '20140723005',
 'CP034295.1': '20140829008-1',
 'CP046780.1': '2014V-1066',
 'CP046777.1': '2014V-1125',
 'CP047996.1': '20150710009',
 'CP034306.1': '20151116002-3',
 'CP046753.1': '2015AW-0174',
 'CP034299.1': '20160303005-1',
 'CP128371.1': '23EBVib0155',
 'CP074416.1': '64',
 'CP176358.1': 'AG1',
 'CP046762.1': 'AM46865',
 'CP192148.1': 'ANHC.C2L',
 'CP014047.2': 'ATCC17802',
 'CP003973.1': 'BB220P',
 'CP099940.1': 'BM17A_1',
 'CP129925.1': 'BM17A_2',
 'CP129933.1': 'BM18B',
 'CP099938.1': 'BM19BA_1',
 'CP129939.1':

In [23]:
# Display the reversed hits: accession -> [start, end] with start > end.
reverse_strain

{'CP062154.1': [1501023, 1499040],
 'CP062151.1': [1487933, 1485950],
 'CP065370.1': [486830, 484847],
 'CP046809.1': [1571593, 1569610],
 'CP046784.1': [469941, 467958],
 'CP047991.1': [1089063, 1087080],
 'CP046777.1': [561915, 559932],
 'CP034306.1': [363482, 361499],
 'CP046753.1': [1756114, 1754131],
 'CP034299.1': [181500, 179517],
 'CP074416.1': [1469675, 1467692],
 'CP176358.1': [504827, 502844],
 'CP192148.1': [711779, 709796],
 'CP014047.2': [1005979, 1003996],
 'CP129925.1': [1311680, 1309698],
 'CP129933.1': [1427708, 1425725],
 'CP099942.1': [1570862, 1568880],
 'CP129931.1': [1355586, 1353604],
 'CP099922.1': [238402, 236420],
 'cluster_002_consensus': [1534314, 1532331],
 'CP010884.1': [1393624, 1391641],
 'CP034566.1': [192345, 190362],
 'CP066247.1': [379575, 378082],
 'CP064034.1': [535208, 533225],
 'CP073069.1': [618591, 616608],
 'CP020428.2': [247333, 245350],
 'CP026042.1': [1001641, 999658],
 'CP044070.1': [963609, 961626],
 'CP044063.1': [533308, 531325],
 'CP0