In [3]:
import os 
import pandas as pd
import subprocess
from Bio import SeqIO

In [4]:
# Result-file stems (file name up to ".fasta.tabular") found in the vpa1025
# BLAST folder, de-duplicated and sorted.
vpa1025 = sorted({
    entry.split(".fasta.tabular")[0]
    for entry in os.listdir("./NCBI BLAST_vpa1025_300/")
})

start_list = {}   # largest qaccver of a result file -> largest qstart of that file
strain_name = {}  # result-file stem -> largest qaccver of that file
for stem in vpa1025:
    df = pd.read_csv(
        f"./NCBI BLAST_vpa1025_300/{stem}.fasta.tabular",
        sep="\t",
        header=None,
    )
    # The tabular files carry no header row, so the column names are set here.
    df.columns = [
        "qaccver", "saccver", "pident", "alignment length", "mismatch number",
        "gapopen", "qstart", "qend", "sstart", "send", "evalue",
        "alignment score", "bit score",
    ]
    # NOTE(review): both maxima are taken over every row of the file -
    # confirm each file holds hits for a single query accession.
    accession = max(df["qaccver"])
    start_list[accession] = max(df["qstart"])
    strain_name[stem] = accession

In [12]:
# Result-file stems (file name up to ".fasta.tabular") found in the vpa1046
# BLAST folder, de-duplicated and sorted. The list comes from this folder's own
# listing, so the cell does not depend on the current contents of `vpa1025`.
vpa1046 = sorted({
    file.split(".fasta.tabular")[0]
    for file in os.listdir("./NCBI BLAST_300_vpa1046/")
})

end_list = {}  # largest qaccver of a result file -> largest qend of that file
for end_coor in vpa1046:
    df = pd.read_csv(f"./NCBI BLAST_300_vpa1046/{end_coor}.fasta.tabular", delimiter="\t", header=None)
    # The tabular files carry no header row, so the column names are set here.
    df.columns = ["qaccver","saccver","pident","alignment length","mismatch number","gapopen","qstart","qend","sstart","send","evalue","alignment score", "bit score"]
    # NOTE(review): both maxima are taken over every row of the file -
    # confirm each file holds hits for a single query accession.
    end_list[max(df["qaccver"])] = max(df["qend"])

In [13]:
# Pair the start and end position of every accession that appears in both
# lookups; accessions present only in `start_list` are left out.
coordinate = {
    accession: [begin, end_list[accession]]
    for accession, begin in start_list.items()
    if accession in end_list
}

In [None]:
# Display the accession -> [start, end] mapping for inspection.
coordinate

In [14]:
# Write one BED record per accession, always with the smaller position first.
# Accessions whose start lies after their end are additionally remembered in
# `reverse_strain`, keeping the original [start, end] order.
reverse_strain = {}
with open('regions.bed', 'w') as bedfile:
    for strain, coords in coordinate.items():
        start, end = coords
        if start == end:
            # Equal positions produce no record.
            continue
        if start > end:
            reverse_strain[strain] = [start, end]
        low, high = sorted((start, end))
        # BLAST positions are 1-based and inclusive, whereas BED intervals are
        # 0-based and half-open: shift the left edge by one so the first base
        # of the region is not dropped.
        bedfile.write(f"{strain}\t{low - 1}\t{high}\n")

In [56]:
bed_file = 'regions.bed'
strain_names_in_bed = set()

# Collect the first column of every BED record. The per-line value is held in
# `bed_strain` rather than `strain_name`, so the `strain_name` dictionary built
# at the top of the notebook is not overwritten with a string.
with open(bed_file, 'r') as infile:
    for line in infile:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on fields[0].
            continue
        bed_strain = fields[0]
        strain_names_in_bed.add(bed_strain)

# Save the names for later use; sorted so the file is identical on every run.
with open('strain_names_in_bed.txt', 'w') as outfile:
    for strain in sorted(strain_names_in_bed):
        outfile.write(strain + "\n")


In [57]:
# Names to keep, one per line; blank lines are ignored so that an empty
# string never counts as a valid name.
with open('strain_names_in_bed.txt', 'r') as infile:
    valid_strains = set(line.strip() for line in infile if line.strip())

# Filter sequences in the 128_strains.fasta file
fasta_file = '128_strains.fasta'
output_file = 'filtered_128_strains.fasta'

with open(fasta_file, 'r') as infile, open(output_file, 'w') as outfile:
    write_sequence = False
    for line in infile:
        if line.startswith(">"):  # Header line
            # First whitespace-delimited token without the leading '>'.
            # Held in `header_id` rather than `strain_name`, so the
            # `strain_name` dictionary built at the top of the notebook
            # is not overwritten with a string.
            header_id = line.strip().split()[0][1:]
            write_sequence = header_id in valid_strains
        if write_sequence:  # Header and sequence lines of a kept record
            outfile.write(line)

print(f"Filtered FASTA file saved as {output_file}")

Filtered FASTA file saved as filtered_128_strains.fasta


### Renaming plasmid names to strain names in the IQ-TREE file

In [None]:
# Step 1: cut every key of `strain_name` at the first "_ahpnd".
newer = {
    stem.split("_ahpnd")[0]: accession
    for stem, accession in strain_name.items()
}
# Step 2: cut the resulting keys at the first "_clinical".
new = {
    stem.split("_clinical")[0]: accession
    for stem, accession in newer.items()
}

In [23]:
# Invert `new` so its values become the keys; if two entries share a value,
# the later one wins.
plasmid_to_strain = {}
for strain_label, accession in new.items():
    plasmid_to_strain[accession] = strain_label
print(plasmid_to_strain)

{'CP045795.1': '10329', 'CP097356.1': '16-VB00198', 'CP033142.1': '160807', 'CP062154.1': '17-VB00214', 'CP046412.1': '19-021-D1', 'CP062151.1': '19-VB00998', 'CP065370.1': '20-082A3', 'CP083362.1': '20-082E4', 'CP046827.1': '2010V-1106', 'CP020035.1': '20130629002S01', 'CP046809.1': '2013V-1146', 'CP046788.1': '2013V-1174', 'CP046784.1': '2013V-1181', 'CP046781.1': '2013V-1244', 'CP034286.1': '20140624012-1', 'CP034290.1': '20140722001-1', 'CP047991.1': '20140723005', 'CP034295.1': '20140829008-1', 'CP046780.1': '2014V-1066', 'CP046777.1': '2014V-1125', 'CP047996.1': '20150710009', 'CP034306.1': '20151116002-3', 'CP046753.1': '2015AW-0174', 'CP034299.1': '20160303005-1', 'CP128371.1': '23EBVib0155', 'CP074416.1': '64', 'CP176358.1': 'AG1', 'CP046762.1': 'AM46865', 'CP192148.1': 'ANHC.C2L', 'CP014047.2': 'ATCC17802', 'CP003973.1': 'BB220P', 'CP099940.1': 'BM17A_1', 'CP129925.1': 'BM17A_2', 'CP129933.1': 'BM18B', 'CP099938.1': 'BM19BA_1', 'CP129939.1': 'BM21A_2', 'CP099942.1': 'BM23A_1'

In [26]:
import re


def replace_label(match):
    """Return the replacement text for one matched tree label.

    The part of the label before its first underscore is looked up in
    `plasmid_to_strain`; a label with no entry there is returned unchanged.
    """
    original = match.group(0)
    accession = original.partition("_")[0]
    return plasmid_to_strain.get(accession, original)


# Read the whole tree file as a single string.
with open("./[PMinh]T6SS2_auto.nhx", "r") as tree_in:
    tree_string = tree_in.read()

# Labels have the shape <two capitals><digits>.<digits>_<digits>-<digits>.
new_tree = re.sub(r'([A-Z]{2}\d+\.\d+_\d+-\d+)', replace_label, tree_string)

with open("./T6SS2_auto_hi.nhx", "w") as tree_out:
    tree_out.write(new_tree)

### iTOL annotation

In [7]:
# Keep only the part of each entry that precedes "_clinical".
# (The analogous "_ahpnd" trim is intentionally left disabled here.)
vpa1025 = [entry.partition("_clinical")[0] for entry in vpa1025]
vpa1025

['10329',
 '16-VB00198',
 '160807',
 '17-VB00214',
 '19-021-D1',
 '19-VB00998',
 '20-082A3',
 '20-082E4',
 '2010V-1106',
 '20130629002S01',
 '2013V-1146',
 '2013V-1174',
 '2013V-1181',
 '2013V-1244',
 '20140624012-1',
 '20140722001-1',
 '20140723005',
 '20140829008-1',
 '2014V-1066',
 '2014V-1125',
 '20150710009',
 '20151116002-3',
 '2015AW-0174',
 '20160303005-1',
 '23EBVib0155',
 '64',
 'AG1',
 'AM46865',
 'ANHC.C2L',
 'ATCC17802',
 'BB220P',
 'BM17A_1',
 'BM17A_2',
 'BM18B',
 'BM19BA_1',
 'BM21A_2',
 'BM23A_1',
 'BM24A',
 'BM24B',
 'BM25',
 'BM26A_1',
 'BM26A_2',
 'BT85-28',
 'BTXS2',
 'CDC_K4557',
 'CHN25',
 'D3112',
 'DHO76',
 'DLM1799',
 'DLM1805',
 'DX190406',
 'DX210401',
 'DX230702',
 'EB101',
 'FB-11',
 'FDAARGOS_191',
 'FDAARGOS_51',
 'FDAARGOS_662',
 'FDAARGOS_667',
 'FDA_R31',
 'FORC_004',
 'FORC_006',
 'FORC_008',
 'FORC_014',
 'FORC_018',
 'FORC_022',
 'FORC_023',
 'G855',
 'GL-601',
 'GTVSS-031',
 'GTVSS-032',
 'HZ-52',
 'HZ',
 'I13A',
 'I14B',
 'I24A',
 'Isc14B',
 'Isc

In [8]:
# Wrap the list of names in a one-column DataFrame (the column is labelled 0).
df = pd.DataFrame(vpa1025)

In [12]:
# Disabled alternative: use the names themselves as the index.
# df = df.set_index(0)
# Display the frame.
df

10329
16-VB00198
160807
17-VB00214
19-021-D1
...
vp-HL-202005
vp-HL-202006
vp-HL-202008
vp-HL-202012
vp-pir-201806


In [73]:
# Load the metadata table, using its "Strain" column as the index.
meta = pd.read_csv("./metadata.csv", index_col="Strain")
meta

Unnamed: 0_level_0,Strain type,year
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1
RIMD_2210633,Clinical,2004
BB220P,Environment,2012
FDA_R31,Environment,2013
CDC_K4557,Clinical,2013
UCM-V493,Environment,2014
...,...,...
DX210401,AHPND,2025
DX230702,AHPND,2025
DX190406,Environment,2025
ANHC.C2L,Environment,2025


In [56]:
# Sanity check: is "BT85-28" one of the index labels of `meta`?
"BT85-28" in meta.index

True

In [21]:
# Derive `year` as the text after the first "-" of "Isolation date"
# (e.g. "May-2004" -> "2004"). The guard keeps the cell runnable when the
# loaded metadata already carries `year` and has no "Isolation date" column,
# which would otherwise raise a KeyError.
if "Isolation date" in meta.columns:
    meta["year"] = meta["Isolation date"].str.split("-").str[1]
meta

Unnamed: 0_level_0,Isolation date,Strain type,year
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RIMD 2210633,May-2004,Clinical,2004
BB220P,December-2012,Environment,2012
FDA_R31,July-2013,Environment,2013
CDC_K4557,July-2013,Clinical,2013
UCM-V493,February-2014,Environment,2014
...,...,...,...
AG1,March-2025,Environment,2025
DX210401,May-2025,AHPND,2025
DX230702,May-2025,AHPND,2025
DX190406,May-2025,Environment,2025


In [52]:
# Repeat the membership check for "BT85-28" against the current `meta` index.
"BT85-28" in meta.index

False

In [74]:
# Take the "Strain type" column of `meta` on its own (it keeps the Strain index).
color_strip = meta["Strain type"]
color_strip

Strain
RIMD_2210633       Clinical
BB220P          Environment
FDA_R31         Environment
CDC_K4557          Clinical
UCM-V493        Environment
                   ...     
DX210401              AHPND
DX230702              AHPND
DX190406        Environment
ANHC.C2L        Environment
BT85-28         Environment
Name: Strain type, Length: 128, dtype: object

In [75]:
# Save the strain-type column, with its index, as a comma-separated file.
color_strip.to_csv("color_strip_meta.csv", sep=",")

In [76]:
# Read the saved file back as a DataFrame indexed by "Strain".
strain_type = pd.read_csv("./color_strip_meta.csv", delimiter = ",", index_col="Strain")
strain_type

Unnamed: 0_level_0,Strain type
Strain,Unnamed: 1_level_1
RIMD_2210633,Clinical
BB220P,Environment
FDA_R31,Environment
CDC_K4557,Clinical
UCM-V493,Environment
...,...
DX210401,AHPND
DX230702,AHPND
DX190406,Environment
ANHC.C2L,Environment


In [77]:
# Hex colour assigned to each strain category.
color_map = {"Clinical": "#66c2a5", "Environment": "#fc8d62", "AHPND": "#8da0cb"}

# Create a new "Color" column based on the "Strain type" column; categories
# that are not in `color_map` come out as NaN.
strain_colors = strain_type["Strain type"].map(color_map)
strain_type["Color"] = strain_colors
strain_type

Unnamed: 0_level_0,Strain type,Color
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1
RIMD_2210633,Clinical,#66c2a5
BB220P,Environment,#fc8d62
FDA_R31,Environment,#fc8d62
CDC_K4557,Clinical,#66c2a5
UCM-V493,Environment,#fc8d62
...,...,...
DX210401,AHPND,#8da0cb
DX230702,AHPND,#8da0cb
DX190406,Environment,#fc8d62
ANHC.C2L,Environment,#fc8d62


In [61]:
# Sanity check: is "BT85-28" one of the index labels of `strain_type`?
"BT85-28" in strain_type.index

True

In [62]:
# Remove the "Strain type" column. `errors="ignore"` makes the cell safe to
# re-run: a second execution is a no-op instead of raising KeyError.
strain_type = strain_type.drop(columns=["Strain type"], errors="ignore")

In [82]:
# Save the current `strain_type` table, with its index, as a comma-separated file.
strain_type.to_csv("strain_type.csv", sep=",")

In [65]:
# Take the "year" column of `meta` on its own (it keeps the Strain index).
year = meta["year"]

In [66]:
# Save the year column, with its index, to a CSV file.
year.to_csv("strain_year.csv")

In [70]:
# Read the saved years back, indexed by "Strain".
# Note: this rebinds `year` from a Series to a DataFrame.
year = pd.read_csv("./strain_year.csv", index_col="Strain")
year

Unnamed: 0_level_0,year
Strain,Unnamed: 1_level_1
RIMD_2210633,2004
BB220P,2012
FDA_R31,2013
CDC_K4557,2013
UCM-V493,2014
...,...
DX210401,2025
DX230702,2025
DX190406,2025
ANHC.C2L,2025


### Reverse-complement the T6SS2 region for reverse-oriented strains

In [None]:
!bedtools getfasta -fi filtered_128_strains.fasta -fo output_directory/strain_name.fasta -bed coordinate.bed

In [19]:
input_fasta = "./output_directory/strain_name.fasta"
output_fasta = "./output_directory/strain_name_normalized_reverse_complement.fasta"

# Copy every record to the output file; records whose accession is listed in
# `reverse_strain` are reverse-complemented first, all others pass through
# unchanged. Record ids and descriptions are not modified.
with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
    for record in SeqIO.parse(infile, "fasta"):
        # The accession is the part of the id before the first ":"
        # (presumably ids look like "<accession>:<start>-<end>", as written
        # by bedtools getfasta - confirm against the input file).
        accession = record.id.split(":")[0]
        if accession in reverse_strain:
            record.seq = record.seq.reverse_complement()
        SeqIO.write(record, outfile, "fasta")

In [17]:
# Display the accessions recorded as reversed, with their [start, end] pairs.
reverse_strain

{'CP097356.1': [1511229, 1484436],
 'CP062154.1': [870039, 843247],
 'CP062151.1': [870038, 843246],
 'CP065370.1': [1650328, 1623541],
 'CP046809.1': [949957, 923170],
 'CP047991.1': [475323, 448536],
 'CP046777.1': [1795278, 1768491],
 'CP034306.1': [1500312, 1473526],
 'CP046753.1': [1154887, 1128100],
 'CP034299.1': [1359913, 1333126],
 'CP128371.1': [1276776, 1249985],
 'CP074416.1': [836920, 810133],
 'CP176358.1': [1751489, 1724702],
 'CP014047.2': [360548, 333761],
 'CP099940.1': [627079, 600293],
 'CP099929.1': [217422, 190637],
 'CP099922.1': [1369976, 1343191],
 'CP064036.1': [1166606, 1139819],
 'CP064034.1': [1671413, 1644626],
 'CP073069.1': [1762777, 1735989],
 'CP020428.2': [1489646, 1462859],
 'CP026042.1': [400408, 373621],
 'CP044070.1': [362375, 335588],
 'CP125857.1': [1018998, 992211],
 'CP176647.1': [750372, 723585],
 'CP047986.1': [1309747, 1281897],
 'CP130652.1': [1254091, 1227304],
 'CP099947.1': [867006, 840219],
 'CP176036.1': [674067, 647280],
 'CP068623.1