In [3]:
from Bio import SeqIO, Phylo, AlignIO

In [4]:
from Bio.Align.Applications import MafftCommandline
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import requests
import os

In [None]:
SzFaldDH = "G2G6S2"
MsACARed = "A4YGN2"

url = f"https://rest.uniprot.org/uniprotkb/{MsACARed}.fasta"

response = requests.get(url)
if response.ok:
    with open("MsACARedog.fasta", "w") as f:
        f.write(response.text)
    print("Outgroup fetched successfully!")
else:
    raise Exception("Failed to fetch outgroup sequence")


Outgroup fetched successfully!


In [20]:
with open("IPR014184.with.MsACARedog", "w") as outfile:
    for fname in ["protein-matching-IPR014184.fasta", "MsACARedog.fasta"]:
        with open(fname) as infile:
            outfile.write(infile.read())

In [21]:
mafft_cline = MafftCommandline(input="IPR014184.with.MsACARedog")
stdout, stderr = mafft_cline()
with open("IPR014184.with.MsACARed_aligned.fasta", "w") as f:
    f.write(stdout)

In [26]:
SzIPR_alignment_20_10_2025 = AlignIO.read("IPR014184.with.MsACARed_aligned.fasta", "fasta")
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(SzIPR_alignment_20_10_2025)
constructor = DistanceTreeConstructor()
SzIPR_tree_20_10_2025 = constructor.nj(dm)

#Rooting Tree with MsACARedog as outgroup
SzIPR_Tree_20_10_2025.root_with_outgroup({"MsACARed": MsACARed})
                                         
                                        
Phylo.draw(SzIPR_tree_20_10_2025)
Phylo.write(SzIPR_tree_20_10_2025, "SzIPR_tree_rooted_with_MsACARedog.nwk", "newick")                                        

KeyboardInterrupt: 

In [None]:
tommyor99@BIO-L2891:~$ sudo apt install fasttree
tommyor99@BIO-L2891:~$ fasttree -gamma -wag IPR014184.with.MsACARed_aligned.fasta > fasttree_GIFaldDH21.10.2025.nwk


In [32]:
with open("IPR014184.with.BtGDFaldDH", "w") as outfile:
    for fname in ["protein-matching-IPR014184.fasta", "GDFaldDH Bacillus.fa"]:
        with open(fname) as infile:
            outfile.write(infile.read())

In [34]:
mafft_cline = MafftCommandline(input="IPR014184.with.BtGDFaldDH")
stdout, stderr = mafft_cline()
with open("IPR014184.with.BtGDFaldDH_aligned.fasta", "w") as f:
    f.write(stdout)

In [None]:
tommyor99@BIO-L2891:~$ fasttree -gamma -wag IPR014184.with.BtGDFaldDH_aligned.fasta > fasttree_GIFaldDH23.10.2025.nwk

In [36]:
with open("IPR014184.with.RhznADH", "w") as outfile:
    for fname in ["protein-matching-IPR014184.fasta", "RhznADHIII.fa"]:
        with open(fname) as infile:
            outfile.write(infile.read())

In [37]:
mafft_cline = MafftCommandline(input="IPR014184.with.RhznADH")
stdout, stderr = mafft_cline()
with open("IPR014184.with.RhznADH_aligned.fasta", "w") as f:
    f.write(stdout)

In [None]:
tommyor99@BIO-L2891:~$ fasttree -gamma -wag IPR014184.with.RhznADH_aligned.fasta > fasttree_GIFaldDH_IPR_Rhz_23.10.2025.nwk

In [None]:
tommyor99@BIO-L2891:~$ cd-hit -i protein-matching-IPR014184.fasta -o clustered_90.fasta -c 0.9 -n 5

In [43]:
with open("IPR014184_clustered-90.with.RhznADH", "w") as outfile:
    for fname in ["protein-matching-IPR014184_clustered_90.fasta", "RhznADHIII.fa"]:
        with open(fname) as infile:
            outfile.write(infile.read())

In [44]:
mafft_cline = MafftCommandline(input="IPR014184_clustered-90.with.RhznADH")
stdout, stderr = mafft_cline()
with open("IPR014184_clustered_90.with.RhznADH_aligned.fasta", "w") as f:
    f.write(stdout)

In [64]:
BmFaldDH = 'A0A0H3KP42'
PaFaldDH = 'Q9HTE3'
PpFaldDH = 'P46154'
RhznADHIII = 'A0ABR5CPA3'

accessions = [BmFaldDH, PpFaldDH, PaFaldDH, RhznADHIII, SzFaldDH]  

for acc in accessions:
    url = f"https://rest.uniprot.org/uniprotkb/{acc}.fasta"
    r = requests.get(url)
    if r.ok:
        with open(f"{acc}.fasta", "w") as f:
            f.write(r.text)
        print(f"Downloaded {acc}.fasta")
    else:
        print(f"Failed to fetch {acc} ({r.status_code})")

Downloaded A0A0H3KP42.fasta
Downloaded P46154.fasta
Downloaded Q9HTE3.fasta
Downloaded A0ABR5CPA3.fasta
Downloaded G2G6S2.fasta


In [65]:
with open("IPR014184_clustered-90.with.RhznADH.with.myFaldDHs", "w") as outfile:
    for fname in ["protein-matching-IPR014184_clustered_90.fasta", SzFaldDH + '.fasta', BmFaldDH + '.fasta' , PaFaldDH + '.fasta', PpFaldDH + '.fasta', "RhznADHIII.fa"]:
        with open(fname) as infile:
            outfile.write(infile.read())

In [66]:
mafft_cline = MafftCommandline(input="IPR014184_clustered-90.with.RhznADH.with.myFaldDHs")
stdout, stderr = mafft_cline()
with open("IPR014184_clustered-90.with.RhznADH.with.myFaldDHs_aligned.fasta", "w") as f:
    f.write(stdout)

In [67]:
tommyor99@BIO-L2891:~$ fasttree -gamma -wag IPR014184_clustered-90.with.RhznADH.with.myFaldDHs_aligned.fasta > fasttree_GIFaldDH_IPR_clust90_Rhz_MyFaldDHs_24.10.2025.nwk

SyntaxError: invalid decimal literal (3657381181.py, line 1)

In [152]:
#Making a full outgroup by blasting the single outgroup sequence RhznADHIII against the NCBI nr database and downloading the top hits as fasta files. Then combining these with the clustered IPR014184 sequences.

!conda install -c bioconda -y blast
!pip install biopython pandas


Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.7.0
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [None]:

from Bio.Blast import NCBIWWW
from Bio import SeqIO

In [153]:
#Blast out sequence
with open("RhznADHIII.fa") as f:
    query_seq = f.read()
    result_handle = NCBIWWW.qblast(
    program="blastp",
    database="nr",
    sequence=query_seq,
    hitlist_size=200,
    format_type="FASTA"
)

#Save blast results
blast_output = "Rhzn_blast.fasta"
with open(blast_output, "w") as out_f:
    out_f.write(result_handle.read())

In [158]:
from Bio.Blast import NCBIXML
from Bio import SeqIO

# First make blast database from the input fasta file
!makeblastdb -in protein-matching-IPR014183.fasta -dbtype prot -out IPR014183_db

# Then perform the blast search and output in fasta format
!blastp -query RhznADHIII.fa -db IPR014183_db \
    -outfmt '5' \
    -evalue 1e-5 \
    -max_target_seqs 200 \
    -out RhznADHIII_vs_IPR014183.xml





Building a new DB, current time: 10/28/2025 14:04:35
New DB name:   /home/tommyor99/IPR014183_db
New DB title:  protein-matching-IPR014183.fasta
Sequence type: Protein
Deleted existing Protein BLAST database named /home/tommyor99/IPR014183_db
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 14375 sequences in 0.251407 seconds.


In [159]:
# Convert blast XML output to FASTA

blast_records = NCBIXML.parse(open("RhznADHIII_vs_IPR014183.xml"))
sequences = []

for blast_record in blast_records:
        for alignment in blast_record.alignments:
                sequences.append(alignment.title.split()[0])

# Write sequences to fasta file
with open("RhznADHIII_vs_IPR014183.fasta", "w") as output:
        for record in SeqIO.parse("protein-matching-IPR014183.fasta", "fasta"):
                if record.id in sequences:
                        SeqIO.write(record, output, "fasta")

In [None]:
#combine out blast with IPR database sequences
IPR014184_with_Rhzn_outgroup = "IPR014184_with_Rhzn_outgroup.fasta"

records = list(SeqIO.parse("protein-matching-IPR014184.fasta", "fasta"))
blast_records = list(SeqIO.parse(blast_output, "fasta"))

SeqIO.write(records + blast_records, IPR014184_with_Rhzn_outgroup, "fasta")
print(f"Combined FASTA written to {IPR014184_with_Rhzn_outgroup}")


Combined FASTA written to IPR014184_with_Rhzn_outgroup.fasta



Nowadays, the FASTA file format is usually understood not to have any such comments, and most software packages do not allow them. Therefore, the use of comments at the beginning of a FASTA file is now deprecated in Biopython.


(1) Modify your FASTA file to remove such comments at the beginning of the file.

(2) Use SeqIO.parse with the 'fasta-pearson' format instead of 'fasta'. This format is consistent with the FASTA format defined by William Pearson's FASTA aligner software. Thie format allows for comments before the first sequence; lines starting with the ';' character anywhere in the file are also regarded as comment lines and are ignored.

(3) Use the 'fasta-blast' format. This format regards any lines starting with '!', '#', or ';' as comment lines. The 'fasta-blast' format may be safer than the 'fasta-pearson' format, as it explicitly indicates which lines are comments. 


In [None]:

# Output file
IPR014184_with_Rhzn_outgroup = "IPR014184_with_Rhzn_outgroup.fasta"

# Parse input FASTAs using fasta-blast format
records = list(SeqIO.parse("protein-matching-IPR014184.fasta", "fasta-pearson"))
blast_records = list(SeqIO.parse(blast_output, "fasta-pearson"))

# Combine and write
SeqIO.write(records + blast_records, IPR014184_with_Rhzn_outgroup, "fasta")
print(f"Combined FASTA written to {IPR014184_with_Rhzn_outgroup}")

Combined FASTA written to IPR014184_with_Rhzn_outgroup.fasta


In [None]:
conda install -c bioconda mafft

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.7.0
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /home/tommyor99/miniforge3/envs/jupyter

  added / updated specs:
    - mafft


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    gawk-5.3.1                 |       hcd3d067_0         1.1 MB  conda-forge
    gmp-6.3.0                  |       hac33072_2         449 KB  conda-forge
    mafft-7.525                |       h031d066_1         3.3 MB  bioconda
    mpfr-4.2.1                 |       h90cbb55_3         620 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         5.5 MB

The following NEW p

In [None]:
from Bio import SeqIO
list(SeqIO.parse("combined_outgroup.fasta", "fasta"))

[SeqRecord(seq=Seq('MSGNRGVVYLGNGKVEVQKIDYPKMQDPRGRKIEHGVILRVVSTNICGSDQHMV...SAA'), id='A0A010RT37|unreviewed|Glutathione-independent', name='A0A010RT37|unreviewed|Glutathione-independent', description='A0A010RT37|unreviewed|Glutathione-independent formaldehyde dehydrogenase|taxID:1042209', dbxrefs=[]),
 SeqRecord(seq=Seq('MSGNRGVVYLGAGKVEVQTIPYPKMEDPRGKRIDHGVILRVVSTNICGSDQHMV...SAA'), id='A0A010SQR9|unreviewed|Glutathione-independent', name='A0A010SQR9|unreviewed|Glutathione-independent', description='A0A010SQR9|unreviewed|Glutathione-independent formaldehyde dehydrogenase|taxID:1042209', dbxrefs=[]),
 SeqRecord(seq=Seq('MSKNRGVVYLRPGTVEVRDIEDPALAAPDGRKLDHAVILKVISTNICGSDQHMV...KVA'), id='A0A011T3Q6|unreviewed|Formaldehyde', name='A0A011T3Q6|unreviewed|Formaldehyde', description='A0A011T3Q6|unreviewed|Formaldehyde dehydrogenase, glutathione-independent|taxID:529', dbxrefs=[]),
 SeqRecord(seq=Seq('MASNRGVVYLGPGKVEVQSIDYPKFVDPRGKEIHHGVILKVVSTNICGSDQHMV...GLL'), id='A0A017T2E6|unreviewed|

In [None]:
import subprocess

input_fasta = "combined_outgroup.fasta"
aligned_fasta = "combined_outgroup_aligned.fasta"

cmd = ["mafft", "--auto", "--thread", "1", input_fasta]

with open(aligned_fasta, "w") as out_f:
    subprocess.run(cmd, stdout=out_f, check=True)

print(f"Alignment written to {aligned_fasta}")

nthread = 1
nthreadpair = 1
nthreadtb = 1
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
=== 
=== Alphabet 'O' is unknown.
=== Please check site 7 in sequence 3385.
=== 
=== To make an alignment that has unusual characters (U, @, #, etc), try  
=== % mafft --anysymbol input > output
=== 
Illegal character O


CalledProcessError: Command '['mafft', '--auto', '--thread', '1', 'combined_outgroup.fasta']' returned non-zero exit status 1.

Cleaned FASTA written to IPR014184_with_Rhznoutgroup_clean.fasta


In [None]:
#align with mafft

import subprocess

input_fasta = "IPR014184_with_Rhznoutgroup_clean.fasta"
aligned_fasta = "IPR014184_with_Rhznoutgroup_aligned.fasta"

cmd = ["mafft", "--auto", "--thread", "1", "--anysymbol", input_fasta]

with open(aligned_fasta, "w") as out_f:
    subprocess.run(cmd, stdout=out_f, check=True)

print(f"Alignment written to {aligned_fasta}")



inputfile = orig
3584 x 2526 - 348 p
nthread = 1
nthreadpair = 1
nthreadtb = 1
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 47648 ambiguous characters.
 3501 / 3584 (thread    0)
done.

Constructing a UPGMA tree (efffree=0) ... 
 3580 / 3584
done.

Progressive alignment 1/2... 
STEP  1701 / 3583 (thread    0)
Reallocating..done. *alloclen = 7126
STEP  2101 / 3583 (thread    0)
Reallocating..done. *alloclen = 10301
STEP  3501 / 3583 (thread    0)
done.

Making a distance matrix from msa.. 
 3500 / 3584 (thread    0)
done.

Constructing a UPGMA tree (efffree=1) ... 
 3580 / 3584
done.

Progressive alignment 2/2... 
STEP  2001 / 3583 (thread    0)
Reallocating..done. *alloclen = 6450
STEP  2301 / 3583 (thread    0)
Reallocating..done. *alloclen = 7543
STEP  2901 / 3583 (thread    0) h
Reallocating..done. *alloclen = 8658
STEP  3401 / 3583 (thread    0)
Reallocating..done. *alloclen = 9735
STEP  3501 / 3583 (thre

Alignment written to IPR014184_with_Rhznoutgroup_aligned.fasta


In [126]:
#looks like there are alot of weird characters. Checking to see what they are

from Bio import SeqIO

input_file = "combined_outgroup.fasta"
valid_aa = set("ACDEFGHIKLMNPQRSTVWY")  # only standard amino acids

for rec in SeqIO.parse(input_file, "fasta"):
    seq = str(rec.seq).upper()
    ambiguous = set(seq) - valid_aa
    if ambiguous:
        print(f">{rec.id} has ambiguous characters: {', '.join(sorted(ambiguous))}")

>A0A061S3Q9|unreviewed|Alcohol has ambiguous characters: X
>A0A0D6E3U6|unreviewed|Formaldehyde has ambiguous characters: X
>A0A0P9Y2X2|unreviewed|Alcohol has ambiguous characters: X
>A0A2A4VJN1|unreviewed|Formaldehyde has ambiguous characters: X
>A0A2E2SW66|unreviewed|Formaldehyde has ambiguous characters: X
>A0A9D5TY48|unreviewed|Formaldehyde has ambiguous characters: X
>WP_045022451.1 has ambiguous characters: %, (, ), ,, -, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, =, B, J, O, U, X, Z, [, ]
>WP_332302355.1 has ambiguous characters: %, (, ), +, ,, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, =, B, J, O, U, X, Z, [, ]
>WP_045024761.1 has ambiguous characters: %, (, ), +, ,, -, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, =, B, J, O, U, X, Z, [, ]
>WP_374636746.1 has ambiguous characters: %, (, ), +, ,, -, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, =, B, J, O, U, X, Z, [, ]
>MGG2475292.1 has ambiguous characters: %, (, ), +, ,, ., /, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, :, =, B, J, O, U, X, Z, [, ]
>WP_00631444

In [None]:
#downloaded a big fasta file for the type III ADHs, now blasting these against the RhznADHIII sequence to find the closest 200 relatives to use as outgroup sequences
from Bio.Blast import NCBIWWW
from Bio import SeqIO

!conda install -c bioconda -y blast
!pip install biopython pandas




!blastp -query RhznADHIII.fa -db IPR014183_db \
  -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" \
  -evalue 1e-5 -max_target_seqs 200 -num_threads 4 -out RhznADHIII_IPR014183_blast_correctformat.tsv


Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.7.0
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [151]:
import pandas as pd
cols = ["qseqid","sseqid","pident","length","mismatch","gapopen","qstart","qend","sstart","send","evalue","bitscore"]
df = pd.read_csv("RhznADHIII_IPR014183_blast_correctformat.tsv", sep="\t", names=cols)
df = df.sort_values(["evalue","bitscore"], ascending=[True,False])
display(df.head(50))

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,A0ABR5CPA3,A0ABR5CPA3|unreviewed|Alcohol,100.0,375,0,0,1,375,1,375,0.0,769
1,A0ABR5CPA3,A0ABR5CZ58|unreviewed|Alcohol,98.933,375,4,0,1,375,1,375,0.0,764
2,A0ABR5CPA3,A9CIS7|unreviewed|S-(hydroxymethyl)glutathione,98.4,375,6,0,1,375,1,375,0.0,759
3,A0ABR5CPA3,A0A3G2D4K7|unreviewed|S-(hydroxymethyl)glutath...,98.4,375,6,0,1,375,1,375,0.0,759
4,A0ABR5CPA3,A0ABY3BTN4|unreviewed|S-(Hydroxymethyl)glutath...,98.133,375,7,0,1,375,1,375,0.0,759
5,A0ABR5CPA3,A0AB36EPA4|unreviewed|S-(hydroxymethyl)glutath...,98.133,375,7,0,1,375,1,375,0.0,758
6,A0ABR5CPA3,A0AA94VF32|unreviewed|S-(hydroxymethyl)glutath...,98.133,375,7,0,1,375,1,375,0.0,758
7,A0ABR5CPA3,A0AAJ4N1B6|unreviewed|S-(hydroxymethyl)glutath...,97.867,375,8,0,1,375,1,375,0.0,758
8,A0ABR5CPA3,A0A9X3QXT2|unreviewed|S-(hydroxymethyl)glutath...,97.867,375,8,0,1,375,1,375,0.0,758
9,A0ABR5CPA3,A0ABS3EJK7|unreviewed|S-(Hydroxymethyl)glutath...,97.6,375,9,0,1,375,1,375,0.0,757


In [148]:
#combine RhznADHIII_vs_IPR014184.fasta with protein-matching-IPR014184.fasta
from Bio import SeqIO

def combine_fasta_files():
    # Open and read the first file
    with open('RhznADHIII_vs_IPR014183.fasta', 'r') as file1:
        content1 = file1.read()
    
    # Open and read the second file
    with open('protein-matching-IPR014184.fasta', 'r') as file2:
        content2 = file2.read()
    
    # Combine and write to new file
    with open('IPR014184_with_IPR014183_outgroup.fasta', 'w') as outfile:
        outfile.write(content1)
        # Add newline if needed between files
        if not content1.endswith('\n'):
            outfile.write('\n')
        outfile.write(content2)

if __name__ == '__main__':
    try:
        combine_fasta_files()
        print("Files successfully combined into 'IPR014184_with_IPR014183_outgroup.fasta'")
    except FileNotFoundError:
        print("Error: One or both input files not found")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Files successfully combined into 'IPR014184_with_IPR014183_outgroup.fasta'


In [144]:
#find sequences with ambiguous characters

ambiguous_seqs = []
for rec in SeqIO.parse(input_file, "fasta"):
    seq = str(rec.seq).upper()
    seq_letters = ''.join(c for c in seq if c.isalpha())
    ambiguous = sorted(set(seq_letters) - valid_aa)
    if ambiguous:
        ambiguous_seqs.append((rec.id, ambiguous))
        print(f"{rec.id}: ambiguous characters: {', '.join(ambiguous)}")

if not ambiguous_seqs:
    print("No sequences with ambiguous characters found.")

A0A061S3Q9|unreviewed|Alcohol: ambiguous characters: X
A0A0D6E3U6|unreviewed|Formaldehyde: ambiguous characters: X
A0A0P9Y2X2|unreviewed|Alcohol: ambiguous characters: X
A0A2A4VJN1|unreviewed|Formaldehyde: ambiguous characters: X
A0A2E2SW66|unreviewed|Formaldehyde: ambiguous characters: X
A0A9D5TY48|unreviewed|Formaldehyde: ambiguous characters: X


In [145]:
mafft_cline = MafftCommandline(input="IPR014184_with_correct_Rhzn_outgroup.fasta")
stdout, stderr = mafft_cline()
with open("IPR014184_with_correct_Rhzn_outgroup_aligned.fasta", "w") as f:
    f.write(stdout)

In [None]:
tommyor99@BIO-L2891:~$ fasttree -gamma -wag IPR014184_with_correct_Rhzn_outgroup_aligned.fasta > fasttree_FaldDH_ADHIIIog_28.10.2025.nwk