# Code snippets for working with the proFASTA library

## Removing invalid characters from imported protein sequences

In [1]:
import profasta


def cleanup_protein_sequences(
    db: profasta.ProteinDatabase, alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
) -> None:
    """Remove non-alphabet characters from protein sequences in the ProteinDatabase.

    The ``sequence`` attribute of every entry returned by ``db.values()`` is
    overwritten in place. Retained characters keep their original order.

    Args:
        db: A profasta.ProteinDatabase instance; its entries are modified in place.
        alphabet: Characters that are allowed in the protein entry sequences. May be
            a string or any other iterable of single-character strings. Matching is
            case sensitive; the default allows the uppercase letters A-Z only.
    """
    # Build the lookup once so that every membership test is O(1) and so that a
    # one-shot iterable passed as `alphabet` is not exhausted by the first entry.
    allowed = frozenset(alphabet)
    for entry in db.values():
        entry.sequence = "".join(aa for aa in entry.sequence if aa in allowed)


fasta_path = "./uniprot_hsapiens_10entries.fasta"

# Use the top-level `profasta.ProteinDatabase` name, the same one the type
# annotation of cleanup_protein_sequences() refers to, instead of reaching into
# the `profasta.db` submodule, which this cell never imports explicitly.
db = profasta.ProteinDatabase()
db.add_fasta(fasta_path, header_parser="uniprot")

# Overwrite one sequence with characters outside the default alphabet so that
# the effect of the cleanup is visible in the printed result.
db["O75385"].sequence = "MEPG_-+123"
cleanup_protein_sequences(db)

print(db["O75385"].sequence)

MEPG


## Converting FASTA headers into a UniProt-like format

In [2]:
import profasta
import profasta.parser


class CustomHeaderParser:
    """Header parser for FASTA headers of the form ``ProteinID free text name``."""

    @classmethod
    def parse(cls, header: str) -> profasta.parser.ParsedHeader:
        """Build a ParsedHeader from a custom formatted FASTA header string.

        The first whitespace-delimited token of the header is used as the protein
        identifier and as the gene name, and with a "_CUSTOM" suffix as the entry
        name. Any text after the first token is stored as the protein name.

        Header format example:
        >ProteinID hypothetical protein name

        Args:
            header: The FASTA header string to parse.

        Returns:
            A ParsedHeader created from the identifier, the unmodified header
            string, and the dictionary of extracted fields.
        """
        # Splitting on whitespace at most once separates the identifier from the
        # optional remainder of the header.
        tokens = header.split(maxsplit=1)
        protein_id = tokens[0]
        remainder = tokens[1:]

        header_fields = dict(
            db="xx",
            identifier=protein_id,
            entry_name=f"{protein_id}_CUSTOM",
            gene_name=protein_id,
        )
        # The protein name is only set when the header contains text after the
        # identifier.
        if remainder:
            header_fields["protein_name"] = remainder[0]

        return profasta.parser.ParsedHeader(protein_id, header, header_fields)

# Input file with the custom header format and output file for the converted entries.
fasta_path = "./custom_header_format.fasta"
converted_fasta_path = "./custom_header_format.uniprot-like.fasta"

# Make the parser available under the name "custom_parser"; this name is passed
# as the `header_parser` argument when the FASTA file is added below.
profasta.parser.register_parser("custom_parser", CustomHeaderParser)

# Read the FASTA file, parsing its headers with the registered custom parser.
protein_db = profasta.ProteinDatabase()
protein_db.add_fasta(fasta_path, header_parser="custom_parser")

# Export all entries to a new FASTA file, using the "uniprot_like" header writer.
protein_db.write_fasta(converted_fasta_path, header_writer="uniprot_like")

## Creating a combined FASTA file with added decoy entries



In [3]:
import profasta

fasta_path = "./uniprot_hsapiens_10entries.fasta"
decoy_fasta_path = "./uniprot_hsapiens_10entries_DECOY.fasta"

# Import the FASTA file. The database is created through the top-level
# `profasta.ProteinDatabase` name rather than the `profasta.db` submodule, which
# this cell never imports explicitly.
db = profasta.ProteinDatabase()
db.add_fasta(fasta_path, header_parser="uniprot")

# Create the new FASTA file and write the original entries to it.
db.write_fasta(decoy_fasta_path, header_writer="uniprot")

# Create a decoy database from the original database, containing reversed sequences.
decoy_db = profasta.create_decoy_db(db, keep_nterm_methionine=True)

# Append the decoy entries to the new FASTA file, so that it contains the original
# entries followed by the decoy entries.
decoy_db.write_fasta(decoy_fasta_path, header_writer="decoy", append=True)