In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

In [2]:
# One-letter code -> residue fragment (backbone N, C-alpha with side chain,
# carbonyl C=O). The strings are compiled with Chem.MolFromSmarts below, so
# they act as substructure *queries*: an unbracketed atom places no constraint
# on hydrogens or extra substituents. As a result the shorter fragments are
# substructures of the longer ones (e.g. the Gly query also matches the
# backbone of every other residue).
amino_acids = {
    "G": "NCC(=O)",         # Gly
    "A": "NC(C)C(=O)",      # Ala
    "V": "NC(C(C)C)C(=O)",  # Val
    "L": "NC(CC(C)C)C(=O)", # Leu
    "I": "NC(C(C)CC)C(=O)", # Ile
    "S": "NC(CO)C(=O)",     # Ser
    "T": "NC(C(O)C)C(=O)",  # Thr
    "D": "NC(CC(=O)O)C(=O)",# Asp
    "E": "NC(CCC(=O)O)C(=O)",# Glu
    "N": "NC(CC(N)=O)C(=O)",# Asn
    "Q": "NC(CCC(N)=O)C(=O)",# Gln
    "K": "NC(CCCCN)C(=O)",  # Lys
    "R": "NC(CCCNC(N)=N)C(=O)",# Arg
    # NOTE(review): in a SMARTS query `[nH]` requires exactly one hydrogen on
    # that ring nitrogen, so only this imidazole tautomer is recognised --
    # confirm the input SMILES are written with the same tautomer.
    "H": "NC(Cc1c[nH]cn1)C(=O)",# His
    "F": "NC(Cc1ccccc1)C(=O)",# Phe
    "Y": "NC(Cc1ccc(O)cc1)C(=O)",# Tyr
    # Indole: five-membered pyrrole ring fused to benzene. The previous
    # fragment (Cc1c2ccccc2[nH]1) closed a four-membered ring and therefore
    # could never match a tryptophan side chain.
    "W": "NC(Cc1c[nH]c2ccccc12)C(=O)",# Trp
    "P": "N1CCCC1C(=O)",    # Pro
    "M": "NC(CCSC)C(=O)",   # Met
    "C": "NC(CS)C(=O)",     # Cys
}

# Compiled query molecule -> one-letter code, in the same order as amino_acids.
aa_smarts_to_code = {Chem.MolFromSmarts(v): k for k, v in amino_acids.items()}

In [3]:
def smiles_to_sequence(smiles):
    """Derive a one-letter amino-acid sequence from a peptide SMILES string.

    Every query in ``aa_smarts_to_code`` is matched against the parsed
    molecule. The first atom of each query is the backbone nitrogen, so hits
    are grouped by that nitrogen's atom index and only the query with the most
    atoms is kept for each nitrogen. Without this step a single residue is
    reported several times, because the smaller fragments (Gly, Ala, ...) are
    substructures of the larger ones.

    Parameters
    ----------
    smiles : str
        SMILES of the peptide. Non-string values (for example NaN coming from
        an empty CSV cell) are treated as unparseable.

    Returns
    -------
    str or None
        Residue codes joined in order of increasing backbone-nitrogen atom
        index, or ``None`` when the input cannot be parsed or no residue
        pattern matches.

    Notes
    -----
    Ordering by atom index yields N-terminal to C-terminal order only when the
    SMILES itself is written in that direction -- this is assumed, not checked.
    """
    # Chem.MolFromSmiles only accepts text; a NaN cell would raise instead of
    # returning None, aborting the whole DataFrame.apply.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # backbone N atom index -> (query atom count, one-letter code)
    best_by_nitrogen = {}
    for patt, code in aa_smarts_to_code.items():
        patt_size = patt.GetNumAtoms()
        for match in mol.GetSubstructMatches(patt):
            n_idx = match[0]  # first query atom is the backbone nitrogen
            current = best_by_nitrogen.get(n_idx)
            # Strict '>' keeps the earlier-listed query if two of equal size
            # ever hit the same nitrogen.
            if current is None or patt_size > current[0]:
                best_by_nitrogen[n_idx] = (patt_size, code)

    if not best_by_nitrogen:
        return None

    # Sort by atom index (N-terminal to C-terminal)
    return ''.join(best_by_nitrogen[n_idx][1] for n_idx in sorted(best_by_nitrogen))


In [4]:
# Load the identified peptides, annotate each row with the sequence derived
# from its SMILES, and keep only the rows for which a sequence was found.
df = (
    pd.read_csv("identified_peptides.csv")
    .assign(sequence=lambda frame: frame['canonical SMILES'].apply(smiles_to_sequence))
    .dropna(subset=['sequence'])
)

# Persist the annotated table without the pandas index column.
df.to_csv("identified_peptides_with_sequences.csv", index=False)