In [11]:
import cogent3
from cogent3 import load_aligned_seqs
from cogent3 import get_app
from phylim.apps import phylim


def sample_one_per_species(names, species_key=None, seed=7):
    """Return one pseudo-randomly chosen name per species.

    Names are grouped by the species id that ``species_key`` extracts from
    them, and one member of each group is drawn. Groups are kept in the order
    their species first appears in ``names``, so the result holds one entry
    per species in first-appearance order.

    Parameters
    ----------
    names : iterable of str
        Sequence names to sample from.
    species_key : callable, optional
        Function mapping a name to its species id. Defaults to the part of
        the name before the first ':'.
    seed : int or None, optional
        Seed for the private random generator used for the draws. The default
        (7) gives the same picks as the previously hard-coded seed. Pass
        ``None`` for a non-reproducible draw.

    Returns
    -------
    list
        One name per species; empty if ``names`` is empty.
    """
    import random

    # A private generator keeps the draw reproducible without reseeding the
    # module-level `random` state that other code may rely on.
    rng = random.Random(seed)

    if species_key is None:
        def species_key(name):
            return name.split(":")[0]

    # dicts preserve insertion order, so species stay in first-seen order
    by_species = {}
    for name in names:
        by_species.setdefault(species_key(name), []).append(name)

    return [rng.choice(group) for group in by_species.values()]

@cogent3.app.composable.define_app
def renamer_sampler(seqs: cogent3.app.typing.AlignedSeqsType) -> cogent3.app.typing.AlignedSeqsType:
    """
    Keep one sequence per species, relabel the apes and return only those.

    One name per species is drawn with ``sample_one_per_species`` (species id
    is the text before the first ':'). The sampled sequences whose species id
    is ``homo_sapiens``, ``pan_troglodytes`` or ``gorilla_gorilla`` are renamed
    to "Human", "Chimpanzee" and "Gorilla"; every other name is left as is.
    The result is restricted to those three common names.

    NOTE(review): presumably all three species must be present in ``seqs``
    for the final ``take_seqs`` to succeed — confirm against cogent3.
    """
    common_names = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla",
    }

    def to_common_name(label):
        # fall back to the original label for species outside the map
        return common_names.get(label.split(":")[0], label)

    chosen = sample_one_per_species(seqs.names, species_key=None)
    one_per_species = seqs.take_seqs(chosen)
    relabelled = one_per_species.rename_seqs(to_common_name)

    wanted = list(common_names.values())
    return relabelled.take_seqs(wanted)

@cogent3.app.composable.define_app
def renamer(seqs: cogent3.app.typing.AlignedSeqsType) -> cogent3.app.typing.AlignedSeqsType:
    """
    Relabel the ape sequences with common names and return only those.

    Sequences whose name prefix (the text before the first ':') is
    ``homo_sapiens``, ``pan_troglodytes`` or ``gorilla_gorilla`` are renamed
    to "Human", "Chimpanzee" and "Gorilla"; every other name is left as is.
    The result is restricted to those three common names.

    NOTE(review): no sampling is done here, so this presumably expects each
    of the three species to occur exactly once in ``seqs`` — confirm.
    """
    common_names = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla",
    }

    def to_common_name(label):
        # fall back to the original label for species outside the map
        return common_names.get(label.split(":")[0], label)

    relabelled = seqs.rename_seqs(to_common_name)

    wanted = list(common_names.values())
    return relabelled.take_seqs(wanted)

In [12]:
# Directory holding the FASTA alignments used in this cell.
# NOTE(review): absolute, machine-specific path — as written this cell only
# runs where that directory exists; consider a configurable data directory.
folder_in = '/home/uliseshmc/Documents/Proyectos/EstimatingUd/Data/Gavin_apes114/test_intergenicAR_1column/'
# Read-only data store over the '.fa' files in that directory.
# NOTE(review): not used again in this cell — the alignment below is loaded
# directly by path; check whether later cells need it.
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

#in_dstore.describe

# Alternative input file, currently disabled:
#sequence = 'homo_sapiens-22-15915800-16141765.fa'
sequence = 'homo_sapiens-22-17353177-17359947.fa'


# folder_in ends with '/', so plain concatenation gives the full file path.
aln = load_aligned_seqs(filename = folder_in + sequence, format="fasta", moltype='dna')
# Display setting: presumably limits the repr to 40 alignment positions — TODO confirm.
aln.set_repr_policy(num_pos=40)
# Presumably drops alignment columns containing degenerate characters — TODO confirm.
aln = aln.no_degenerates()
# Instantiate the `renamer` app and apply it to the loaded alignment.
rename = renamer()
aln = rename(aln)
# Bare last expression so the notebook displays the resulting alignment.
aln

reason='no longer has an effect'
  deprecated(


0,1
,0
Chimpanzee,AATCCTAGCACACACT
Human,................
Gorilla,C..T.....T......
