In [44]:
import cogent3
from cogent3 import load_aligned_seqs
from cogent3 import get_app

@cogent3.app.composable.define_app
def renamer(seqs: cogent3.app.typing.AlignedSeqsType) -> cogent3.app.typing.AlignedSeqsType:
    """
    A function to rename sequences in a dataset.
    """

    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }

    seqs = seqs.rename_seqs(lambda x: name_map.get(x.split(":")[0], x))

    return seqs.take_seqs(list(name_map.values()))

def sample_one_per_species(names, species_key=None):
    """Return one random name per species from a list of sequence names.

    - species_key: optional function to extract species id from a name (default: part before ':')
    - seed: optional int to make selection deterministic
    """
    import random
    seed = 10
    random.seed(seed)

    if species_key is None:
        species_key = lambda s: s.split(":")[0]

    by_species = {}
    for n in names:
        sp = species_key(n)
        by_species.setdefault(sp, []).append(n)

    # choose one random name per species
    sampled = [random.choice(group) for group in by_species.values()]
    return sampled


In [45]:
import cogent3
from cogent3 import load_aligned_seqs
from cogent3 import get_app

def sample_one_per_species(names, species_key=None):
    """Return one random name per species from a list of sequence names.

    - species_key: optional function to extract species id from a name (default: part before ':')
    - seed: optional int to make selection deterministic
    """
    import random
    seed = 7
    random.seed(seed)

    if species_key is None:
        species_key = lambda s: s.split(":")[0]

    by_species = {}
    for n in names:
        sp = species_key(n)
        by_species.setdefault(sp, []).append(n)

    # choose one random name per species
    sampled = [random.choice(group) for group in by_species.values()]
    return sampled

@cogent3.app.composable.define_app
def renamer_sampler(seqs: cogent3.app.typing.AlignedSeqsType) -> cogent3.app.typing.AlignedSeqsType:
    """
    A function to rename sequences in a dataset.
    """
    allnames = seqs.names

    samplednames = sample_one_per_species(allnames, species_key=None)
    sampledseqs = seqs.take_seqs(samplednames)

    name_map = {
        "homo_sapiens": "Human",
        "pan_troglodytes": "Chimpanzee",
        "gorilla_gorilla": "Gorilla"
    }

    sampledseqs = sampledseqs.rename_seqs(lambda x: name_map.get(x.split(":")[0], x))

    return sampledseqs.take_seqs(list(name_map.values()))


In [47]:
folder_in = '/home/uliseshmc/Documents/Proyectos/EstimatingUd/Data/Gavin_apes114/test_intergenic_1column/'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

#in_dstore.describe

#sequence = 'homo_sapiens-22-15915800-16141765.fa'
sequence = 'homo_sapiens-22-17353177-17359947.fa'


aln = load_aligned_seqs(filename = folder_in + sequence, format="fasta", moltype='dna')
aln.set_repr_policy(num_pos=40)

aln

reason='no longer has an effect'
  deprecated(


0,1
,0
pan_troglodytes:22:4538060-4540232:-1,????????????????????????????????????????
homo_sapiens:22:17357760-17359947:-1,........................................
gorilla_gorilla:22:1056208-1056635:-1,----------------------------------------


In [53]:
rename = renamer_sampler()
aln = rename(aln)


In [54]:
aln2 = aln.no_degenerates()
aln2.set_repr_policy(num_pos=40)
aln2

0,1
,0
Chimpanzee,AATCCTAGCACACACT
Human,................
Gorilla,C..T.....T......


In [None]:
sampled_names = sample_one_per_species(allnames)
sampled_aln = aln.take_seqs(sampled_names)
sampled_aln

In [75]:
samplealn = aln.take_seqs("homo_sapiens:22:16105098-16141765:-1")
startSINEregion = 16105523
endSINEregion = 16105806
startposinframe = startSINEregion-16105098
endposinframe = endSINEregion-16105098
sample2 = samplealn[0:endposinframe]
sample2.set_repr_policy(num_pos=4000)
sample2

0,1
,0
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,60
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,120
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,180
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,240
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????


In [67]:
folder_in = '/home/uliseshmc/Documents/Proyectos/EstimatingUd/Data/Gavin_apes114/intergenic_2column/'
in_dstore = cogent3.open_data_store(folder_in, suffix='fa', mode='r')

sequence = 'homo_sapiens-22-15915800-16141765.fa'
aln = load_aligned_seqs(filename = folder_in + sequence, format="fasta", moltype='dna')
aln.set_repr_policy(num_pos=40)

aln2 = aln.no_degenerates()
aln2.set_repr_policy(num_pos=40)
aln2

reason='no longer has an effect'
  deprecated(


0,1
,0
homo_sapiens:14:18305967-18343036:-1,TGGCTAAGAATATGTACGTGAATTTTAAAAACGTAGTAAC
homo_sapiens:18:15320945-15357916:1,.....G................C...........CA....
pan_troglodytes:KV420794.1:43768-80951:1,.....G..........T.....C.................
homo_sapiens:22:16105098-16141765:-1,.....G...........A....C.................
homo_sapiens:21:13010300-13047581:-1,.....G................C...........CA....
pan_troglodytes:2B:18548621-18585375:1,.....G..........T.....C.................
homo_sapiens:22:15236759-15273853:-1,........................................
homo_sapiens:9:64880476-64916438:1,.....G................C............A....


In [68]:
samplealn = aln.take_seqs("homo_sapiens:22:16105098-16141765:-1")
startSINEregion = 16105523
startposinframe = 16105523-16105098
SINEregion = "GGCAGGGCGCGGTGGCTCACGCCTATAATCCCAGTACTTTGGGAGGCTGA"
sample2 = samplealn[startposinframe-425:startposinframe+182]
sample2.set_repr_policy(num_pos=4000)
sample2

0,1
,0
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,60
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,120
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,180
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????
,240
homo_sapiens:22:16105098-16141765:-1,????????????????????????????????????????????????????????????


In [38]:
loader = get_app("load_aligned", moltype="dna") 
rename = renamer()
concat = get_app("concat", moltype="dna")

#app = loader + rename
app = loader

results_allpos = [r for r in app.as_completed(in_dstore[7:8], parallel=True) if r]
results_allpos = concat(results_allpos)
results_allpos.set_repr_policy(num_pos=10)
results_allpos

AttributeError: 'NotCompleted' object has no attribute 'set_repr_policy'