In [1]:
# Nexus file used throughout this notebook: it is parsed for the gene
# partition lines and also read as the alignment that gets split per locus.
data_path = '../data/Philippe2009/UPDUNN_MB.nex'

In [2]:
from reprophylo import *
def parse_philippe_mrbayes_partitions(mrbayes_nexus):
    """Read gene names and their column ranges from a MrBayes nexus file.

    Two lines of the file are used:

    * a line containing ``[genes``; the text between the first and second
      ``:`` (with ``]`` removed) is a comma separated list of gene names;
    * a line containing ``partition part = ``; the text between the first
      and second ``:`` (with ``;`` removed) is a comma separated list of
      ``start-end`` column ranges.

    If either kind of line occurs more than once, the last occurrence wins.
    Names and ranges are paired by position; surplus ranges are ignored.

    Parameters
    ----------
    mrbayes_nexus : str
        Path of the nexus file to parse.

    Returns
    -------
    dict
        ``{gene_name: [start - 1, end]}``, i.e. the range converted so it
        can be used directly as a Python slice ``[start - 1:end]``.
        Empty when the file has no ``[genes`` line.

    Raises
    ------
    ValueError
        If there are fewer ranges than gene names, or a bound is not an
        integer.
    """
    gene_names = []
    ranges = []
    # Iterate the handle lazily; only two lines of the file are relevant.
    with open(mrbayes_nexus) as partfile:
        for line in partfile:
            if '[genes' in line:
                payload = line.rstrip().split(':')[1].replace(']', '')
                # Strip each name so "a, b" does not produce the key " b",
                # which would otherwise end up in locus and file names.
                gene_names = [name.strip() for name in payload.split(',')]
            if 'partition part = ' in line:
                payload = line.rstrip().split(':')[1].replace(';', '')
                ranges = [[bound.strip() for bound in token.split('-')]
                          for token in payload.split(',')]

    if len(ranges) < len(gene_names):
        raise ValueError(
            '%s: found %i gene names but only %i partition ranges'
            % (mrbayes_nexus, len(gene_names), len(ranges)))

    parts = {}
    # zip stops at the shorter list, so surplus ranges are ignored.
    for name, bounds in zip(gene_names, ranges):
        parts[name] = [int(bounds[0]) - 1, int(bounds[1])]
    return parts
    

def pj_from_nexus(phylip_filename, charsets, output_dir, char_type,
                  feature_type, project=False, pickle=False, git=False):
    """Split a concatenated nexus alignment into one FASTA file per locus.

    Parameters
    ----------
    phylip_filename : str
        Path of the concatenated alignment. Despite the parameter name the
        file is read with Biopython's 'nexus' parser.
    charsets : dict
        ``{locus_name: [start, end]}``; columns ``start:end`` of the
        alignment are written out for that locus.
    output_dir : str
        Directory receiving ``<locus_name>.fasta``. It is not created
        here, so it must already exist.
    char_type, feature_type :
        Passed unchanged to ``Locus`` and to ``Project.read_alignment``.
    project : bool
        When true, build a ``Project`` from the loci, read every written
        alignment into it and return the project.
    pickle, git :
        Passed unchanged to the ``Project`` constructor.

    Returns
    -------
    Project
        When ``project`` is true.
    (list, list)
        Otherwise: the written FASTA paths and the matching ``Locus``
        objects, in the iteration order of ``charsets``.
    """
    from reprophylo import Locus
    from Bio import AlignIO

    alignment = AlignIO.read(phylip_filename, 'nexus')
    locus_names = []
    filenames = []
    loci_list = []
    for locus_name in charsets:
        start = charsets[locus_name][0]
        end = charsets[locus_name][1]
        outname = "%s/%s.fasta" % (output_dir, locus_name)
        AlignIO.write(alignment[:, start:end], outname, 'fasta')
        # Keep the name next to its file instead of re-deriving it from the
        # path later: splitting the basename on '.' truncates any locus name
        # that itself contains a dot, so it would no longer match its Locus.
        locus_names.append(locus_name)
        filenames.append(outname)
        loci_list.append(Locus(char_type, feature_type, locus_name, [locus_name]))

    if not project:
        return filenames, loci_list

    from reprophylo import Project
    pj = Project(loci_list, pickle=pickle, git=git)
    total = len(filenames)
    for count, (locus_name, fasta_path) in enumerate(zip(locus_names, filenames), 1):
        # Single-argument parenthesised print behaves the same on Python 2 and 3.
        print('%i/%i reading %s' % (count, total, locus_name))
        pj.read_alignment(fasta_path, char_type, feature_type, locus_name)
    return pj

In [3]:
# Directory that receives one FASTA alignment per locus; created on first run.
fasta_dir = '../data/Philippe2009/fastaalns'
if not os.path.exists(fasta_dir):
    os.mkdir(fasta_dir)

In [None]:
# Read the per-gene column ranges from the nexus file, then cut the
# concatenated alignment into per-locus FASTA files and load them into a
# project (project=True makes pj_from_nexus return the Project itself).
charsets = parse_philippe_mrbayes_partitions(data_path)
pj = pj_from_nexus(
    data_path,
    charsets,
    output_dir=fasta_dir,
    char_type='prot',
    feature_type='PROT',
    project=True,
    pickle='Philippe2009',
    git=True,
)

DEBUG:Cloud:Log file (/home/amir/.picloud/cloud.log) opened


1/150 reading tribe1378
dropping seq tribe1378 in locus Saccharomyces_cerevisiae: missing data
dropping seq tribe1378 in locus Euperipatoides_kanangrensis: missing data
dropping seq tribe1378 in locus Spinochordodes_tellinii: missing data
dropping seq tribe1378 in locus Boophilus_microplus: missing data
dropping seq tribe1378 in locus Acanthoscurria_gomesiana: missing data
dropping seq tribe1378 in locus Anoplodactylus_eroticus: missing data
dropping seq tribe1378 in locus Carcinoscorpius_rotundicauda: missing data
dropping seq tribe1378 in locus Carcinus_maenas: missing data
dropping seq tribe1378 in locus Biomphalaria_glabrata: missing data
dropping seq tribe1378 in locus Mytilus_galloprovincialis: missing data
dropping seq tribe1378 in locus Argopecten_irradians: missing data
dropping seq tribe1378 in locus Euprymna_scolopes: missing data
dropping seq tribe1378 in locus Chaetopleura_apiculata: missing data
dropping seq tribe1378 in locus Chaetoderma_nitidulum: missing data
dropping 

In [None]:
# Sanity check: show the first record of the project in GenBank format.
# Parenthesised single-argument print runs identically on Python 2 and 3.
print(pj.records[0].format('genbank'))

In [None]:
# Mock trimming: the only trimAl command switched on is 'noallgaps'
# (presumably this just drops all-gap columns; confirm against trimAl docs).
trimal = TrimalConf(
    pj,
    method_name='noallgaps',
    trimal_commands={'noallgaps': True},
)

In [None]:
# Run the `trimal` configuration on the project; trim() takes a list of
# configuration objects, here just the one.
pj.trim([trimal])

In [None]:
# Collect per-locus statistics for the project and sort them
# (presumably by entropy, going by the variable and file names; confirm
# against the LociStats documentation).
loci_entropy = LociStats(pj)
loci_entropy.sort()
# Render matplotlib figures inside the notebook.
%matplotlib inline
# NOTE(review): the target path is relative and nothing in this notebook
# creates a 'figures' directory; confirm it exists before running.
loci_entropy.plot('figures/sort_by_entropy.png', figsize=[50,40], ytick_fsize=8)

In [20]:
# Write the trimmed alignments of the project; the recorded cell output
# lists names of the form '<locus>@ReadDirectly@noallgaps_trimmed_aln.fasta'.
pj.write_trimmed_alns()

['rpl36@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe1118@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe380_mt@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'rpl14a@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe320@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe896@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe1245@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'rps20@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe1190@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe782@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe942@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe905@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe739@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe742@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'rpl18@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe613@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'tribe1121@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'rps8@ReadDirectly@noallgaps_trimmed_aln.fasta',
 'rps16@ReadDirectly@noallgaps_trimm