In [1]:
import itertools
import sys, os

import numpy as np
import pandas as pd
from scipy.special import comb
from scipy import stats
import scipy.cluster.hierarchy as hac
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
sns.set(rc={'figure.figsize':(15,8)})
sns.set_context('poster')

In [2]:
%load_ext autoreload
%autoreload 2
#import CCPA_lib as cp


In [3]:
sns.set_context('poster')

### Catalytic Promiscuity in the Biosynthesis of Cyclic Peptide Secondary Metabolites in Planktonic Marine Cyanobacteria, PNAS 2010
Detection of procM and procA gene sequences in marine cyanobacterial genomes.
1. **procM** gene sequences were detected in marine cyanobacterial genomes using the **pre-computed phylogenetic tree viewer in the microbesonline database** (http://www.microbesonline.org (3), which is computed using FastBLAST (4), see http://www.microbesonline.org/treebrowseHelp.html for additional details). 
2. **procA** sequences were detected in the genomes of Prochlorococcus **MIT9313** by **manually searching for short ORFs close to the procM gene** that encoded a **peptide** with a **leader sequence** followed by a **double-glycine** protease recognition site and a **short** core peptide rich in **Cys, Ser and Thr** residues (Fig. 1C). 
3. Once several such putative lantipeptides were discovered, we used **BLASTp** in both **microbesonline** and the **NCBI nr** database (with a cutoff **e-value of 10**) to detect other, similar genes in cyanobacterial genomes
4. We **manually** checked the resulting peptide sequences to verify that they agreed with the definition of procAs, as described above.
5. Figure 1 and Figure S1 were produced using IMG (http://img.jgi.doe.gov/cgi-bin/pub/main.cgi).

### Evolutionary radiation of lanthipeptides in marine cyanobacteria, PNAS Plus 2017
1. Annotation of **prochlorosin** biosynthesis genes was performed manually using **BLAST** (12). 
2. The sequences of the **ProcM, LanT and LanOM** proteins from Prochlorococcus **MIT9313** were used as queries for the search of homologous proteins. 
3. Top hits were further inspected against the **NCBI Conserved Domain Database (CDD)** (13) to verify the presence of signature domains.
4. The **tBLASTn** algorithm was used to search for **procA and procA pseudogenes** using a **consensus sequence** derived from **29 prochlorosin leader peptide** sequences from Prochlorococcus **MIT9313**.
5. BLAST hits with **>45% identity** and **E-val <0.001** were inspected against the CDD to confirm the presence of the **nif11-like leader peptide domain**.


![](http://oregonstate.edu/instruct/bb450/fall14/stryer7/2/table_02_02.jpg)

https://www.uniprot.org/uniprot/Q7V735
    http://tigrfams.jcvi.org/cgi-bin/HmmReportPage.cgi?acc=TIGR03798
        https://www.ebi.ac.uk/training/online/course/interpro-functional-and-structural-analysis-protei/sequence-searching/searching-interpro-batc
            http://www.ebi.ac.uk/interpro/sequencesearch/iprscan5-S20190707-131508-0462-76111813-p1m
            https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/iprscan5-S20190707-131508-0462-76111813-p1m/json

In [10]:
# Directory holding one tab-separated annotation table (*.txt) per genome.
# Built with os.path.join so the path also resolves on non-Windows systems
# (a literal backslash is not a path separator there).
genomes_dpath = os.path.join('data', 'detailed_Prochlorococcus_genome_annotations')

# os.listdir() returns names in arbitrary order; sorting keeps the row order of
# the concatenated table reproducible across machines and runs.
genome_fnames = sorted(n for n in os.listdir(genomes_dpath) if n.endswith('.txt'))


def _load(fname):
    """Read one annotation table and tag every row with its genome name.

    Parameters
    ----------
    fname : str
        Name of a tab-separated file located inside ``genomes_dpath``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with an extra ``genome`` column holding the file
        name without its extension.
    """
    df = pd.read_csv(os.path.join(genomes_dpath, fname), sep='\t')
    df['genome'] = os.path.splitext(os.path.basename(fname))[0]
    return df


# Each file is read with its own 0..n-1 row index; ignore_index=True gives the
# combined table a single unique index instead of repeating labels per genome.
genome_df = pd.concat([_load(fname) for fname in genome_fnames], ignore_index=True)


In [15]:
genome_df.head()

Unnamed: 0,contig_id,gene_id,feature_id,type,location,start,stop,strand,function,aliases,figfam,evidence_codes,nucleotide_sequence,aa_sequence,genome,aa_length,gg_index
0,AS9601,PAS9601_0001,fig|1218.83.peg.1,peg,AS9601_168_1325,168,1325,+,DNA polymerase III beta subunit (EC 2.7.7.7),,FIG00066425,isu;DNA-replication isu;DNA_replication_cluster_1,atggaaattatttgtaatcaaaatgaattaaataatgctatacaac...,MEIICNQNELNNAIQLVSKAVASRPTHPILANILLTADEGTNKISV...,AS9601,385.0,-1.0
1,AS9601,PAS9601_0002,fig|1218.83.peg.2,peg,AS9601_1327_2034,1327,2034,+,RNA metabolism-related protein,,,,ttgaaattacctaaagaaattttattaagtgaattattaaattata...,MKLPKEILLSELLNYIVKGNMVLNYGNGENVWMHPPVHRILGWYSR...,AS9601,235.0,-1.0
2,AS9601,PAS9601_0003,fig|1218.83.peg.3,peg,AS9601_2038_4377,2038,4377,+,"Phosphoribosylformylglycinamidine synthase, sy...",,FIG01303876,icw(1);De_Novo_Purine_Biosynthesis,atgataaatcatgaaaataatgatctatttgatcttaatgaagcat...,MINHENNDLFDLNEALKVENLTLNDYEEICKRLKRKPNRTELGMFG...,AS9601,779.0,114.0
3,AS9601,PAS9601_0004,fig|1218.83.peg.4,peg,AS9601_4425_5885,4425,5885,+,Amidophosphoribosyltransferase (EC 2.4.2.14),,FIG00000179,isu;YgfZ isu;De_Novo_Purine_Biosynthesis,atgtgcggaatagttggaatcgtttcttcgaatgatgtaaatcaac...,MCGIVGIVSSNDVNQQIYDSLLLLQHRGQDSTGIATMENTVFHIHK...,AS9601,486.0,-1.0
4,AS9601,PAS9601_0005,fig|1218.83.peg.5,peg,AS9601_8323_5882,8323,5882,-,DNA gyrase subunit A (EC 5.99.1.3),,FIG00000080,idu(1);DNA_gyrase_subunits idu(1);DNA_topoisom...,atggataagaaaaatttcacttccatatcacttcaagaagaaatgc...,MDKKNFTSISLQEEMQRSYLEYAMSVIVGRALPDARDGLKPVQRRI...,AS9601,813.0,218.0


In [12]:
# Number of characters in each amino-acid sequence; rows whose aa_sequence is
# missing get NaN from the .str accessor.
genome_df['aa_length'] = genome_df['aa_sequence'].str.len()

In [14]:
# Position of the first 'GG' / 'GA' occurrence in each amino-acid sequence.
# str.find reports -1 when the motif does not occur in the sequence.
# NOTE(review): 'GA' is presumably the Gly-Ala variant of the double-glycine
# cleavage motif described in the notes above -- confirm.
genome_df['gg_index'] = genome_df['aa_sequence'].str.find('GG')
genome_df['ga_index'] = genome_df['aa_sequence'].str.find('GA')
def min_index(gg_index, ga_index):
    """Return the earliest motif position, treating -1 as "motif not found".

    Accepts plain scalars as well as pandas Series / NumPy arrays, in which
    case the rule is applied element-wise.

    Parameters
    ----------
    gg_index, ga_index : scalar, array-like or pandas.Series
        Position(s) in the convention of ``str.find``: a non-negative index,
        or -1 when the motif is absent. NaN entries propagate to the result.

    Returns
    -------
    scalar, numpy.ndarray or pandas.Series
        For each pair: the other value when one of them is -1, otherwise the
        smaller of the two (so -1 only when both are -1). Scalar inputs give a
        scalar; if either input is a Series the result is a Series carrying
        that input's index; other array-likes give an ndarray.
    """
    # Scalar path: identical to the original element-wise rule.
    if np.ndim(gg_index) == 0 and np.ndim(ga_index) == 0:
        if gg_index == -1:
            return ga_index
        if ga_index == -1:
            return gg_index
        return min(ga_index, gg_index)

    # Vector path: a bare `if series == -1` would raise "truth value of a
    # Series is ambiguous", so select per element with np.where instead.
    gg = np.asarray(gg_index)
    ga = np.asarray(ga_index)
    earliest = np.where(gg == -1, ga, np.where(ga == -1, gg, np.minimum(ga, gg)))

    # Hand back a Series when one was given so the caller keeps its index.
    for arg in (gg_index, ga_index):
        if isinstance(arg, pd.Series):
            return pd.Series(earliest, index=arg.index)
    return earliest
# Evaluate min_index one row at a time so it always receives plain scalar
# positions, and pass the columns in the (gg_index, ga_index) order of its
# signature. The list is assigned positionally, one value per row.
genome_df['leader_end_index'] = [
    min_index(gg, ga)
    for gg, ga in zip(genome_df['gg_index'], genome_df['ga_index'])
]


In [None]:
# Part of each sequence that follows the first 'GG' motif. Splitting once on
# 'GG' and taking element 1 yields NaN for rows with no 'GG' (or no sequence).
# NOTE(review): the original assignment had no right-hand side (a syntax
# error); "everything after the first GG" is the assumed intent -- confirm.
genome_df['aa_sequence_post_gg'] = (
    genome_df['aa_sequence'].str.split('GG', n=1).str.get(1)
)

# Number of Ser (S), Cys (C) and Thr (T) residues in that post-GG segment.
# NOTE(review): this column previously stored the 'GG' position, duplicating
# gg_index; a residue count is assumed here -- confirm whether a count or a
# fraction of the segment length is wanted.
genome_df['SCT'] = genome_df['aa_sequence_post_gg'].str.count('[SCT]')