In [1]:
import pickle
import pandas as pd
import cobra
from collections import defaultdict, OrderedDict, Counter
from cobra import Reaction, Metabolite, Model, Gene
from glob import glob

from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.patches as mpatches

from Bio import SeqIO, Seq
import re
import os
import scipy
import urllib
import scipy.stats as stats
from statsmodels.stats.multitest import *

import sys
sys.path.append("/home/yara/Documents/PseudoFind")
from PseudoFind.pangenome_cmds import *
from PseudoFind.functional_annotations import *

%matplotlib inline

#1 Known_PACFgenes

In [2]:
INDEL_XLSX = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/indel_list.xlsx'


def load_genic_mutations(xlsx_path):
    """Load a mutation spreadsheet and keep only rows assigned to a gene.

    The first five spreadsheet rows are skipped, empty cells become '' and
    the 'Unnamed: 0' column is removed. A row is discarded when its
    'Type of mutation' is an intergenic insertion/deletion, when its
    'Gene locus tag' cell repeats the column header, or when that cell is
    blank.

    Parameters
    ----------
    xlsx_path : str
        Path of the Excel workbook to read.

    Returns
    -------
    pandas.DataFrame
        The filtered table; surviving rows keep their original labels.
    """
    table = pd.read_excel(xlsx_path, skiprows = 5).fillna('')
    table = table.drop(columns = ['Unnamed: 0'])
    intergenic = table['Type of mutation'].isin(['Intergenic Deletion', 'Intergenic Insertion'])
    no_locus_tag = table['Gene locus tag'].isin(['Gene locus tag', ''])
    return table.loc[~(intergenic | no_locus_tag)].copy()


# NOTE(review): both tables are read from indel_list.xlsx. If the SNP calls
# live in a separate workbook, point SNPs_FT at that file instead.
SNPs_FT = load_genic_mutations(INDEL_XLSX)

# Loaded through the same helper so that blank locus tags are stored as ''
# (not NaN) and are therefore removed by the blank-tag filter.
indel_FT = load_genic_mutations(INDEL_XLSX)

In [3]:
# All mutated genes (to get a more comprehensive list: are genes affected by
# convergent evolution the only genes of interest?)
s1 = set(SNPs_FT.fillna('')['Gene locus tag'])
s1.update(set(indel_FT.fillna('')['Gene locus tag']))

# One cell can name several genes separated by '//'; anything after the first
# '.' in a tag (e.g. 'PA0195.1') is dropped so only the base locus tag is kept.
locus_tags = {x.split('.')[0] for y in s1 for x in y.split('//') if x != ''}

# Use a context manager so the pickle file is flushed and closed.
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/all_mutated_genes.p', 'wb') as handle:
    pickle.dump(locus_tags, handle)

# NOTE(review): later cells read all_mutated_genes.fa, but no cell visible in
# this notebook writes that FASTA file - confirm where it is produced.

In [4]:
# pickle.dump(locus_tags, open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/list52.p', 'wb') )

In [5]:
# Further candidates from other publications (not used below):
# ['algD operon', 'mexX', 'mexY', 'vfr', 'exsA', 'rhlR', 'anr', 'phoP', 'cyaB', 'ampD', 'fleQ', 'rpoN', 'wspF']

# 52 unique gene names / locus tags (repeated entries removed; every use below
# goes through set(list52), so the duplicates had no effect).
list52 = ['mexZ', 'mucA', 'vgrG', 'algU', 'PA2099', 'wbpM', 'bifA', 'morA', 'mexB', 'dnaX', 'pcoA', 'PA3290', 'betT', 'gyrB', 'mexR', 'mexA',  'rbdA', 'oprD', 'PA2455', 'nfxB', 'PA0977', 'lasR', 'mexS', 'pelA', 'gyrA',
'wspE', 'wspA', 'mpl', 'phzB1', 'PA4311', 'retS', 'aceE', 'aceF', 'pilQ', 'htrB', 'ykoM', 'pilD', 'pdxY', 'yecS', 'PA1471', 'PA1677', 'pvdS', 'PA2490', 'PA2602', 'PA3222', 'nalD', 'PA3939',
'PA4642', 'PA4963', 'phaF', 'PA5177', 'cmpR']


def _gene_name_to_locus(table):
    """Return {gene name: locus tag} for the list52 names present in `table`.

    `table` must have 'Gene name' and 'Gene locus tag' columns.
    """
    # .loc needs a list here: recent pandas versions reject a set as indexer.
    # Sorting also makes the row order (and the FASTA order below) reproducible.
    present = sorted(set(list52) & set(table['Gene name']))
    return dict(table.set_index('Gene name').loc[present]['Gene locus tag'].drop_duplicates())


locus_tags = _gene_name_to_locus(SNPs_FT)
locus_tags.update(_gene_name_to_locus(indel_FT))
# Manually assigned locus tags; these override whatever the tables provided.
locus_tags.update({'gyrB':'PA0004', 'betT':'PA5375', 'yecS':'PA0313', 'vgrG':'PA0091', 'mexS':'PA2491', 'mexZ':'PA2020', 'htrB':'PA0011', 'rbdA':'PA0861'})
# Invert to {locus tag: gene name}.
locus_tags = {y:x for x,y in locus_tags.items()}
# list52 entries that are themselves locus tags get an empty gene name.
locus_tags.update({x:'' for x in set(list52) &  set(SNPs_FT['Gene locus tag'])})

print("Couldn't map cmpR and ykoM to a locus tag")

# Write the protein sequence of every mapped locus to a FASTA file.
FT = get_FT_from_gb('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/PAO1_1.gb').set_index('locus_tag')
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/list52genes.fa', 'w') as fasta:
    for locus in locus_tags:
        fasta.write('>'+locus+'\n'+FT.loc[locus,'translation']+'\n')

Couldn't map cmpR and ykoM to a locus tag


#2 cluster pangenome into gene families using CD-HIT

In [6]:
# FT_LRI = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/FT_LRI_f.csv', dtype = {'Genome ID':str}, index_col = ['Unnamed: 0'])
# fasta_files = [x for x in glob('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/*/*.faa') if x.split('/')[-1].split('.faa')[0] in list(FT_LRI['Genome ID'])]
# pangenome_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/pseudomonas_pangenome.fa'
# threshold = 0.8
# run_cdhit(fasta_files, pangenome_directory, threshold)

KeyboardInterrupt: 

In [None]:
# # add metadata to each gene family

# cdhit_clusters = get_cdhit_clusters('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/pseudomonas_pangenome_cdhit.clstr')
# cdhit_clusters_rev = {y:x for x,z in cdhit_clusters.items() for y in z}

# gb_files = [x for x in glob('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/*/*.gb*') if x.split('/')[-1].split('.gb')[0] in list(FT_LRI['Genome ID'])]

# cdhit_metadata = {}
# for gb_file in gb_files:
    
#     FT = get_FT_from_gb(gb_file).set_index('locus_tag')

#     for index in FT.index:
#         cdhit_id = cdhit_clusters_rev[index]
#         if cdhit_id not in cdhit_metadata.keys():
#             cdhit_metadata[cdhit_id] = defaultdict(list)

#         for column in ['EC_number', 'gene', 'product', 'db_xref']:
#             cdhit_metadata[cdhit_id][column].append(FT.loc[index, column])

In [None]:
# cdhit_metadatar = {}
# for cdhit_id, metadata in cdhit_metadata.items():
#     product = list(OrderedDict(sorted(Counter(metadata['product']).items(), key = lambda a: a[1], reverse = True )).keys())[0]
#     if product == 'hypothetical protein' and len(set(metadata['product']) - {'hypothetical protein'}) != 0:
#         counter = Counter(metadata['product'])
#         del counter['hypothetical protein']
#         product = list(OrderedDict(sorted(counter.items(), key = lambda a: a[1], reverse = True )).keys())[0]
        
#     cdhit_metadatar[cdhit_id] = {
#         'EC_number':list(OrderedDict(sorted(Counter(metadata['EC_number']).items(), key = lambda a: a[1], reverse = True )).keys())[0],
#          'gene':list(OrderedDict(sorted(Counter(metadata['gene']).items(), key = lambda a: a[1], reverse = True )).keys())[0],
#          'product':product,
#          'db_xref':list(OrderedDict(sorted(Counter(metadata['db_xref']).items(), key = lambda a: a[1], reverse = True )).keys())[0]
#         }
    
# pickle.dump(cdhit_metadatar, open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/cdhit_metadatar.p', 'wb'))

#3 Functional annotation of pangenome and allele matrix (combination of CD-HIT, PROKKA annotations and EGGNOG)

In [9]:
# # map the 52 genes to the pan genome matrix
# cdhit_reps = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/pseudomonas_pangenome_cdhit'

# blast_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results'
# list52 = ['/home/yara/Documents/cystic_fibrosis/data/pseudomonas/list52genes.fa']
# run_bidirectional_blast(cdhit_reps,list52,blast_directory)

# all_mutated = ['/home/yara/Documents/cystic_fibrosis/data/pseudomonas/all_mutated_genes.fa']
# run_bidirectional_blast(cdhit_reps,all_mutated,blast_directory)

Blasting: list52genes...
Blasting: all_mutated_genes...


defaultdict(dict, {})

In [10]:
# with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/all_mutated_genes.fa', 'r') as f:
#     string = f.read()
# gene2l = {x.split('\n')[0].replace('>',''):len(''.join(x.split('\n')[1:])) for x in string.split('\n>')}

# with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/list52genes.fa', 'r') as f:
#     string = f.read()
# gene2l.update({x.split('\n')[0].replace('>',''):len(''.join(x.split('\n')[1:])) for x in string.split('\n>')})

# blast_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results'
# PID_threshold = 80
# blast_hits = {}

# BR2_out = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results/BR2/all_mutated_genes'
# gid = BR2_out.split('/')[-1]
# BR2 = pd.read_table(BR2_out, names= ['gene', 'subject','PID', 'alnLength', 'mismatchCount', 'gapOpenCount', 'queryStart', 'queryEnd', 'subjectStart', 'subjectEnd', 'eVal', 'bitScore'])
# BR2['gene_length'] = [gene2l[BR2.loc[index, 'gene']] for index in BR2.index]
# BR2['aln_perc'] = BR2['alnLength']/BR2['gene_length']
# BR2 = BR2.set_index('gene')
# BR2 = BR2.loc[(BR2['PID'] > PID_threshold) & (BR2['eVal'] < 0.000001) & (BR2['aln_perc'] >= 0.4)] # you can change your threshold here
# blast_hits = {gene: set(BR2.loc[[gene]]['subject']) for gene in BR2.index}
    
# pickle.dump(blast_hits, open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results/blast_hits.p', 'wb'))

#4 Map the allele matrix to the pan genome matrix

In [11]:
# pangenome_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/pseudomonas_pangenome.fa'
# cdhit_directory = pangenome_directory.split('.f')[0]+'_cdhit.clstr'
# cdhit_clusters = get_cdhit_clusters(cdhit_directory)
# cdhit_clusters_rev = {y:x for x,z in cdhit_clusters.items() for y in z}

# output_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/allele_matrix/'
# locus2allele = pickle.load(open('%s/locus2allele.p'%output_directory, 'rb'))
# allele2cdhit = {allele_id:cdhit_clusters_rev[gene_id] for gene_id, allele_id in locus2allele.items()}
# pickle.dump(allele2cdhit, open('%s/allele2cdhit.p'%output_directory, 'wb'))

#5 Deep functional annotation of selected features - restart from here

In [6]:
# Load the selected features; the unnamed first CSV column holds the integer
# allele number, which is turned into an 'Allele_<n>' identifier.
selected_features_final = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/selected_features_final_trial_2.csv').rename(columns = {'Unnamed: 0':'Allele ID'})
selected_features_final['Allele ID'] = ['Allele_%d'%i for i in selected_features_final['Allele ID']]
output_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/allele_matrix/'

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# Context managers make sure each file handle is closed after reading.
with open('%s/allele2cdhit.p'%output_directory, 'rb') as handle:
    allele2cdhit = pickle.load(handle)

with open('%s/all_alleles_seqs.p'%output_directory, 'rb') as handle:
    all_alleles = pickle.load(handle)
# Invert the pickled mapping so it can be looked up by allele ID.
all_alleles_rev = {y:x for x,y in all_alleles.items()}

# Plain dict lookups on purpose: an allele missing from either mapping should
# fail loudly with a KeyError rather than yield an empty annotation.
selected_features_final['CD-hit ID'] = [allele2cdhit[c] for c in selected_features_final['Allele ID']]
selected_features_final['Allele sequence'] = [all_alleles_rev[x] for x in selected_features_final['Allele ID']]

# Write one FASTA record per selected allele.
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/selected_features_final.fa', 'w') as fasta:
    for allele_id, sequence in zip(selected_features_final['Allele ID'], selected_features_final['Allele sequence']):
        fasta.write('>' + allele_id + '\n' + sequence + '\n')

# Attach the per-cluster annotation columns (one row per CD-HIT cluster).
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/cdhit_metadatar.p', 'rb') as handle:
    cdhit_metadatar = pickle.load(handle)
cdhit_metadatar = pd.DataFrame(cdhit_metadatar).T
selected_features_final = selected_features_final.merge(cdhit_metadatar, left_on = 'CD-hit ID', right_index = True, how = 'left')
selected_features_final.head(2)

Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product
0,Allele_52237,-546.984005,Cluster 3757,MRHLLLRHEALDAEGFAAQLAGTPGEVAQAILGAAREGLTEAQALL...,,,esiB,Secretory immunoglobulin A-binding protein EsiB
1,Allele_242087,513.546898,Cluster 1528,MALFDEQVHDSRAWFMNTSAIGPREPFTDYFRYRLVHFDNESNKRL...,,,,hypothetical protein


In [7]:
# Display the annotated feature table built in the previous cell.
selected_features_final

Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product
0,Allele_52237,-546.984005,Cluster 3757,MRHLLLRHEALDAEGFAAQLAGTPGEVAQAILGAAREGLTEAQALL...,,,esiB,Secretory immunoglobulin A-binding protein EsiB
1,Allele_242087,513.546898,Cluster 1528,MALFDEQVHDSRAWFMNTSAIGPREPFTDYFRYRLVHFDNESNKRL...,,,,hypothetical protein
2,Allele_273742,-428.705236,Cluster 15711,MNETVSLETREVRFLSVFDFDGTLTYRDSFVPFLRFAFGNRVFVRR...,3.1.3.3,,serB,Phosphoserine phosphatase
3,Allele_276630,-365.654298,Cluster 10034,MHSLRFFSNAEVAERLSYPQLIEALRIGLAKPCSAPLRSCHALPAQ...,1.5.1.49,COG:COG2423,,Delta(1)-pyrroline-2-carboxylate reductase
4,Allele_55447,-347.360962,Cluster 9391,MSHALRAVFLDHASLDLGDLDMRPLRAAFDELQLHTASQPQEVAAR...,1.1.1.29,COG:COG1052,hprA,Glycerate dehydrogenase
5,Allele_293487,-307.213499,Cluster 11115,MRALAEFTMRGRMQATLAVAGSAVVPLLFWLSAAAGSLVLLRRGFG...,,,,hypothetical protein
6,Allele_253486,-290.144916,Cluster 19471,MEVVALALALAAYLGLAAACLWLRRELRRLQAALAQQAEEGAARER...,,,cpoB_2,Cell division coordinator CpoB
7,Allele_266708,-262.603769,Cluster 680,MTTQQLLTPHQSQYFAWLLTRRAAGDSVESLASTLVDSQVDLNPHQ...,3.6.4.-,,rapA_3,RNA polymerase-associated protein RapA
8,Allele_240337,-262.421041,Cluster 2695,MNALLALPDLQPERLFVQSGDVRLAVHCWGAPDNDKPTLLMVHGYP...,1.-.-.-,COG:COG1028,ephD,putative oxidoreductase EphD
9,Allele_248932,257.877684,Cluster 1241,MREKQESGSVPVPAEFMSAQSAIVGLRGKDLLTTVRSLAVHGLRQP...,2.3.1.-,COG:COG3243,phaC_2,Poly(3-hydroxyalkanoate) polymerase subunit PhaC


#5a VFDB

In [8]:
# Download/extract the VFDB entries for the species and BLAST the selected
# alleles against them in both directions.
directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation'
output_fa = '%s/VFDB_pseudomonas.fa'%directory
output_ffn = '%s/VFDB_pseudomonas.ffn'%directory
output_csv = '%s/VFDB_pseudomonas.csv'%directory
species_name = 'pseudomonas'
get_VFDB_species(species_name, output_fa = output_fa, output_ffn = output_ffn, output_csv = output_csv)


selected_features_fa = ['/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/selected_features_final.fa']
blast_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results'
run_bidirectional_blast(output_fa, selected_features_fa, blast_directory)

PID_threshold = 80
output_file_name = 'SF_VFDB'
read_blast_results(blast_directory, PID_threshold, output_file_name)

# Context manager so the pickle file handle is closed after loading.
with open('%s/%s.p'%(blast_directory, output_file_name), 'rb') as handle:
    bbh = pickle.load(handle)

# Alleles without a hit get an empty string.
vfdb_hits = bbh['selected_features_final']
selected_features_final['VFDB hit'] = [vfdb_hits.get(x, '') for x in selected_features_final['Allele ID']]

VFDB_met = pd.read_csv(output_csv)
VFDB_met = VFDB_met.rename(columns = {col:'%s(VFDB)'%col for col in VFDB_met.columns})

# If this cell is re-run, drop the VFDB columns added by the previous run;
# otherwise the merge would create '_x'/'_y' duplicates of every column.
stale_vfdb_columns = [col for col in selected_features_final.columns if col.endswith('(VFDB)')]
selected_features_final = selected_features_final.drop(columns = stale_vfdb_columns)

selected_features_final = selected_features_final.merge(VFDB_met, left_on = 'VFDB hit', right_on = 'gene_id(VFDB)', how = 'left')
selected_features_final.head(2)



A total of 2917 genes were extracted for pseudomonas
A total of 9 entries were excluded
Blasting: selected_features_final...


Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product,VFDB hit,VF_id(VFDB),description(VFDB),gene_id(VFDB),gene_name(VFDB),nucleotide_sequence(VFDB),organism(VFDB)
0,Allele_52237,-546.984005,Cluster 3757,MRHLLLRHEALDAEGFAAQLAGTPGEVAQAILGAAREGLTEAQALL...,,,esiB,Secretory immunoglobulin A-binding protein EsiB,,,,,,,
1,Allele_242087,513.546898,Cluster 1528,MALFDEQVHDSRAWFMNTSAIGPREPFTDYFRYRLVHFDNESNKRL...,,,,hypothetical protein,,,,,,,


In [19]:
# Blank out missing values, then count the rows that carry a VFDB gene id.
selected_features_final = selected_features_final.fillna('')
has_vfdb_gene = selected_features_final['gene_id(VFDB)'] != ''
int(has_vfdb_gene.sum())

2

#5b EGGNOG

In [9]:
# cols = ['query_name', 'seed eggNOG ortholog', 'seed ortholog evalue', 'seed ortholog score', 'Predicted taxonomic group', 'Predicted protein name',
# 'Gene Ontology terms' ,'EC number', 'KEGG_ko', 'KEGG_Pathway', 'KEGG_Module', 'KEGG_Reaction', 'KEGG_rclass','BRITE', 'KEGG_TC', 'CAZy' , 'BiGG Reaction', 
# 'tax_scope: eggNOG taxonomic level used for annotation', 'eggNOG OGs', 'bestOG (deprecated, use smallest from eggnog OGs)', 'COG Functional Category',
# 'eggNOG free text description']

# eggnog = pd.read_csv('%s/selected_features_final.fa.emapper.annotations'%directory, skiprows = 4, sep = '\t', names = cols).fillna('')

In [10]:
# eggnog_file = '%s/selected_features_final.fa.emapper_o2.annotations'%directory
# eggnog = get_eggnog_n(eggnog_file)
# selected_features_final = selected_features_final.merge(eggnog, left_on = 'Allele ID', right_on = 'query_name', how = 'left')
# selected_features_final.head(2)

Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product,VFDB hit,VF_id(VFDB),...,KEGG_TC,CAZy,BiGG Reaction,tax_scope: eggNOG taxonomic level used for annotation,eggNOG OGs,"bestOG (deprecated, use smallest from eggnog OGs)",COG Functional Category,eggNOG free text description,Cog cat description,COG (grouped)
0,Allele_242087,109.167437,Cluster 1528,MALFDEQVHDSRAWFMNTSAIGPREPFTDYFRYRLVHFDNESNKRL...,,,,hypothetical protein,,,...,,,,,,,,,,
1,Allele_52237,-108.561045,Cluster 3757,MRHLLLRHEALDAEGFAAQLAGTPGEVAQAILGAAREGLTEAQALL...,,,esiB,Secretory immunoglobulin A-binding protein EsiB,,,...,,,,,,,,,,


In [37]:
# Show the rows whose 'COG (grouped)' value is not empty.
# NOTE(review): 'COG (grouped)' is presumably added by the eggNOG merge, which is
# commented out in the cell above - on a fresh kernel this line would raise KeyError.
selected_features_final.loc[selected_features_final['COG (grouped)'] != '']

Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product,VFDB hit,VF_id(VFDB),...,Cog cat description,COG (grouped),Reaction(s),Subsystem(s),locus_tag,PA14 locus (bartell),Product (bartell),Category (bartell),Gene (bartell),allele_nucleotide_sequence


#5c metabolism (bartell)

In [9]:
# `directory` is the functional_annotation folder set in the VFDB cell.
PA14_model = cobra.io.load_json_model('%s/iPAU1129.json'%directory)

# Feature table of PA14 keyed by old locus tag; features without one are removed.
FT = get_FT_from_gb('%s/PA14.gb'%directory).fillna('').set_index('old_locus_tag')
# Boolean filter instead of FT.drop(['']): it does not raise when no feature
# has a blank old locus tag.
FT = FT.loc[FT.index != '']

# Write every feature that has a protein translation to a FASTA file.
# Iterating the column directly also copes with repeated locus tags.
with open('%s/PA14.fa'%directory, 'w') as fasta:
    for old_locus_tag, translation in FT['translation'].items():
        if translation != '':
            fasta.write('>'+old_locus_tag+'\n'+translation + '\n')

selected_features_fa = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/selected_features_final.fa'
blast_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results'
output_fa = ['%s/PA14.fa'%directory]
run_bidirectional_blast(selected_features_fa, output_fa, blast_directory)

PID_threshold = 80
output_file_name = 'SF_PA14'
read_blast_results(blast_directory, PID_threshold, output_file_name)

# Context manager so the pickle file handle is closed after loading.
with open('%s/%s.p'%(blast_directory, output_file_name), 'rb') as handle:
    bbh = pickle.load(handle)

Academic license - for non-commercial use only
Blasting: PA14...


In [10]:
# For every model gene that has a BLAST hit, record the subsystems and
# reaction names it takes part in, together with the matching allele.
pa14_to_allele = bbh['PA14']
bartell_columns = ['locus_tag', 'Subsystem(s)', 'Reaction(s)', 'Allele ID']
gene_bartell_metadata = pd.DataFrame(
    [
        {
            'locus_tag': gene.id,
            # sorted(): joining a bare set gives a different order from one
            # kernel session to the next, which makes the text irreproducible.
            'Subsystem(s)': ' AND '.join(sorted({rx.subsystem for rx in gene.reactions})),
            'Reaction(s)': ' AND '.join(sorted({rx.name for rx in gene.reactions})),
            'Allele ID': pa14_to_allele[gene.id],
        }
        for gene in PA14_model.genes if gene.id in pa14_to_allele
    ],
    # Explicit columns keep the merge key present even when nothing matched.
    columns = bartell_columns,
)
selected_features_final = selected_features_final.merge(gene_bartell_metadata, on = 'Allele ID', how = 'left')

In [11]:
# Virulence-linked genes sheet: keep four columns, tag them with ' (bartell)',
# attach the matching allele (if any) and merge onto the feature table.
bartell_sheet = pd.read_excel('%s/bartell_virulence_genes.xlsx'%directory, sheet_name = 'Virulence-linked genes')
VF_bartell = bartell_sheet[['PA14 locus', 'Product', 'Category', 'Gene']]
tagged_names = {col:'%s (bartell)'%col for col in VF_bartell.columns}
VF_bartell = VF_bartell.rename(columns = tagged_names)

# Loci without a BLAST hit receive '' and are filtered out below.
pa14_to_allele = bbh['PA14']
VF_bartell['Allele ID'] = [pa14_to_allele.get(locus, '') for locus in VF_bartell['PA14 locus (bartell)']]
VF_bartell = VF_bartell.fillna('')
with_allele = VF_bartell['Allele ID'] != ''
VF_bartell = VF_bartell.loc[with_allele]

merged_features = selected_features_final.merge(VF_bartell, on = 'Allele ID', how = 'left')
selected_features_final = merged_features.fillna('')

# add nucleotide sequences

In [13]:
# NOTE: `directory` is rebound here to the species data folder; earlier cells
# used the same name for the functional_annotation sub-folder.
directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/'
with open('%s/allele_matrix/locus2allele.p'%directory, 'rb') as handle:
    locus2allele = pickle.load(handle)
# One locus tag per allele; when several loci share an allele the last one wins.
locus2allele_rev = {y:x for x,y in locus2allele.items()}


def _read_ffn(ffn_path):
    """Parse a nucleotide FASTA file into {record id: sequence}.

    The record id is the header text up to the first space, with '>' removed;
    the sequence is the remaining lines of the record joined together.
    """
    with open(ffn_path, 'r') as f:
        string = f.read()
    return {x.split('\n')[0].replace('>','').split(' ')[0]:''.join(x.split('\n')[1:]) for x in string.split('\n>')}


# Group the rows by genome (the part of the locus tag before the first '_')
# so that each genome's .ffn file is read and parsed once, not once per row.
row_locus_tags = [locus2allele_rev[allele_id] for allele_id in selected_features_final['Allele ID']]
rows_by_genome = defaultdict(list)
for position, locus_tag in enumerate(row_locus_tags):
    rows_by_genome[locus_tag.split('_')[0]].append(position)

nucleotide_sequences = [None] * len(row_locus_tags)
for genome_id, positions in rows_by_genome.items():
    ffn_file = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/%s/%s.ffn'%(genome_id, genome_id)
    gene_seqs = _read_ffn(ffn_file)
    for position in positions:
        nucleotide_sequences[position] = gene_seqs[row_locus_tags[position]]

selected_features_final['allele_nucleotide_sequence'] = nucleotide_sequences

In [14]:
# Save the annotated table; the "transport" cell further down reloads this file.
selected_features_final.to_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/selected_features_final_m.csv')

# metabolism

In [47]:
# Keep the first row for every 'Allele ID'.
# NOTE(review): duplicates presumably come from the left merges above matching
# one allele to several annotation rows - confirm the first row is the one to keep.
selected_features_final  = selected_features_final.drop_duplicates(subset = 'Allele ID')

In [24]:
# Percentage of rows that have a non-empty 'Gene (bartell)' value.
100*selected_features_final.loc[selected_features_final['Gene (bartell)'] != ''].shape[0]/selected_features_final.shape[0]

3.3333333333333335

### virulence

In [25]:
# Rows that carry virulence evidence from either source: a VFDB id or a
# PA14 locus from the bartell virulence sheet.
cols = ['Allele ID','score','gene', 'product','gene_id(VFDB)', 'description(VFDB)', 'Product (bartell)', 'Category (bartell)', 'Gene (bartell)']
has_vfdb_id = selected_features_final['VF_id(VFDB)'] != ''
has_bartell_locus = selected_features_final['PA14 locus (bartell)'] != ''
indices_virulence = set(selected_features_final.index[has_vfdb_id | has_bartell_locus])

# .loc needs a list: recent pandas versions reject a set as indexer, and a
# sorted list also gives a stable row order.
VF_matches = selected_features_final.loc[sorted(indices_virulence)][cols].drop_duplicates()
print(len(VF_matches))
VF_matches

3


Unnamed: 0,Allele ID,score,gene,product,gene_id(VFDB),description(VFDB),Product (bartell),Category (bartell),Gene (bartell)
49,Allele_281963,93.761871,flgG,Flagellar basal-body rod protein FlgG,VFG014215(gi:116049011),Flagella,flagellar basal-body rod protein FlgG,Flagella,flgG
19,Allele_240348,155.789629,,hypothetical protein,,,probable chemotaxis transducer,,
53,Allele_251176,86.136661,,hypothetical protein,VFG041014(gi:15596853),HSI-2,HsiA2,Hcp secretion island-2 encoded type VI secreti...,hsiA2


In [17]:
# Same selection as the previous cell (rows with a VFDB id or a bartell PA14
# locus); it reports whatever `selected_features_final` currently holds.
cols = ['Allele ID','score','gene', 'product','gene_id(VFDB)', 'description(VFDB)', 'Product (bartell)', 'Category (bartell)', 'Gene (bartell)']
has_vfdb_id = selected_features_final['VF_id(VFDB)'] != ''
has_bartell_locus = selected_features_final['PA14 locus (bartell)'] != ''
indices_virulence = set(selected_features_final.index[has_vfdb_id | has_bartell_locus])

# .loc needs a list: recent pandas versions reject a set as indexer, and a
# sorted list also gives a stable row order.
VF_matches = selected_features_final.loc[sorted(indices_virulence)][cols].drop_duplicates()
print(len(VF_matches))
VF_matches

25


Unnamed: 0,Allele ID,score,gene,product,gene_id(VFDB),description(VFDB),Product (bartell),Category (bartell),Gene (bartell)
130,Allele_47015,8.277586,gspD,Putative secretin GspD,VFG040877(gi:15595882),hxc,,,
132,Allele_96891,8.06237,,Calmodulin-sensitive adenylate cyclase,VFG019980(gi:218891856),P. aeruginosa TTSS translocated effectors,adenylate cyclase ExoY,T3SS translocated effectors,exoY
135,Allele_2126,-7.901473,pscG,Type III export protein PscG,VFG000215(gb|NP_250411),TTSS,type III export protein PscG,T3SS,pscG
139,Allele_287919,-7.696631,sasA_17,Adaptive-response sensory-kinase SasA,VFG013996(gi:116052692),Type IV pili biosynthesis,two-component sensor PilS,Type IV pili biosynthesis,pilS
25,Allele_240348,29.318669,,hypothetical protein,,,probable chemotaxis transducer,,
162,Allele_313096,-6.662285,fhuA_3,Ferrichrome outer membrane transporter/phage r...,VFG016094(gi:152984528),Pyoverdine receptors,,,
37,Allele_281963,24.162281,flgG,Flagellar basal-body rod protein FlgG,VFG014223(gi:77457729),Flagella,flagellar basal-body rod protein FlgG,Flagella,flgG
176,Allele_258463,-6.014297,,hypothetical protein,VFG019841(gi:218892686),Flagella,putative flagellar hook-length control protein...,Flagella,
185,Allele_154447,-5.389168,mucB,Sigma factor AlgU regulatory protein MucB,VFG014920(gi:152986342),Alginate regulation,anti-sigma factor MucA,Alginate regulation,mucA
188,Allele_56336,5.34334,vgrG1_4,Actin cross-linking toxin VgrG1,VFG041045(gi:15597569),HSI-3,VgrG3,Hcp secretion island-3 encoded type VI secreti...,vgrG3


### transport

In [19]:
# Reload the saved annotation table and list the transport-related features:
# product mentions transport/porin/efflux, or the subsystem mentions transport.
selected_features_final = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/selected_features_final_m.csv', index_col = ['Unnamed: 0']).fillna('')
cols = ['Allele ID','score','gene', 'product', 'Product (bartell)', 'Category (bartell)', 'Reaction(s)','Subsystem(s)']

product_keywords = ('transport', 'porin', 'efflux')
indices = set()
# Walk the two text columns once instead of doing four .loc lookups per row.
for index, product, subsystem in zip(selected_features_final.index, selected_features_final['product'], selected_features_final['Subsystem(s)']):
    product_lc = product.lower()
    if any(keyword in product_lc for keyword in product_keywords) or 'transport' in subsystem.lower():
        indices.add(index)

# .loc needs a list: recent pandas versions reject a set as indexer.
selected_features_final.loc[sorted(indices)][cols]

Unnamed: 0,Allele ID,score,gene,product,Product (bartell),Category (bartell),Reaction(s),Subsystem(s)
66,Allele_50314,16.152412,yijE_1,putative cystine transporter YijE,,,,
35,Allele_220802,-24.54356,,Zinc transport protein ZntB,,,,
162,Allele_313096,-6.662285,fhuA_3,Ferrichrome outer membrane transporter/phage r...,,,,
195,Allele_29891,5.127276,ttgC,putative efflux pump outer membrane protein TtgC,,,,
108,Allele_51047,-9.914613,dctB_1,C4-dicarboxylate transport sensor protein DctB,,,,
140,Allele_47703,-7.641307,ompP1,Outer membrane protein P1,,,fatty-acid--CoA ligase (n-C14:0) transport AND...,Transport
155,Allele_239678,6.961286,yhhQ,Queuosine precursor transporter,,,,
17,Allele_265363,-39.275609,metQ_2,D-methionine-binding lipoprotein MetQ,,,L-methionine S-oxide transport via ABC system ...,Transport
18,Allele_250942,35.579943,livF_1,High-affinity branched-chain amino acid transp...,,,4-Aminobutyrate transport via ABC system AND L...,Transport
49,Allele_8359,20.694556,,hypothetical protein,,,malonate transport,Transport


In [167]:
# cols = ['score_x','gene', 'product', 'description', 'description(VFDB)', 'Cog cat description', 'COG (grouped)', 'Reaction(s)', 'Subsystem(s)']
# indices_metabolism = set(selected_features_final.loc[selected_features_final['COG (grouped)'] == 'Metabolism'].index.tolist() + selected_features_final.loc[selected_features_final['Reaction(s)'] != ''].index.tolist())
# print(len(indices_metabolism -indices_virulence))
# SF_metabolism = selected_features_final.loc[indices_metabolism - indices_virulence][cols].sort_values(by = 'score_x')
# Counter(SF_metabolism['Cog cat description'])

60


Counter({' Coenzyme transport and metabolism': 4,
         'I': 5,
         'Q': 1,
         'amino acid metabolism and transport': 18,
         'carbohydrate metabolism and transport': 6,
         'cell wall structure and biogenesis': 2,
         'energy production and conversion': 8,
         'inorganic ion transport and metabolism': 11,
         'molecular chaperones and related functions': 1,
         'no functional prediction': 1,
         'nucleotide metabolism': 3})

In [24]:
# Bidirectional nucleotide BLAST between the selected alleles and the genes of
# the TCH1516 reference.
# NOTE(review): 'TCH1516' / 'USA300HOU_RS' look like identifiers of a
# Staphylococcus aureus reference rather than Pseudomonas, and the reference
# file was missing when this cell last ran - confirm the cell belongs here.
features = '%s/functional_annotation/selected_features_final.ffn'%directory
with open(features, 'w') as y:
    for allele_id, nucleotide_sequence in zip(selected_features_final['Allele ID'], selected_features_final['allele_nucleotide_sequence']):
        y.write('>'+allele_id+'\n'+ nucleotide_sequence+'\n')

# Re-key the reference genes by their USA300HOU_RS locus tag. A record whose
# header has no such tag makes the [0] lookup raise IndexError.
reference = '%s/functional_annotation/TCH1516.ffn'%directory
with open(reference, 'r') as f:
    string = f.read()
gene_seqs = {re.findall('USA300HOU_RS[0-9]+', x)[0]:''.join(x.split('\n')[1:]) for x in string.split('\n>')}

reference = reference.replace('.ffn','_m.ffn')
with open(reference, 'w') as y:
    for gene, seq in gene_seqs.items():
        y.write('>'+gene+'\n'+seq+'\n')

blast_directory = '%s/pangenome/BR_results'%directory


def _run_checked(command):
    """Run a shell command, print its exit status and raise if it is non-zero."""
    status = os.system(command)
    print(status)
    if status != 0:
        raise RuntimeError('Command failed with status %d: %s'%(status, command))


# Build a nucleotide BLAST database for each side.
_run_checked('makeblastdb -in %s -parse_seqids -dbtype nucl'%features)
_run_checked('makeblastdb -in %s -parse_seqids -dbtype nucl'%reference)

# Reference -> features, then features -> reference.
BR1_out = '%s/BR1/%s'%(blast_directory, 'TCH1516')
_run_checked('blastn -query %s -db %s -outfmt 6 -out %s'%(reference, features, BR1_out))

BR2_out =  '%s/BR2/%s'%(blast_directory, 'TCH1516')
_run_checked('blastn -query %s -db %s -outfmt 6 -out %s'%(features, reference, BR2_out))

PID_threshold = 50
output_file_name = 'TCH1516'
read_blast_results(blast_directory, PID_threshold, output_file_name)
with open('%s/%s.p'%(blast_directory,output_file_name),'rb') as handle:
    bbh = pickle.load(handle)

# The original compared against `selected_features`, a name that no visible
# cell defines; the table used throughout is `selected_features_final`.
set(bbh['TCH1516'].keys()) - set(selected_features_final['Allele ID'])

FileNotFoundError: [Errno 2] No such file or directory: '/home/yara/Documents/cystic_fibrosis/data/pseudomonas//functional_annotation/TCH1516.ffn'

In [29]:
# PMC4854172
# Look for the literature gene names inside the annotation text of every
# selected feature (case-insensitive substring match).
# NOTE(review): `literature_rev` is not defined in any visible cell (the last
# run failed with NameError). It is used here as a mapping
# {gene name: category} - define or load it before running this cell.

selected_features_final = selected_features_final.fillna('')

matches = []
cols = ['gene', 'product', 'Reaction(s)', 'Subsystem(s)','Product (bartell)', 'Category (bartell)','Gene (bartell)']

# Lower-case every gene name once instead of once per cell of the table.
gene_patterns = [(gene, gene.lower()) for gene in literature_rev.keys()]

for col in cols:
    for index, value in selected_features_final[col].items():
        text = str(value).lower()
        for gene, gene_lc in gene_patterns:
            if gene_lc in text:
                matches.append({'index':index, 'key word': text, 'CFPA gene':gene,
                               'category':literature_rev[gene]})


# Explicit columns so the steps below also work when nothing matched.
matches_df = pd.DataFrame(matches, columns = ['index', 'key word', 'CFPA gene', 'category'])
# Keep only the first match found for each feature row.
matches_df = matches_df.drop_duplicates('index')
# Discard 'mpl' matches; errors='ignore' avoids a KeyError when 'mpl' never matched.
# NOTE(review): presumably 'mpl' is excluded because such a short name matches
# unrelated words - confirm.
matches_df = matches_df.set_index('CFPA gene').drop('mpl', errors = 'ignore').reset_index()
for cat in sorted(set(matches_df['category']), key = str):
    in_category = matches_df.loc[matches_df['category'] == cat]
    print(cat,': ', ', '.join(sorted(set(in_category['CFPA gene']))))
    print(cat,': ', '---------- '.join(sorted(set(in_category['key word']))))
    print('-------------')

# matches_df.to_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/literature_matches_df.csv')

NameError: name 'literature_rev' is not defined

#5e check for overlaps with natgen

In [203]:
# Number of entries in `locus_tags`.
# NOTE(review): `locus_tags` is first built as a set (all mutated genes) and later
# rebound to a dict (list52 locus tag -> gene name), so this count depends on which
# of those cells ran last.
len(locus_tags)

50

In [102]:
# Bidirectional BLAST between the selected alleles and the mutated-gene FASTA.
selected_features_fa = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/selected_features_final.fa'
output_fa = ['/home/yara/Documents/cystic_fibrosis/data/pseudomonas/all_mutated_genes.fa']
blast_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results'
run_bidirectional_blast(selected_features_fa, output_fa, blast_directory)

PID_threshold = 80
output_file_name = 'SF_natgen_genes'
read_blast_results(blast_directory, PID_threshold, output_file_name)

# Context manager so the pickle file handle is closed after loading.
with open('%s/%s.p'%(blast_directory, output_file_name), 'rb') as handle:
    bbh = pickle.load(handle)

Blasting: all_mutated_genes...


In [103]:
# Number of entries in bbh['all_mutated_genes'] divided by the number of rows
# in selected_features_final.
len(bbh['all_mutated_genes'])/selected_features_final.shape[0]

0.17370892018779344

In [48]:
# Same ratio as the previous cell (entries in bbh['all_mutated_genes'] per row of
# selected_features_final), evaluated in a different kernel state.
len(bbh['all_mutated_genes'])/selected_features_final.shape[0]

0.22169811320754718

In [69]:
# CD-HIT cluster of every hit in bbh['all_mutated_genes'], then the annotation
# of the selected features that fall in one of those clusters.
# NOTE: `cdhit_clusters_rev` is built in a cell further down the notebook.
natgen_all = pd.DataFrame(
    [{'list52 locus':x, 'CD-hit ID':cdhit_clusters_rev[y]} for x,y in bbh['all_mutated_genes'].items()],
    # Explicit columns keep 'CD-hit ID' present even when there are no hits.
    columns = ['list52 locus', 'CD-hit ID'],
)
columns = ['gene', 'product', 'description(VFDB)', 'Reaction(s)']

# .loc needs a list: recent pandas versions reject a set as indexer.
shared_clusters = sorted(set(natgen_all['CD-hit ID']) & set(selected_features_final['CD-hit ID']))
selected_features_final.set_index('CD-hit ID').loc[shared_clusters][columns]

Unnamed: 0_level_0,gene,product,description(VFDB),Reaction(s)
CD-hit ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cluster 4540,ttgC,putative efflux pump outer membrane protein TtgC,,
Cluster 1033,,hypothetical protein,,
Cluster 7735,btuD_3,Vitamin B12 import ATP-binding protein BtuD,,molybdate transport via ABC system
Cluster 3081,ompP1,Outer membrane protein P1,,fatty-acid--CoA ligase (n-C14:0) transport AND...
Cluster 1217,cadA_1,Cadmium-transporting ATPase,,Copper export via ATPase AND Copper transport ...
Cluster 9391,hprA,Glycerate dehydrogenase,,glycerate dehydrogenase
Cluster 5398,sasA_2,Adaptive-response sensory-kinase SasA,,
Cluster 12339,,Sensor histidine kinase LiaS,,
Cluster 9633,lipR,Putative acetyl-hydrolase LipR,,
Cluster 795,vgrG1_2,Actin cross-linking toxin VgrG1,,


In [53]:
# CD-HIT cluster and gene name of every list52 hit, restricted to the clusters
# that also contain a selected feature.
# NOTE(review): this needs a `bbh` that has a 'list52genes' entry; the `bbh`
# loaded in the natgen cell above is indexed with 'all_mutated_genes' - confirm
# which pickle must be loaded first.
list52met = pd.DataFrame(
    [{'list52 locus':x, 'CD-hit ID':cdhit_clusters_rev[y], 'Gene name': locus_tags[x]} for x,y in bbh['list52genes'].items()],
    # Explicit columns keep 'CD-hit ID' present even when there are no hits.
    columns = ['list52 locus', 'CD-hit ID', 'Gene name'],
)

# .loc needs a list: recent pandas versions reject a set as indexer.
shared_clusters = sorted(set(list52met['CD-hit ID']) & set(selected_features_final['CD-hit ID']))
list52met.set_index('CD-hit ID').loc[shared_clusters]

Unnamed: 0_level_0,Gene name,list52 locus
CD-hit ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster 1528,,PA3290


In [58]:
# Same overlap as the previous cell (list52 hits vs. selected-feature clusters),
# evaluated on whatever `bbh` and `selected_features_final` currently hold.
# NOTE(review): this needs a `bbh` that has a 'list52genes' entry - confirm
# which pickle must be loaded first.
list52met = pd.DataFrame(
    [{'list52 locus':x, 'CD-hit ID':cdhit_clusters_rev[y], 'Gene name': locus_tags[x]} for x,y in bbh['list52genes'].items()],
    # Explicit columns keep 'CD-hit ID' present even when there are no hits.
    columns = ['list52 locus', 'CD-hit ID', 'Gene name'],
)

# .loc needs a list: recent pandas versions reject a set as indexer.
shared_clusters = sorted(set(list52met['CD-hit ID']) & set(selected_features_final['CD-hit ID']))
list52met.set_index('CD-hit ID').loc[shared_clusters]

Unnamed: 0_level_0,Gene name,list52 locus
CD-hit ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Cluster 716,pelA,PA3064
Cluster 1542,,PA3290


In [42]:
# Covariance between the 300 rows of bootstrap_res with the largest summed
# absolute value, followed by the largest entry of that matrix.
# NOTE: `bootstrap_res` is created in the next cell of the notebook.
row_weight = abs(bootstrap_res).sum(axis = 1)
top_rows = row_weight.sort_values(ascending = False).index.tolist()[:300]
covariance = np.cov(bootstrap_res.loc[top_rows])
max(max(row) for row in covariance)

0.07781107597901471

In [25]:
# Rank every allele by the sum of its absolute values across the bootstrap
# results and attach the CD-HIT cluster annotations.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/machine_learning/bootstrap_rows_SVC_trial_2.p', 'rb') as handle:
    bootstrap_rows = pickle.load(handle)
bootstrap_res = pd.DataFrame(bootstrap_rows).fillna(0)
sorted_features = abs(bootstrap_res).sum(axis = 1).sort_values(ascending = False)

# Series -> one-column frame named 'score'; the former index (integer allele
# numbers) becomes the 'Allele ID' column.
sorted_features = sorted_features.to_frame('score').reset_index().rename(columns = {'index':'Allele ID'})
sorted_features['Allele ID'] = ['Allele_%d'%i for i in sorted_features['Allele ID']]
# `allele2cdhit` comes from the cell that loads the selected features.
sorted_features['CD-hit ID'] = [allele2cdhit[c] for c in sorted_features['Allele ID']]

with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/cdhit_metadatar.p', 'rb') as handle:
    cdhit_metadatar = pickle.load(handle)
cdhit_metadatar = pd.DataFrame(cdhit_metadatar).T
sorted_features = sorted_features.merge(cdhit_metadatar, left_on = 'CD-hit ID', right_index = True, how = 'left').fillna('')
sorted_features.head(3)

Unnamed: 0,Allele ID,score,CD-hit ID,EC_number,db_xref,gene,product
0,Allele_240059,117.822191,Cluster 8383,,,,hypothetical protein
1,Allele_247304,112.393665,Cluster 11768,,COG:COG4175,ousV,Glycine betaine/choline transport system ATP-b...
2,Allele_22282,87.425383,Cluster 1189,,,,hypothetical protein


In [26]:
# Rows of sorted_features whose 'gene' value contains the substring 'mex'.
sorted_features.loc[[index for index in sorted_features.index if 'mex' in sorted_features.loc[index, 'gene']]]

Unnamed: 0,Allele ID,score,CD-hit ID,EC_number,db_xref,gene,product
442,Allele_94113,2.062471,Cluster 7290,,,mexA_2,Multidrug resistance protein MexA
949,Allele_248141,0.340296,Cluster 7017,,,mexA_1,Multidrug resistance protein MexA
1437,Allele_257152,0.089151,Cluster 454,,COG:COG0841,mexB_1,Multidrug resistance protein MexB
1746,Allele_25203,0.033741,Cluster 7017,,,mexA_1,Multidrug resistance protein MexA


In [None]:
# Fraction of the selected features' CD-HIT clusters that also contain a BLAST
# hit stored in blast_hits.p.
cdhit_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/pseudomonas_pangenome_cdhit.clstr'
cdhit_clusters = get_cdhit_clusters(cdhit_directory)
# Invert {cluster: members} into {member: cluster}.
cdhit_clusters_rev = {y:x for x,z in cdhit_clusters.items() for y in z}

with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results/blast_hits.p', 'rb') as handle:
    blast_hits = pickle.load(handle)
# NOTE(review): this expects blast_hits as {key: {ref_gene: mapped_genes}}, while
# the commented-out cell that writes blast_hits.p builds a flat
# {gene: set(subjects)} mapping - confirm the layout of the pickle on disk.
list52_generous = [cdhit_clusters_rev[x] for key, val in blast_hits.items() for ref_gene, mapped_genes in val.items() for x in mapped_genes]

output_directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/allele_matrix/'
with open('%s/allele2cdhit.p'%output_directory, 'rb') as handle:
    allele2cdhit = pickle.load(handle)
# {cluster: [alleles]} - the reverse of allele2cdhit.
allele2cdhit_rev = defaultdict(list)
for allele, cdhit in allele2cdhit.items():
    allele2cdhit_rev[cdhit].append(allele)

# Take the allele identifiers from the 'Allele ID' column: the DataFrame index
# is a plain row label, so building 'Allele_<index>' names from it would look
# up the wrong alleles.
SF_cdhit = {allele2cdhit[allele_id] for allele_id in selected_features_final['Allele ID']}
len(SF_cdhit & set(list52_generous))/len(SF_cdhit)

#5e Maybe some manual curation here and there?

In [36]:
# Reload the indel spreadsheet (skipping its first 5 rows), blank out missing cells
# and discard the spreadsheet's unnamed first column.
SNPs_FT = (
    pd.read_excel('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/indel_list.xlsx', skiprows = 5)
    .fillna('')
    .drop(columns = 'Unnamed: 0')
)

# Row labels of the intergenic indels: all deletions first, then all insertions.
# The drop itself is commented out below, so in this cell `to_drop` is only computed,
# not applied.
to_drop = [
    row_label
    for intergenic_kind in ('Intergenic Deletion', 'Intergenic Insertion')
    for row_label in SNPs_FT.loc[SNPs_FT['Type of mutation'] == intergenic_kind].index.tolist()
]
# to_drop += SNPs_FT.loc[SNPs_FT['Gene locus tag'] == 'Gene locus tag'].index.tolist() + SNPs_FT.loc[SNPs_FT['Gene locus tag'] == ''].index.tolist()
# SNPs_FT = SNPs_FT.drop(to_drop)

# indel_FT = pd.read_excel('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/indel_list.xlsx', skiprows = 5)
# del indel_FT['Unnamed: 0']
# to_drop = indel_FT.loc[indel_FT['Type of mutation'] == 'Intergenic Deletion'].index.tolist() + indel_FT.loc[indel_FT['Type of mutation'] == 'Intergenic Insertion'].index.tolist() 
# to_drop += indel_FT.loc[indel_FT['Gene locus tag'] == 'Gene locus tag'].index.tolist() + indel_FT.loc[indel_FT['Gene locus tag'] == ''].index.tolist()
# indel_FT = indel_FT.drop(to_drop)

Unnamed: 0,Chromosome,Position in chromosome,Unnamed: 3,Insertion/Deletion,Mutated allele,Position of mutation in chromosome,Length of indel,Type of mutation,Gene locus tag,Gene name,...,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48
3,gi|110645304|ref|NC_002516.2|,74465,*,+CC,QRY,74465-74466ins(CC),2,Insertion,PA0061,,...,,,,,,,,,,
4,gi|110645304|ref|NC_002516.2|,74465,*,+C,QRY,74465-74466ins(C),1,Insertion,PA0061,,...,,,,,,,,,,
5,gi|110645304|ref|NC_002516.2|,74465,*,-C,QRY,74466-74466delta(C),1,Deletion,PA0061,,...,,,,,,,,,,
6,gi|110645304|ref|NC_002516.2|,100899,*,+G,QRY,100899-100900ins(G),1,Insertion,PA0082,TssA1,...,,,,,,,,,,
7,gi|110645304|ref|NC_002516.2|,119434,*,-G,QRY,119435-119435delta(G),1,Deletion,PA0098,,...,,,,,,,,,,
8,gi|110645304|ref|NC_002516.2|,225590,*,-G,QRY,225591-225591delta(G),1,Deletion,PA0195.1,pntAB,...,,,,,,,,,,
10,gi|110645304|ref|NC_002516.2|,246679,*,-G,QRY,246680-246680delta(G),1,Deletion,PA0218,,...,,,,,,,,,,
12,gi|110645304|ref|NC_002516.2|,309627,*,-G,QRY,309628-309628delta(G),1,Deletion,PA0274,,...,,,,,,,,,,
13,gi|110645304|ref|NC_002516.2|,325100,*,+G,QRY,325100-325101ins(G),1,Insertion,PA0289,gpuR,...,,,,,,,,,,
14,gi|110645304|ref|NC_002516.2|,401173,*,-C,QRY,401174-401174delta(C),1,Deletion,PA0357,mutM,...,,,,,,,,,,


In [28]:
# Display the selected-features table as the cell's output (bare last expression,
# so the notebook renders the DataFrame's rich repr).
selected_features_final

Unnamed: 0,Allele ID,score,CD-hit ID,Allele sequence,EC_number,db_xref,gene,product,VFDB hit,VF_id(VFDB),...,nucleotide_sequence(VFDB),organism(VFDB),Reaction(s),Subsystem(s),locus_tag,PA14 locus (bartell),Product (bartell),Category (bartell),Gene (bartell),allele_nucleotide_sequence
0,Allele_52237,-546.984005,Cluster 3757,MRHLLLRHEALDAEGFAAQLAGTPGEVAQAILGAAREGLTEAQALL...,,,esiB,Secretory immunoglobulin A-binding protein EsiB,,,...,,,,,,,,,,ATGCGCCACCTGCTGCTTCGTCACGAAGCGCTGGACGCCGAAGGTT...
1,Allele_242087,513.546898,Cluster 1528,MALFDEQVHDSRAWFMNTSAIGPREPFTDYFRYRLVHFDNESNKRL...,,,,hypothetical protein,,,...,,,,,,,,,,GTGGCGCTGTTCGACGAACAGGTGCACGACTCCCGCGCCTGGTTCA...
2,Allele_273742,-428.705236,Cluster 15711,MNETVSLETREVRFLSVFDFDGTLTYRDSFVPFLRFAFGNRVFVRR...,3.1.3.3,,serB,Phosphoserine phosphatase,,,...,,,,,,,,,,ATGAACGAGACCGTATCCCTCGAAACACGCGAAGTGCGCTTTCTCT...
3,Allele_276630,-365.654298,Cluster 10034,MHSLRFFSNAEVAERLSYPQLIEALRIGLAKPCSAPLRSCHALPAQ...,1.5.1.49,COG:COG2423,,Delta(1)-pyrroline-2-carboxylate reductase,,,...,,,,,,,,,,ATGCACAGCCTGCGCTTTTTCAGCAACGCCGAGGTCGCCGAGCGCC...
4,Allele_55447,-347.360962,Cluster 9391,MSHALRAVFLDHASLDLGDLDMRPLRAAFDELQLHTASQPQEVAAR...,1.1.1.29,COG:COG1052,hprA,Glycerate dehydrogenase,,,...,,,glycerate dehydrogenase,"Glycine, serine and threonine metabolism",PA14_61210,,,,,ATGAGCCATGCCCTGCGCGCCGTTTTCCTCGACCATGCCTCCCTCG...
5,Allele_293487,-307.213499,Cluster 11115,MRALAEFTMRGRMQATLAVAGSAVVPLLFWLSAAAGSLVLLRRGFG...,,,,hypothetical protein,,,...,,,,,,,,,,ATGCGCGCACTGGCTGAGTTCACCATGCGCGGCCGCATGCAGGCCA...
6,Allele_253486,-290.144916,Cluster 19471,MEVVALALALAAYLGLAAACLWLRRELRRLQAALAQQAEEGAARER...,,,cpoB_2,Cell division coordinator CpoB,,,...,,,,,,,,,,ATGGAGGTCGTCGCGCTGGCACTGGCGCTCGCCGCCTACCTCGGCC...
7,Allele_266708,-262.603769,Cluster 680,MTTQQLLTPHQSQYFAWLLTRRAAGDSVESLASTLVDSQVDLNPHQ...,3.6.4.-,,rapA_3,RNA polymerase-associated protein RapA,,,...,,,,,,,,,,TTGACGACACAGCAGCTTCTGACGCCGCACCAGAGCCAGTACTTTG...
8,Allele_240337,-262.421041,Cluster 2695,MNALLALPDLQPERLFVQSGDVRLAVHCWGAPDNDKPTLLMVHGYP...,1.-.-.-,COG:COG1028,ephD,putative oxidoreductase EphD,,,...,,,,,,,,,,ATGAACGCGCTCCTCGCCCTCCCCGACCTGCAGCCGGAACGGCTGT...
9,Allele_248932,257.877684,Cluster 1241,MREKQESGSVPVPAEFMSAQSAIVGLRGKDLLTTVRSLAVHGLRQP...,2.3.1.-,COG:COG3243,phaC_2,Poly(3-hydroxyalkanoate) polymerase subunit PhaC,,,...,,,poly(3-hydroxyalkanoic acid) synthase,Butanoate metabolism,PA14_66840,,,,,ATGCGAGAAAAGCAGGAATCGGGTAGCGTGCCGGTGCCCGCCGAGT...


In [30]:
# Compact view of the selected features: score, gene, product and reaction columns
# only, ordered by ascending score.
selected_features_final[['score', 'gene', 'product', 'Reaction(s)']].sort_values('score')

Unnamed: 0,score,gene,product,Reaction(s)
0,-546.984005,esiB,Secretory immunoglobulin A-binding protein EsiB,
2,-428.705236,serB,Phosphoserine phosphatase,
3,-365.654298,,Delta(1)-pyrroline-2-carboxylate reductase,
4,-347.360962,hprA,Glycerate dehydrogenase,glycerate dehydrogenase
5,-307.213499,,hypothetical protein,
6,-290.144916,cpoB_2,Cell division coordinator CpoB,
7,-262.603769,rapA_3,RNA polymerase-associated protein RapA,
8,-262.421041,ephD,putative oxidoreductase EphD,
10,-240.477246,pyrB,Aspartate carbamoyltransferase,aspartate carbamoyltransferase
12,-218.599529,gpuA,Guanidinopropionase,arginase AND agmatinase


In [None]:
# Manual curation notes, keyed by gene name. A bare `'key': 'value'` line is not valid
# Python (illegal annotation target), so the note is stored in a dict instead.
manual_curations = {
    # Source: "The role of polyhydroxyalkanoate biosynthesis by Pseudomonas aeruginosa
    # in rhamnolipid and alginate production as well as stress tolerance and biofilm
    # formation"
    'phaC_2': 'involved in alginate production',
}

