In [1]:
# https://www.cell.com/trends/microbiology/pdf/S0966-842X(16)00021-4.pdf
# https://www.nature.com/articles/s41564-017-0001-x.pdf?draft=marketing staph death
# https://www.pnas.org/content/pnas/103/22/8305.full.pdf

In [4]:
# Ideas: map each allele to a database of known functional proteins and identify mutations with respect to the functional form.
# Alternative (not as good): map to the CD-HIT cluster representative.

from bioservices import UniProt
import pickle
import pandas as pd
import cobra
from collections import defaultdict, OrderedDict, Counter
from cobra import Reaction, Metabolite, Model, Gene
import time
from glob import glob

from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.patches as mpatches
from Bio import SeqIO
import re
import os
import scipy
import urllib

import sys
sys.path.append("/home/yara/Documents/PseudoFind")
from PseudoFind.functional_annotations import *
from PseudoFind.pangenome_cmds import *
%matplotlib inline

In [5]:
# The problem with SwissProt is that only amino-acid-level alignments are possible; otherwise I would have to go back and retrieve the nucleotide sequences using the cross-references.
# Another issue is that only a reduced number of matches is found.
# Another source of reference CDSs would be Bartell's curated genes.
# Ultimately, I just need to find a reliable functional form for each of the 210 selected features.

In [6]:
def get_AAseqs_from_BR_json(json_file, BR):
    """Copy the aligned hit/query sequences from a BLAST JSON report into ``BR``.

    Parameters
    ----------
    json_file : str
        Path to a JSON file whose top-level ``'BlastOutput2'`` list holds one
        entry per query, each exposing ``report -> results -> search``.
    BR : pandas.DataFrame
        Table with an ``'Allele ID'`` column (compared against each report's
        ``query_title``) and a ``'Uniprot ID'`` column (the text before the
        first ``'.'`` is compared against the hit descriptions' ``accession``).

    Returns
    -------
    pandas.DataFrame
        The same ``BR`` object, modified in place, with ``'hseq'`` and
        ``'qseq'`` filled in for every row.

    Raises
    ------
    IndexError
        If a row's allele has no report, or its accession is not listed in the
        descriptions of that report's first hit.
    """
    import json
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(json_file) as handle:
        data = json.load(handle)

    reports = data['BlastOutput2']
    # Index the reports once instead of rescanning all of them for every row.
    # setdefault keeps the FIRST report for a repeated query title, which is
    # what the per-row scan used to select.
    first_report = {}
    for position, entry in enumerate(reports):
        first_report.setdefault(entry['report']['results']['search']['query_title'], position)

    for index in BR.index:
        allele_id = BR.loc[index, 'Allele ID']
        if allele_id not in first_report:
            raise IndexError('no BLAST report with query_title %r' % (allele_id,))

        # Only the first hit of the matching report is ever inspected.
        top_hit = reports[first_report[allele_id]]['report']['results']['search']['hits'][0]
        accession = BR.loc[index, 'Uniprot ID'].split('.')[0]
        hit_i = next(
            (i for i, described in enumerate(top_hit['description'])
             if described['accession'] == accession),
            None,
        )
        if hit_i is None:
            raise IndexError('accession %r is not described by the first hit of %r'
                             % (accession, allele_id))

        # NOTE(review): the position found in 'description' is reused to index
        # 'hsps'; confirm that the two lists are really parallel in this report.
        hsp = top_hit['hsps'][hit_i]
        hit_seq, query_seq = hsp['hseq'], hsp['qseq']
        BR.loc[index, 'hseq'] = hit_seq
        BR.loc[index, 'qseq'] = query_seq
    return BR

In [11]:
# query acc.ver, subject acc.ver, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score, % positives
# Load the tabular BLAST-vs-UniProt result, keep the best confident hit per
# allele, then annotate each hit from its UniProt XML record.
directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/'
BR = pd.read_csv('%s/blast_uniprot_res.csv'%directory,
           names = ['Allele ID', 'Uniprot ID', 'PID','aln_length', 'mismatchCount', 'gapOpenCount', 'queryStart', 'queryEnd', 'subjectStart', 'subjectEnd', 'eVal', 'bit score', '% positives'])
# Keep hits with e-value below 1e-9 and more than 80 % identity ...
BR = BR.loc[(BR['eVal'] < 10**(-9)) & (BR['PID'] > 80)]
# ... and, per allele, only the hit with the highest percent identity.
BR = BR.sort_values(by = 'PID', ascending = False).drop_duplicates(subset = 'Allele ID', keep = 'first')

json_file = '%s/blast_uniprot_res.json'%directory
# BR = get_AAseqs_from_BR_json(json_file, BR)

# One client is enough for all lookups; it used to be re-created per row.
u = UniProt()
for index in BR.index:

    uniprot_id = BR.loc[index, 'Uniprot ID'].split('.')[0]
    res = u.retrieve( uniprot_id, frmt="xml")
    # The patterns are raw strings so that '\>', '\=' and '\ ' reach the regex
    # engine unchanged instead of being flagged as invalid string escapes.
    # NOTE(review): the gene name is taken as the SECOND text run that precedes
    # a closing </name tag; this depends on the record layout -- confirm.
    BR.loc[index, 'gene name'] = re.findall(r'(?<=\>).+?(?=\<\/name)', res.data)[1]
    BR.loc[index, 'patric'] = ', '.join(re.findall(r'(?<=")fig\|.+?(?=")', res.data))
    BR.loc[index, 'annotation'] = ','.join(re.findall(r'(?<=description\=").+?(?=")', res.data))
    BR.loc[index, 'KEGG'] = ','.join(re.findall(r'(?<=type\="KEGG"\ id\=").+?(?=")', res.data))

# Hits without any KEGG cross-reference cannot be mapped to a locus tag.
BR = BR.drop(BR.loc[BR['KEGG'] == ''].index)
# 'KEGG_f' is the part after the first ':' of the first KEGG id.
BR['KEGG_f'] = [x.split(':')[1].split(',')[0] for x in BR['KEGG']]

In [12]:
# Stack the feature tables of the three GenBank files into one lookup table.
# PAO1 is indexed by 'locus_tag', PA14 and PA7 by 'old_locus_tag'.
# pd.concat replaces the chained DataFrame.append calls (append no longer
# exists in pandas 2.x). sort=True keeps the column sorting that the recorded
# run applied when the tables' columns differ, and silences its warning.
FT = pd.concat(
    [
        get_FT_from_gb('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/PAO1_1.gb').set_index('locus_tag'),
        get_FT_from_gb('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/PA14.gb').set_index('old_locus_tag'),
        get_FT_from_gb('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/PA7.gb').set_index('old_locus_tag'),
    ],
    sort = True,
)
# Attach the reference nucleotide sequence to every BLAST hit via its KEGG
# locus tag; hits without a match get an empty string instead of NaN.
BR_all = BR.merge(FT[['Nucleotide_sequence']], left_on = 'KEGG_f', right_index = True, how = 'left').drop_duplicates().fillna('')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [15]:
# Reference set 1: alleles annotated through the UniProt BLAST hits.
annotated = BR_all[['Allele ID', 'KEGG_f', 'Nucleotide_sequence', 'gene name', 'annotation']].rename(columns = {'KEGG_f':'Ref locus_tag'})
annotated['reference'] = ['blastp_uniprot']*len(annotated)

# Reference set 2: selected features that carry a 'Reaction(s)' entry, labelled
# 'Bartell'. Their reference sequence is looked up in FT by locus tag; this is
# an inner merge, so features whose locus tag is absent from FT are dropped.
selected_features_final = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/selected_features_final_m.csv', index_col = ['Unnamed: 0']).fillna('')
bartell = selected_features_final.loc[selected_features_final['Reaction(s)'] != ''].set_index('Allele ID').reset_index().rename(columns = {'Reaction(s)':'annotation', 'gene':'gene name', 'locus_tag':'Ref locus_tag'})
bartell = bartell.merge(FT[['Nucleotide_sequence']], left_on = 'Ref locus_tag', right_index = True)
bartell['reference'] = ['Bartell']*len(bartell)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# The row index is intentionally not reset here, matching the old append call.
annotated = pd.concat([annotated, bartell[annotated.columns]])

In [16]:
# Reference set 3: alleles with a VFDB hit, using the VFDB nucleotide FASTA.
directory = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation'
with open('%s/VFDB_pseudomonas.ffn'%directory, 'r') as f:
    string = f.read()

# Map each full header line (with '>' removed) to its sequence. All sequence
# lines of a record are concatenated; the previous parser kept only the first
# line, which truncates any record that is wrapped over several lines.
gene_seqs = {}
for record in string.split('\n>'):
    header, body = record.partition('\n')[::2]
    gene_seqs[header.replace('>','')] = body.replace('\n', '')

VFDB = selected_features_final.loc[selected_features_final['VFDB hit'] != '']
VFDB = VFDB.rename(columns = {'gene_id(VFDB)': 'Ref locus_tag', 'description(VFDB)':'annotation', 'gene_name(VFDB)':'gene name'})
# Raises KeyError if a VFDB gene id has no identically named FASTA header.
VFDB['Nucleotide_sequence'] = [gene_seqs[x] for x in VFDB['Ref locus_tag']]
VFDB['reference'] = ['VFDB']*len(VFDB)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
annotated = pd.concat([annotated, VFDB[annotated.columns]])

In [17]:
# Keep one reference per allele (the first one appended wins).
annotated = annotated.drop_duplicates(subset = ['Allele ID'])
# Discard alleles without a reference sequence using a boolean mask. The frame
# was assembled without resetting the row index, so labels can repeat; dropping
# by label would also remove valid rows that share a label with an empty one.
annotated = annotated.loc[annotated['Nucleotide_sequence'] != '']
# Renumber the rows 0..n-1 and discard the old labels.
annotated = annotated.reset_index(drop = True)

In [31]:
# Reference set 4: literature benchmark matches, used only for alleles that
# have no reference yet.
annotated1 = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/literature_benchmark_matches.csv', index_col = ['Unnamed: 0']).rename(columns = {'subject':'Allele ID'})
# Build the list of new allele IDs in file order. Indexing .loc with a set is
# rejected by recent pandas and produced a hash-dependent row order.
candidate_ids = annotated1['Allele ID'].drop_duplicates()
new_ids = candidate_ids[~candidate_ids.isin(set(annotated['Allele ID']))].tolist()
annotated1 = annotated1.set_index('Allele ID').loc[new_ids].reset_index().drop_duplicates(subset = ['Allele ID'])
annotated1 = annotated1.rename(columns = {'locus_tag':'Ref locus_tag', 'gene_name':'gene name', 'description':'annotation', 'nucleotide_sequence':'Nucleotide_sequence', 'evidence':'reference'})
# NOTE(review): `cols` is computed but not applied to the concatenation below,
# so any extra column of annotated1 is carried along -- confirm this is intended.
cols = list(set(annotated.columns) & set(annotated1.columns))
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# sort=True keeps the alphabetical column order visible in the recorded output
# when the two frames' columns are not already aligned.
annotated = pd.concat([annotated, annotated1], sort = True)
# Inner merge: attach each allele's own nucleotide sequence. Alleles missing
# from selected_features_final are dropped and the row index is renumbered.
annotated  = annotated.merge(selected_features_final[['Allele ID', 'allele_nucleotide_sequence']], on = 'Allele ID')

# align and get mutations

In [46]:
from PseudoFind.lof_detection import *

In [61]:
# Align every allele against its reference at the nucleotide ('-n') and the
# protein ('-p') level and store the differences as ' AND '-joined text columns.
#
# NOTE(review): `Seq` and `get_alignment_summary` are not imported by name in
# the visible cells; presumably they come from the PseudoFind star imports --
# confirm, or import them explicitly in the import cell.

# An indel counts as "major" when its sequence is longer than this many
# characters (nucleotides and amino acids respectively).
MAJOR_INDEL_NT = 30
MAJOR_INDEL_AA = 10


def _mutation_columns(point_mutations, deletions, insertions, suffix, major_length):
    """Format one alignment summary as ordered (column name, text) pairs.

    `deletions` and `insertions` are used as mappings whose key and value are
    concatenated ('%s%s'); the output suggests position -> sequence, but that
    is defined by get_alignment_summary, not here.
    """
    return [
        ('SNP %s'%suffix, ' AND '.join(point_mutations)),
        ('deletions %s'%suffix, ' AND '.join(['%s%s'%(x,y) for x,y in deletions.items()])),
        ('insertions %s'%suffix, ' AND '.join(['%s%s'%(x,y) for x,y in insertions.items()])),
        ('major deletions %s'%suffix, ' AND '.join(['%s%s'%(x,y) for x,y in deletions.items() if len(y) > major_length])),
        ('major insertions %s'%suffix, ' AND '.join(['%s%s'%(x,y) for x,y in insertions.items() if len(y) > major_length])),
    ]


for index in annotated.index:
    allele_id = annotated.loc[index, 'Allele ID']
    ref_seq = annotated.loc[index, 'Ref locus_tag']

    # --- nucleotide level ---
    sequences = {ref_seq: annotated.loc[index, 'Nucleotide_sequence'], allele_id: annotated.loc[index, 'allele_nucleotide_sequence']}
    point_mutations, deletions, insertions = get_alignment_summary(ref_seq, allele_id, sequences)
    for column, value in _mutation_columns(point_mutations, deletions, insertions, '-n', MAJOR_INDEL_NT):
        annotated.loc[index, column] = value
    # Record a changed start codon as 'REF->ALLELE'; empty when identical.
    annotated.loc[index, 'start -n'] = '' if sequences[ref_seq][:3] == sequences[allele_id][:3] else '%s->%s'%(sequences[ref_seq][:3], sequences[allele_id][:3])

    # --- protein level: translate both sequences and align again ---
    sequences = {ref_seq: Seq(annotated.loc[index, 'Nucleotide_sequence']).translate(), allele_id: Seq(annotated.loc[index, 'allele_nucleotide_sequence']).translate()}
    point_mutations, deletions, insertions = get_alignment_summary(ref_seq, allele_id, sequences)
    for column, value in _mutation_columns(point_mutations, deletions, insertions, '-p', MAJOR_INDEL_AA):
        annotated.loc[index, column] = value
    # 0-based positions of every '*' in the translated allele, in ascending
    # order. They used to be joined from a set, i.e. in arbitrary order. The
    # pattern is a raw string so that '\*' is not an invalid string escape.
    annotated.loc[index, 'stops'] = ' AND '.join([str(x.start()) for x in re.finditer(r'\*', str(sequences[allele_id]))])
    # Record a changed first residue as 'REF->ALLELE'; empty when identical.
    annotated.loc[index, 'start -p'] = '' if sequences[ref_seq][0] == sequences[allele_id][0] else '%s->%s'%(sequences[ref_seq][0], sequences[allele_id][0])

annotated.to_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/alignments_v1.csv')

In [75]:
# Print a one-word flag per allele: 'frameshift' when the protein-level
# differences cover more letters than the nucleotide-level ones, and
# 'inframe_indel' when a major nucleotide insertion or deletion was recorded.
for i in annotated.index:
    record = annotated.loc[i].to_dict()

    # Total number of capital letters that directly follow a digit in the
    # joined nucleotide-level mutation strings.
    SNP_n, ins_n, del_n = record['SNP -n'], record['insertions -n'], record['deletions -n']
    joined_n = ' AND '.join([SNP_n, ins_n, del_n])
    tot_n = sum(map(len, re.findall('(?<=[0-9])[A-Z]+', joined_n)))

    # Same count for the protein-level mutation strings.
    SNP_p, ins_p, del_p = record['SNP -p'], record['insertions -p'], record['deletions -p']
    joined_p = ' AND '.join([SNP_p, ins_p, del_p])
    tot_p = sum(map(len, re.findall('(?<=[0-9])[A-Z]+', joined_p)))

    frameshift = tot_n < tot_p
    if frameshift:
        print('frameshift')

    inframe_indel = not (record['major deletions -n'] == '' and record['major insertions -n'] == '')
    if inframe_indel:
        print('inframe_indel')

inframe_indel
frameshift
inframe_indel
inframe_indel
inframe_indel
inframe_indel
inframe_indel
inframe_indel
inframe_indel
inframe_indel
inframe_indel


In [None]:
# Allele ID singled out for inspection. Note that the name `allele` is also
# used as a loop variable in the cell that builds allele2cdhit_rev.
allele = 'Allele_154447'

In [65]:
# Show only the alleles whose protein-level alignment has a non-empty
# 'insertions -p' entry.
annotated.loc[annotated['insertions -p'] != '']

Unnamed: 0,Allele ID,Nucleotide_sequence,Ref locus_tag,annotation,gene name,reference,allele_nucleotide_sequence,SNP -n,deletions -n,insertions -n,major deletions -n,major insertions -n,start -n,SNP -p,deletions -p,insertions -p,major deletions -p,major insertions -p,stops,start -p
5,Allele_258341,ATGCTGATCCTGCGCGGCGCTCCCGCCCTTTCCGCTTTCCGCCACG...,PA3763,"Phosphoribosylformylglycinamidine synthase,Glu...",purL,blastp_uniprot,ATGCTGATCCTGCGCGGCGCTCCCGCCCTTTCCGCTTTCCGCCACG...,C1578T AND G2091A AND G2736T AND C3249T AND G1...,1030A,1032G,,,,V726I,852A,854V,,,1298,
6,Allele_235559,ATGAGCGTGGAAACCCAAAAAGAGACACTGGGCTTTCAGACTGAAG...,PA1596,"Chaperone protein HtpG,A; substrate-binding,B,C",htpG,blastp_uniprot,ATGAGCGTGGAAACCCAAAAAGAGACACTGGGCTTTCAGACTGAAG...,A639G AND T883C AND C528T,502C AND 461C,504T AND 463G,,,,,294Y,296H,,,634,
10,Allele_304107,GTGCTTGAAGCCTACCGCAAACACGTAGAAGAGCGTGCCGCCCAGG...,PA1787,"Aconitate hydratase B,Substrate binding,Substr...",acnB,blastp_uniprot,GTGCTTGAAGCCTACCGCAAACACGTAGAAGAGCGTGCCGCCCAGG...,G1713A AND G1779A AND C1920G AND C230G,2048A AND 197A AND 1927C AND 310A,2050G AND 200T AND 1929T AND 312G,,,,D640E,76T,78S,,,869,
13,Allele_54492,ATGGACGGCCTGCGCCTGCGCTTCCGCCGCGCCTACCCCGGCTTCG...,PA1861,"Molybdenum import ATP-binding protein ModC,ABC...",modC,blastp_uniprot,ATGGACGGCCTGCGCCTGCGCTTCCGCCGCGCCTACCCCGGCTTCG...,C705T AND G854A AND T783C,377T,379C AND 1085CGGCGCCT,,,,R285H,,361CGA,,,364,
15,Allele_156781,GTGTATACGCCAGCAAACAATCATAACAGGAGCTTGGCCATGAGTA...,PA1538,"Baeyer-Villiger monooxygenase,FAD,NADP,NADP,NA...",PA1538,blastp_uniprot,ATGAGTACCCAACCCACCCCTGCCGCCGCCCGGCACTGCAAGGTCG...,T362C AND G1134A,0GTGTATACGCCAGCAAACAATCATAACAGGAGCTTGGCC AND 1...,1149C,0GTGTATACGCCAGCAAACAATCATAACAGGAGCTTGGCC,,GTG->ATG,S383T,0VYTPANNHNRSLA AND 119L,121P,0VYTPANNHNRSLA,,514,V->M
23,Allele_154447,ATGAGTCGTGAAGCCCTGCAGGAAACTCTGTCCGCTGTGATGGATA...,PA0763,"Sigma factor AlgU negative regulatory protein,...",mucA,blastp_uniprot,ATGAGGCGTGAAGCCCTGCAGGAAACTCTGTCCGCTGTGATGGATA...,T6G,340A,342G,,,,,1S,3R,,,194,
33,Allele_272276,ATGCGTGAGGAAACCCCCGAGCAGCCCGCGCCGTTGCGCAGCGGCT...,PA14_26470,Cobalt-precorrin-5B C(1)-methyltransferase,cbiD,blastp_uniprot,ATGCGTGAGGAAACCCCCGAGCAGCCCGCGCCGTTGCGCAGCGGCT...,G964N AND T885C AND T837C AND T859C AND A892G ...,871C,873T,,,,S303C AND V322X,297T,299A,,,366,
36,Allele_260631,ATGAACGACAGCATCCAACTGAGCGGCCTGTCCCGACAGCTCGTCC...,PA4526,"Type 4 fimbrial assembly protein PilB,ATP,In s...",pilB,blastp_uniprot,ATGAACGATTCCACGCCTCTCAGCGGCCTTGCCAAGCAACTGGTCC...,G1029C AND G1545C AND G21CTC AND C279G AND G53...,514A AND 521TGC AND 13T AND 16AA AND 530A AND ...,256GCC AND 512G AND 8TT AND 267G AND 525GCG AN...,,,,IQ5TP AND L138V AND SR11AK AND D141E AND QA16L...,128I AND 149GL AND 152G AND 167DK AND 170T AND...,130L AND 154GG AND 155LG AND 49Q AND 165KAE AN...,,,566,
38,Allele_55447,ATGAGCCATGCCCTGCGCGCCGTTTTCCTCGACCATGCCTCCCTCG...,PA14_61210,glycerate dehydrogenase,hprA,Bartell,ATGAGCCATGCCCTGCGCGCCGTTTTCCTCGACCATGCCTCCCTCG...,T340C AND G661T AND A345G AND G106A AND C396T,390A AND 558C AND 115A AND 56C AND 216C,392G AND 561T AND 117G AND 218T AND 59T,,,,A36T AND A221S,130S,132G,,,323,
45,Allele_13648,GTGACCGCTCCCTTCAACGCCTTGCTGATCGCCAACCGCGGCGAGA...,PA14_46320,pyruvate carboxylase,accA_2,Bartell,GTGACCGCTCCCCCCTTCAACGCCTTGCTGATCGCCAACCGCGGCG...,A642G AND T3138C AND C138G AND G1803T AND C308...,518A AND 1601C AND 402G AND 1476CC AND 931T AN...,1921T AND 520G AND 12CCC AND 406A AND 933C AND...,,,,K1027E AND K476E AND A837P AND V136I AND P493S...,351A,353V AND 4P,,,1096,


In [None]:
# Path of the .fna file of genome 287.9335 under the prokka/ data directory.
fna_file = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/287.9335/287.9335.fna'


In [None]:
# Shell command that runs `mlst` on every prokka .fna file and appends (>>) the
# result to tseeman_mlst.csv. It is only stored as a string here, not executed.
mlst_cmd = 'mlst /home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/*/*.fna >> /home/yara/Documents/cystic_fibrosis/data/pseudomonas/tseeman_mlst.csv'

In [77]:
# Display the first path matched by the prokka .fna glob pattern
# (raises IndexError when nothing matches).
glob('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/*/*.fna')[0]

'/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/287.9335/287.9335.fna'

In [56]:
# Load the allele -> CD-HIT cluster mapping and invert it into
# cluster -> [alleles].
# The with-block closes the file handle, which was previously left open.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('%s/allele_matrix/allele2cdhit.p'%directory, 'rb') as handle:
    allele2cdhit = pickle.load(handle)
allele2cdhit_rev = defaultdict(list)
for allele, cdhit in allele2cdhit.items():
    allele2cdhit_rev[cdhit].append(allele)

In [None]:
# Protein FASTA files of the genomes listed in FT_LRI['Genome ID'].
# NOTE(review): FT_LRI is not defined in any visible cell -- it must already
# exist in the kernel.
# The genome IDs are collected once into a set; the list used to be rebuilt
# for every globbed path.
selected_genome_ids = set(FT_LRI['Genome ID'])
fasta_files = [x for x in glob('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/*/*.faa') if x.split('/')[-1].split('.faa')[0] in selected_genome_ids]

In [59]:
# Collect the nucleotide sequence of every allele that shares a CD-HIT cluster
# with `allele_id`. `allele_id` is whatever an earlier cell left in the kernel,
# so the result depends on execution order.
all_variants = allele2cdhit_rev[allele2cdhit[allele_id]]
# The pickle maps locus -> allele; it is inverted here, so despite its name
# `locus2allele` is keyed by allele ID and returns a locus tag.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('%s/allele_matrix/locus2allele.p'%directory, 'rb') as handle:
    locus2allele = {y:x for x,y in pickle.load(handle).items()}

rows = {}
# Parsed .ffn content per genome, so each file is read and parsed only once.
parsed_ffn = {}
# The loop variable is deliberately not called `allele_id`. Reusing that name
# overwrote the cell's own input and made a second run use a different query.
for variant_id in all_variants:
    locus_tag = locus2allele[variant_id]
    # Genome ID = locus tag without its last '_'-separated field.
    gid = '_'.join(locus_tag.split('_')[:-1])
    if gid not in parsed_ffn:
        ffn_file = '/home/yara/Documents/cystic_fibrosis/data/pseudomonas/prokka/%s/%s.ffn'%(gid, gid)
        with open(ffn_file, 'r') as f:
            string = f.read()
        # Key: first space-separated token of the header; value: all sequence
        # lines of the record joined together.
        parsed_ffn[gid] = {x.split('\n')[0].replace('>','').split(' ')[0]:''.join(x.split('\n')[1:]) for x in string.split('\n>')}
    gene_seqs = parsed_ffn[gid]
    rows[variant_id] = gene_seqs[locus_tag]

In [50]:
# Inspect one row of `annotated` as a dict. `index` is whatever value the last
# executed loop left behind, so the row shown depends on execution order.
annotated.loc[index].to_dict()

{'Allele ID': 'Allele_16226',
 'Nucleotide_sequence': 'ATGAAAACGACGCAGTACGTGGCCCGCCAGCCCGACGACAACGGTTTCATCCACTATCCGGAAACCGAGCACCAGGTCTGGAATACCCTGATCACCCGGCAACTGAAGGTGATCGAAGGCCGCGCCTGTCAGGAATACCTCGACGGCATCGAACAGCTCGGCCTGCCCCACGAGCGGATCCCCCAGCTCGACGAGATCAACAGGGTTCTCCAGGCCACCACCGGCTGGCGCGTGGCGCGGGTTCCGGCGCTGATTCCGTTCCAGACCTTCTTCGAACTGCTGGCCAGCCAGCAATTCCCCGTCGCCACCTTCATCCGCACCCCGGAAGAACTGGACTACCTGCAGGAGCCGGACATCTTCCACGAGATCTTCGGCCACTGCCCACTGCTGACCAACCCCTGGTTCGCCGAGTTCACCCATACCTACGGCAAGCTCGGCCTCAAGGCGAGCAAGGAGGAACGCGTGTTCCTCGCCCGCCTGTACTGGATGACCATCGAGTTCGGCCTGGTCGAGACCGACCAGGGCAAGCGCATCTACGGCGGCGGCATCCTCTCCTCGCCGAAGGAGACCGTCTACAGCCTCTCCGACGAGCCGCTGCACCAGGCCTTCAATCCGCTGGAGGCGATGCGCACGCCCTACCGCATCGACATCCTGCAACCGCTCTATTTCGTCCTGCCCGACCTCAAGCGCCTGTTCCAACTGGCCCAGGAAGACATCATGGCGCTGGTCCACGAGGCCATGCGCCTGGGCCTGCACGCGCCGCTGTTCCCGCCCAAGCAGGCGGCCTGA',
 'Ref locus_tag': 'PA0872',
 'SNP -n': 'G771A',
 'SNP -p': '',
 'allele_nucleotide_sequence': 'ATGAAAACGACGCAGTACGTGGCCCGCCAGCCCGACGACAACGGTTTCATCCACTATC

In [None]:
# Gene names that have not been matched yet
# (presumably absent from `annotated` -- TODO confirm).
{'wspF', 'pelA', 'algU', 'pilQ'} # still missing

In [113]:
# Print every value of the 'gene name' column of `annotated` as one flat list
# (repeated names and empty strings are printed as they are).
print(list(annotated['gene name']))

['dadA1', 'hisF2', 'fliF', 'infB', 'mrcA', 'htpG', 'algG', 'lap', 'atpA', 'fliI', 'secY', 'metZ', 'PA2181', 'hisC1', 'PLES_57141', 'macB', 'tuf1', 'dsbD2', 'proS', 'bphP', 'speE2', 'speE1', 'pfeA', 'pilH', 'aprE', 'ampR', 'cobB2', 'pqqF', 'pqqF', 'cynS', 'cheB3', 'tgpA', 'mucA', 'PA0329', 'malP', 'lgrD_2', 'codA', 'algX', 'atpA', 'yejF_1', '', 'aat', 'btuF_1', 'hutI', '', 'ttuE', 'pikAV', 'paaH_1', 'metZ', 'hisC_3', 'guaD_3', 'amtB_2', 'dadA1_3', 'cynS', 'proY_2', 'nicT', 'sdaA', 'sdaA', 'rfbD', '', 'betA_1', 'acnM', 'dltA_3', 'lhgD_1', 'acoD', 'acoD', 'aroF_1', '', 'coaBC', 'bdhA', '', 'mmgC_1', 'ilvI', 'speE_2', 'pvdI', 'algG', 'cupB2', 'fliI', 'phuT', 'pchD', 'pilH', 'pvdD', 'pscD', 'pchE', 'hsiH3', 'mucA', 'pscD', 'PA4541', 'hisF2', 'clpV2', 'pvdL', 'PLES_19111', 'pvdD']


In [114]:
# Load the pickled BLAST hits. The with-block closes the file handle, which
# was previously left open.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/pangenome/BR_results/blast_hits.p', 'rb') as handle:
    blast_hits = pickle.load(handle)

In [118]:
def _load_gene_level_mutations(blank_missing):
    """Read indel_list.xlsx and keep only the rows tied to a gene.

    The first five rows of the sheet are skipped and the 'Unnamed: 0' column is
    removed. Rows are dropped when 'Type of mutation' is 'Intergenic Deletion'
    or 'Intergenic Insertion', or when 'Gene locus tag' is the literal text
    'Gene locus tag' or an empty string.

    With ``blank_missing=True`` missing cells are replaced by '' before the
    filtering. Otherwise they stay NaN, so rows with a missing locus tag do not
    equal '' and are kept.
    """
    table = pd.read_excel('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/indel_list.xlsx', skiprows = 5)
    if blank_missing:
        table = table.fillna('')
    del table['Unnamed: 0']
    intergenic = table['Type of mutation'].isin(['Intergenic Deletion', 'Intergenic Insertion'])
    unusable_tag = table['Gene locus tag'].isin(['Gene locus tag', ''])
    return table.drop(table.index[intergenic | unusable_tag])


# NOTE(review): both tables are read from the same file (indel_list.xlsx) and
# differ only in whether missing cells are blanked; confirm that SNPs_FT was
# not meant to come from a separate SNP list.
SNPs_FT = _load_gene_level_mutations(blank_missing = True)
indel_FT = _load_gene_level_mutations(blank_missing = False)

In [117]:
# Number of entries in the loaded blast_hits object.
len(blast_hits)

908

In [35]:
# Register every allele of the literature match table that has no reference
# yet under the label 'literature'; entries already in found_ref are kept.
# (found_ref itself has to exist in the kernel before this cell runs.)
matches_df = pd.read_csv('/home/yara/Documents/cystic_fibrosis/data/pseudomonas/functional_annotation/literature_matches_df.csv', index_col = ['Unnamed: 0'])
unreferenced = set(matches_df['Allele ID']).difference(found_ref)
found_ref.update(dict.fromkeys(unreferenced, 'literature'))

In [12]:
# Rows of selected_features_final for the alleles that have a reference,
# restricted to the columns in `cols`.
# NOTE(review): `cols` and `found_ref` must already exist in the kernel; `cols`
# is assigned in other cells, so this cell depends on execution order.
found_SF = selected_features_final.set_index('Allele ID').loc[list(found_ref.keys()) ][cols]
# Is wspF among the genes of the alleles that have a reference?
'wspF' in set(found_SF['gene'])

False

In [15]:
# Columns to display for the alleles that still have no reference.
cols = ['score_x', 'gene', 'product','gene name (eggnog)','kegg', 'bigg']
# Drop every allele listed in found_ref and show what is left
# (found_ref has to exist in the kernel before this cell runs).
selected_features_final.set_index('Allele ID').drop( list(found_ref.keys()) )[cols]

Unnamed: 0_level_0,score_x,gene,product,gene name (eggnog),kegg,bigg
Allele ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Allele_253547,-121.348162,gltR_1,HTH-type transcriptional regulator GltR,,,
Allele_95110,-91.249517,,hypothetical protein,CTI,,
Allele_29692,-87.392861,sspA_1,Stringent starvation protein A,,K00799,
Allele_257147,66.546706,,hypothetical protein,,,
Allele_17770,-65.041182,nicP_6,Porin-like protein NicP,,,
Allele_251891,-64.804302,sasA_16,Adaptive-response sensory-kinase SasA,,,
Allele_162869,61.666037,,hypothetical protein,,,
Allele_289377,-54.053481,ttgE,Toluene efflux pump membrane transporter TtgE,,K18303,
Allele_299933,-51.987558,swrC,Swarming motility protein SwrC,,,
Allele_258244,49.043032,secF_2,Protein-export membrane protein SecF,,K07003,


In [None]:
# assemble the matches together for now and examine the distribution of mutations 



In [None]:
# comparing to a functional reference
# what if I were to compare with other alleles of the same gene family (?), how would I go about doing that?
# it's really not easy to interpret the results ...

# For every selected feature with a SwissProt ID, align the allele against the
# SwissProt sequence and print the number of SNPs, deletions and insertions.
# (selected_features_2, swissprot_seqs and allele_seqs have to exist in the
# kernel before this cell runs.)
with_swissprot = selected_features_2.loc[selected_features_2['SwissPROT'] != '']
for index in with_swissprot.index:

    swiss_id = selected_features_2.loc[index, 'SwissPROT']
    allele_id = selected_features_2.loc[index, 'allele_id']

    seqs = {}
    seqs[swiss_id] = swissprot_seqs[swiss_id]
    seqs[allele_id] = allele_seqs[allele_id]

    SNPs, deletions, insertions = get_alignment_summary(swiss_id, allele_id, seqs)

    print(*map(len, (SNPs, deletions, insertions)))