In [224]:
import pandas as pd
import numpy as np

In [35]:
# Load the whitespace-delimited BLAST output into a dataframe, one row per hit.
# Column names, listed in the order the fields appear on each line.
BLAST_COLUMNS = ['qseqid', 'qlen', 'qseq', 'sseqid', 'length', 'sseq', 'pident', 'qcovhsp']
with open('serotyped_blastdb.output') as blast_fh:
    # Every value stays a string here; numeric columns are cast where they are used.
    blast_data = [
        {column: tokens[position] for position, column in enumerate(BLAST_COLUMNS)}
        for tokens in (raw_line.strip().split() for raw_line in blast_fh)
    ]
blast_df = pd.DataFrame(blast_data)
# Strain metadata: keep only the genome barcode and its serotype tag
strains_df = pd.read_json('strains.json')[['assembly_barcode', 'serotype']]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87627 entries, 0 to 87626
Data columns (total 8 columns):
length     87627 non-null object
pident     87627 non-null object
qcovhsp    87627 non-null object
qlen       87627 non-null object
qseq       87627 non-null object
qseqid     87627 non-null object
sseq       87627 non-null object
sseqid     87627 non-null object
dtypes: object(8)
memory usage: 5.3+ MB


In [44]:
# Genome barcode = the part of the subject id that precedes the first '|'
blast_df['assembly_barcode'] = blast_df['sseqid'].str.split('|').str[0]
# Score of each hit: coverage x identity, divided by 10000 so that a hit with
# qcovhsp == 100 and pident == 100 scores exactly 1
coverage_s = blast_df['qcovhsp'].astype(float)
identity_s = blast_df['pident'].astype(float)
blast_df['score'] = coverage_s * identity_s / 10000
# Number of distinct genomes that received at least one hit
len(blast_df.drop_duplicates('assembly_barcode'))

5796

In [72]:
# Inspect serotype tags
## Regex: an 'O' / 'H' that is not preceded by a capital letter, followed by
## 1-3 digits (captured) and no further digit.
## Raw strings are required: '\d' in a normal string literal is an invalid
## escape sequence and triggers a warning on recent Python versions.
regex_str_O = r'(?<![A-Z])O(\d{1,3})(?!\d)'
regex_str_H = r'(?<![A-Z])H(\d{1,3})(?!\d)'
## Extract the antigen numbers from the strain metadata (NaN when absent)
O_serotype_s = strains_df['serotype'].str.extract(regex_str_O, expand=False)
H_serotype_s = strains_df['serotype'].str.extract(regex_str_H, expand=False)
## Strains whose serotype tag yields neither an O nor an H number
invalid_serotype_s = strains_df[O_serotype_s.isnull() & H_serotype_s.isnull()]
serotype_df = strains_df.copy()
serotype_df['genome_serotype_O'] = O_serotype_s
serotype_df['genome_serotype_H'] = H_serotype_s
# Merge with blast output (left join: every blast hit is kept)
merged_df = blast_df.merge(serotype_df, on='assembly_barcode', how='left')
# Get serotype of allele, parsed from the query (allele) id with the same regexes
merged_df['allele_serotype_O'] = merged_df['qseqid'].str.extract(regex_str_O, expand=False)
merged_df['allele_serotype_H'] = merged_df['qseqid'].str.extract(regex_str_H, expand=False)
# Inspect the hits missing both allele serotypes (all of them have a novel serotype tag)
merged_df[merged_df['allele_serotype_O'].isnull() & merged_df['allele_serotype_H'].isnull()]

Unnamed: 0,length,pident,qcovhsp,qlen,qseq,qseqid,sseq,sseqid,assembly_barcode,score,serotype,genome_serotype_O,genome_serotype_H,allele_serotype_O,allele_serotype_H
56416,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_HA8399AA_AS|O101:H37|NODE_53_length_16111_...,ESC_HA8399AA_AS,1.000000,O101:H37,101,37,,
56417,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_IA7098AA_AS|O101:H9|NODE_57_length_13084_c...,ESC_IA7098AA_AS,1.000000,O101:H9,101,9,,
56418,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_NA9415AA_AS|O9|NODE_45_length_16112_cov_31...,ESC_NA9415AA_AS,1.000000,O9,9,,,
56419,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_IA7147AA_AS|O101:H9|NODE_59_length_13076_c...,ESC_IA7147AA_AS,1.000000,O101:H9,101,9,,
56420,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_HA8003AA_AS|O9:H12|NODE_48_length_20433_co...,ESC_HA8003AA_AS,1.000000,O9:H12,9,12,,
56421,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_MA3602AA_AS|O89|NODE_51_length_13087_cov_5...,ESC_MA3602AA_AS,1.000000,O89,89,,,
56422,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_HA7913AA_AS|O28|NODE_47_length_10147_cov_2...,ESC_HA7913AA_AS,1.000000,O28,28,,,
56423,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_HA8990AA_AS|O101|NODE_56_length_15991_cov_...,ESC_HA8990AA_AS,1.000000,O101,101,,,
56424,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_IA1271AA_AS|O162|NODE_70_length_11790_cov_...,ESC_IA1271AA_AS,1.000000,O162,162,,,
56425,753,100.00,100,753,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,6__wzm__wzm-Onovel32__523,ATGTCTCATTTGATTGACTTAAAAGATGTGGGTGTCGAATTTCCCA...,ESC_HA8768AA_AS|O135|NODE_64_length_9523_cov_1...,ESC_HA8768AA_AS,1.000000,O135,135,,,


In [60]:
# Get gene of the allele: the first GENE_LIST entry that occurs in the query id
GENE_LIST = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
gene_pattern = '({})'.format('|'.join(GENE_LIST))
merged_df['allele_gene'] = merged_df['qseqid'].str.extract(gene_pattern, expand=False)

# Blacklist genomes that have a hit with score 1 whose allele serotype
# disagrees with the genome serotype (compared only when both are known)
perfect_matches_s = merged_df['score'] == 1
allele_O, genome_O = merged_df['allele_serotype_O'], merged_df['genome_serotype_O']
allele_H, genome_H = merged_df['allele_serotype_H'], merged_df['genome_serotype_H']
O_mismatches_s = allele_O.notnull() & genome_O.notnull() & (allele_O != genome_O)
H_mismatches_s = allele_H.notnull() & genome_H.notnull() & (allele_H != genome_H)
USEFUL_COLUMNS = ['assembly_barcode', 'qseqid', 'score', 'allele_serotype_O',
                  'genome_serotype_O',  'allele_serotype_H', 'genome_serotype_H']
conflicting_hits_s = perfect_matches_s & (O_mismatches_s | H_mismatches_s)
# One row per offending genome (first conflicting hit is kept)
blacklist_df = merged_df[conflicting_hits_s].drop_duplicates('assembly_barcode')
blacklist = blacklist_df['assembly_barcode'].tolist()
# Remove blacklist genomes from result
filtered_df = merged_df[~merged_df['assembly_barcode'].isin(blacklist)]
# Get number of remaining genomes
len(filtered_df.drop_duplicates('assembly_barcode'))

5307

In [191]:
# Extract alleles
## Get all the hits whose allele serotype equals the genome serotype
O_matches_s = (
    filtered_df['allele_serotype_O'].notnull()
    & filtered_df['genome_serotype_O'].notnull()
    & (filtered_df['allele_serotype_O'] == filtered_df['genome_serotype_O'])
)
H_matches_s = (
    filtered_df['allele_serotype_H'].notnull()
    & filtered_df['genome_serotype_H'].notnull()
    & (filtered_df['allele_serotype_H'] == filtered_df['genome_serotype_H'])
)
## From the filtered alleles, keep the imperfect hits only: pident strictly below 100.
## (Author's note: blast already filtered out hits below 97% identity; that
## threshold is not enforced anywhere in this notebook.)
print("Remove perfect matches")
print("Before: %d seqs" % filtered_df.shape[0])
similar_s = filtered_df['pident'].astype(float) < 100
similar_df = filtered_df[(O_matches_s | H_matches_s) & similar_s]
# Remove duplicated sequences (the first hit carrying each subject sequence is kept)
print("Remove duplicates")
print("Before: %d seqs" % similar_df.shape[0])
similar_nodup_df = similar_df[~similar_df.duplicated('sseq')]
print("After: %d seqs" % similar_nodup_df.shape[0])
# Consider gene pair
## Within a genome, an allele of a paired gene is removed when the genome has
## no allele of the partner gene. Genes absent from GENE_PAIRS are never removed.
GENE_PAIRS = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'}
print("Remove unpaired alleles")
print("Before: %d seqs" % similar_nodup_df.shape[0])
# Collect the row labels first and drop once afterwards, so the frame is not
# mutated while its groupby is being iterated.
unpaired_index = []
for _, genome_hits in similar_nodup_df.groupby('assembly_barcode'):
    genes_present = set(genome_hits['allele_gene'])
    for row_label, gene in genome_hits['allele_gene'].items():
        partner = GENE_PAIRS.get(gene)
        if partner is not None and partner not in genes_present:
            unpaired_index.append(row_label)
# Previously this started from the undefined name `matches_nodup_df`
paired_df = similar_nodup_df.drop(unpaired_index)
print("After: %d seqs" % paired_df.shape[0])

Remove perfect matches
Before: 84092 seqs
Remove duplicates
Before: 46345 seqs
After: 1514 seqs
Remove unpaired alleles
Before: 1514 seqs
After: 868 seqs


In [202]:
# Load the EcOH alleles from FASTA: one row per record with its name,
# sequence (as a plain string) and full description line
from Bio import SeqIO
with open('EcOH.fasta') as fasta_fh:
    EcOH = [
        {'name': fasta_record.name,
         'seq': str(fasta_record.seq),
         'desc': fasta_record.description}
        for fasta_record in SeqIO.parse(fasta_fh, 'fasta')
    ]
EcOH_df = pd.DataFrame(EcOH)
EcOH_df

Unnamed: 0,desc,name,seq
0,1__fliC__fliC-H1__1 AB028471.1;flagellin;H1,1__fliC__fliC-H1__1,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
1,1__fliC__fliC-H1__2 L07387.1;flagellin;H1,1__fliC__fliC-H1__2,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
2,1__fliC__fliC-H10__3 AY337482.1;flagellin;H10,1__fliC__fliC-H10__3,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
3,1__fliC__fliC-H10__4 AY249995.1;flagellin;H10,1__fliC__fliC-H10__4,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
4,1__fliC__fliC-H11__5 AY337465.1;flagellin;H11,1__fliC__fliC-H11__5,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...
5,1__fliC__fliC-H11__6 AY337472.1;flagellin;H11,1__fliC__fliC-H11__6,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...
6,1__fliC__fliC-H11__7 AY249996.1;flagellin;H11,1__fliC__fliC-H11__7,AACAAATCTCAGTCTTCTCTGAGCTCCGCCATTGAACGTCTCTCTT...
7,1__fliC__fliC-H12__8 AY337471.1;flagellin;H12,1__fliC__fliC-H12__8,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
8,1__fliC__fliC-H12__9 AY337474.1;flagellin;H12,1__fliC__fliC-H12__9,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
9,1__fliC__fliC-H12__10 AY249997.1;flagellin;H12,1__fliC__fliC-H12__10,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...


In [290]:
# Parse gene and serotype of every EcOH allele from its description
## Regex: 'O' or 'H' not preceded by a capital letter, then 1-3 digits, no further digit.
## - raw string, so '\d' is not an invalid string escape
## - [OH] instead of [O|H]: inside a character class '|' is a literal pipe,
##   so the old pattern also accepted e.g. '|12' as a serotype
regex_str = r'(?<![A-Z])([OH]\d{1,3})(?!\d)'
## Extract serotype (NaN when the description carries no O/H number)
serotype_s = EcOH_df['desc'].str.extract(regex_str, expand=False)

## Extract gene
GENE_LIST = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
gene_s = EcOH_df['desc'].str.extract('('+'|'.join(GENE_LIST)+')', expand=False)

## Build the allele table from a fresh copy, so the cell does not depend on an
## `allele_df` left over from an earlier run
allele_df = EcOH_df.copy()
allele_df['serotype'] = serotype_s
allele_df['gene'] = gene_s
allele_df = allele_df.drop('desc', axis=1)

## Inspect invalid alleles (author's observation: all of them are 'novel' alleles)
invalid_serotype_s = allele_df[allele_df['serotype'].isnull()]
allele_df.count()

name        665
seq         665
serotype    597
gene        665
dtype: int64

In [291]:
# Build a table of the newly found alleles in the same layout as allele_df,
# then merge the two.
# Clean results: keep the subject sequence and the gene of each paired hit
new_allele_df = paired_df[['sseq', 'allele_gene']].copy()
new_allele_df.columns = ['seq', 'gene']
# Record which genome the allele was taken from
new_allele_df['desc'] = 'from_' + paired_df['assembly_barcode']
# Serotype label: 'O<n>' when the allele has an O number, otherwise 'H<n>'
new_serotype_s = ('O' + paired_df['allele_serotype_O']).fillna('H' + paired_df['allele_serotype_H'])
# Running number (1, 2, ...) of each allele within its serotype, in row order.
# Replaces the per-row DataFrame.set_value loop (set_value no longer exists in pandas).
# Rows without a serotype keep num == 0.
running_num_s = new_allele_df.assign(serotype=new_serotype_s).groupby('serotype').cumcount() + 1
new_allele_df['num'] = running_num_s.where(new_serotype_s.notnull(), 0).astype(int)
new_allele_df['serotype'] = new_serotype_s
new_allele_df['name'] = new_allele_df['serotype']+'-'+new_allele_df['num'].astype(str)+'-'+new_allele_df['gene']
# Merge (outer join on the shared columns name/seq/serotype/gene); the result is
# only displayed, not stored
allele_df.merge(new_allele_df, how='outer')

Unnamed: 0,name,seq,serotype,gene,desc,num
0,1__fliC__fliC-H1__1,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC,,
1,1__fliC__fliC-H1__2,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC,,
2,1__fliC__fliC-H10__3,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC,,
3,1__fliC__fliC-H10__4,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC,,
4,1__fliC__fliC-H11__5,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...,H11,fliC,,
5,1__fliC__fliC-H11__6,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...,H11,fliC,,
6,1__fliC__fliC-H11__7,AACAAATCTCAGTCTTCTCTGAGCTCCGCCATTGAACGTCTCTCTT...,H11,fliC,,
7,1__fliC__fliC-H12__8,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,,
8,1__fliC__fliC-H12__9,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,,
9,1__fliC__fliC-H12__10,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,,


In [303]:
# Write to ectyper_data.fasta and ectyper_dict.json
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# NOTE: ectyper_dict is created but never filled, and no JSON file is written
# by this cell yet (TODO).
ectyper_dict = defaultdict(list)
# One FASTA record per allele in allele_df.
# - No alphabet argument: Bio.Alphabet is gone from current Biopython and a plain
#   Seq(str) works on every version.
# - description='' instead of None, so the FASTA writer receives a string and
#   emits a bare '>id' header.
# - The leftover `break` that stopped after the first allele has been removed.
ectyper_records = [
    SeqRecord(Seq(allele_seq), id=allele_name, description='')
    for allele_name, allele_seq in zip(allele_df['name'], allele_df['seq'])
]
with open('ectyper_data.fasta', 'w') as fh:
    SeqIO.write(ectyper_records, fh, 'fasta')