In [97]:
import pandas as pd
import numpy as np

In [98]:
# Create dataframe from blast output.
# Every non-blank line is split on whitespace and its first eight fields are
# stored, as strings, under the names listed in BLAST_COLUMNS (in that order).
BLAST_COLUMNS = ['qseqid', 'qlen', 'qseq', 'sseqid', 'length', 'sseq', 'pident', 'qcovhsp']
blast_data = []
with open('serotyped_blastdb.output') as fh:
    for line_number, line in enumerate(fh, start=1):
        fields = line.split()
        if not fields:
            # A blank line (e.g. a trailing newline) carries no hit; skip it
            # instead of failing with an IndexError.
            continue
        if len(fields) < len(BLAST_COLUMNS):
            # Fail with the offending line number rather than a bare IndexError.
            raise ValueError(
                "serotyped_blastdb.output line %d: expected %d fields, found %d"
                % (line_number, len(BLAST_COLUMNS), len(fields))
            )
        # zip() stops at the shorter input, so any extra trailing fields are ignored.
        blast_data.append(dict(zip(BLAST_COLUMNS, fields)))
# Passing `columns` keeps all expected columns present even if no hit was read.
blast_df = pd.DataFrame(blast_data, columns=BLAST_COLUMNS)
# Read strains metadata (only the barcode and serotype columns are used)
strains_df = pd.read_json('strains.json')[['assembly_barcode', 'serotype']]

In [99]:
# The genome barcode is the first '|'-separated token of the subject sequence id
sseqid_tokens_s = blast_df['sseqid'].str.split('|')
blast_df['assembly_barcode'] = sseqid_tokens_s.str[0]
# Score of each hit: qcovhsp times pident, divided by 10000
# (presumably both are percentages, so a full-length identical hit scores 1 -- confirm)
coverage_s = blast_df['qcovhsp'].astype(float)
identity_s = blast_df['pident'].astype(float)
blast_df['score'] = coverage_s * identity_s / 10000
# Count the distinct genomes that have at least one hit
unique_genome_hits_df = blast_df.drop_duplicates('assembly_barcode')
print("Number of genomes: %d" % len(unique_genome_hits_df))

Number of genomes: 5796


In [100]:
# Inspect serotype tags
## Regex: an 'O' (or 'H') that is not preceded by an uppercase letter, followed
## by one to three digits (the captured group) and then no further digit.
## Raw strings are used so that '\d' is not parsed as an (invalid) string escape.
regex_str_O = r'(?<![A-Z])O(\d{1,3})(?!\d)'
regex_str_H = r'(?<![A-Z])H(\d{1,3})(?!\d)'
## Extract the O and H numbers from each strain's serotype string
O_serotype_s = strains_df['serotype'].str.extract(regex_str_O, expand=False)
H_serotype_s = strains_df['serotype'].str.extract(regex_str_H, expand=False)
## Strains for which neither an O nor an H number could be extracted
invalid_serotype_s = strains_df[O_serotype_s.isnull() & H_serotype_s.isnull()]
serotype_df = strains_df.copy()
serotype_df['genome_serotype_O'] = O_serotype_s
serotype_df['genome_serotype_H'] = H_serotype_s
# Merge with blast output (left join: every blast hit is kept)
merged_df = blast_df.merge(serotype_df, on='assembly_barcode', how='left')
# Get serotype of allele from the query sequence id
merged_df['allele_serotype_O'] = merged_df['qseqid'].str.extract(regex_str_O, expand=False)
merged_df['allele_serotype_H'] = merged_df['qseqid'].str.extract(regex_str_H, expand=False)
# Hits whose allele has neither an O nor an H number, kept in a variable so they
# can be inspected (original author's note: all of them have a novel serotype tag)
untyped_allele_hits_df = merged_df[
    merged_df['allele_serotype_O'].isnull() & merged_df['allele_serotype_H'].isnull()
]
# Genome remaining
print("Number of genomes: %d" % merged_df[~merged_df.duplicated('assembly_barcode')].shape[0])
print("Number of sequences: %d" % merged_df.shape[0])

Number of genomes: 5796
Number of sequences: 87627


In [101]:
# Annotate every hit with the gene named in its query sequence id
GENE_LIST = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
gene_pattern = '(' + '|'.join(GENE_LIST) + ')'
merged_df['allele_gene'] = merged_df['qseqid'].str.extract(gene_pattern, expand=False)

# Blacklist the genomes that have a hit with score 1 whose allele serotype
# disagrees with the genome serotype (only where both serotypes are known)
perfect_matches_s = merged_df['score'] == 1
allele_O_s = merged_df['allele_serotype_O']
genome_O_s = merged_df['genome_serotype_O']
O_mismatches_s = allele_O_s.notnull() & genome_O_s.notnull() & (allele_O_s != genome_O_s)
allele_H_s = merged_df['allele_serotype_H']
genome_H_s = merged_df['genome_serotype_H']
H_mismatches_s = allele_H_s.notnull() & genome_H_s.notnull() & (allele_H_s != genome_H_s)
USEFUL_COLUMNS = [
    'assembly_barcode', 'qseqid', 'score',
    'allele_serotype_O', 'genome_serotype_O',
    'allele_serotype_H', 'genome_serotype_H',
]
conflicting_hits_s = perfect_matches_s & (O_mismatches_s | H_mismatches_s)
# One row per offending genome (the first conflicting hit is kept)
blacklist_df = merged_df[conflicting_hits_s].drop_duplicates('assembly_barcode')
blacklist = blacklist_df['assembly_barcode'].tolist()
print("%d genomes added to blacklist" % len(blacklist))

# Drop every hit that belongs to a blacklisted genome
print("Remove blacklist results")
print("Before: %d seqs" % len(merged_df))
not_blacklisted_s = ~merged_df['assembly_barcode'].isin(blacklist)
filtered_df = merged_df[not_blacklisted_s]
print("After: %d seqs" % len(filtered_df))
# Number of genomes that remain after the blacklist filter
print("Number of genomes: %d" % len(filtered_df.drop_duplicates('assembly_barcode')))

489 genomes added to blacklist
Remove blacklist results
Before: 87627 seqs
After: 84092 seqs
Number of genomes: 5307


In [104]:
# Save the blacklisted genome barcodes as a one-column CSV ('genome_name'), without the index
(
    pd.DataFrame({'genome_name': blacklist})
    .to_csv('blacklist.csv', index=False)
)

In [102]:
# Extract alleles
def drop_unpaired_alleles(hits_df, gene_pairs,
                          genome_col='assembly_barcode', gene_col='allele_gene'):
    """Return the rows of `hits_df` whose paired-gene partner is present.

    A row is dropped when its gene is a key of `gene_pairs` and the same
    genome has no row for the partner gene anywhere in `hits_df`.  Rows whose
    gene is not a key of `gene_pairs` (including missing genes) and rows
    without a genome value are always kept.

    Parameters:
        hits_df: DataFrame with at least the columns `genome_col` and `gene_col`.
        gene_pairs: dict mapping a gene name to the name of its partner gene.
        genome_col: name of the column that identifies the genome.
        gene_col: name of the column that holds the gene of each hit.

    Returns:
        A filtered DataFrame in the original row order; `hits_df` itself is
        not modified.
    """
    # Partner gene required by each row (NaN when the gene has no partner)
    partner_s = hits_df[gene_col].map(gene_pairs)
    # Every (genome, gene) combination that occurs among the hits
    present_pairs = set(zip(hits_df[genome_col], hits_df[gene_col]))
    partner_found_s = pd.Series(
        [pair in present_pairs for pair in zip(hits_df[genome_col], partner_s)],
        index=hits_df.index,
        dtype=bool,
    )
    keep_s = partner_s.isnull() | partner_found_s | hits_df[genome_col].isnull()
    return hits_df[keep_s]


## Get all the hits whose allele serotype equals the genome serotype
O_matches_s = (
    filtered_df['allele_serotype_O'].notnull()
    & filtered_df['genome_serotype_O'].notnull()
    & (filtered_df['allele_serotype_O'] == filtered_df['genome_serotype_O'])
)
H_matches_s = (
    filtered_df['allele_serotype_H'].notnull()
    & filtered_df['genome_serotype_H'].notnull()
    & (filtered_df['allele_serotype_H'] == filtered_df['genome_serotype_H'])
)
## From the matching hits, keep the ones with pident below 100
## (original author's note: blast already filters out identities below 97)
print("Remove perfect matches")
print("Before: %d seqs" % filtered_df.shape[0])
similar_s = filtered_df['pident'].astype(float) < 100
similar_df = filtered_df[(O_matches_s | H_matches_s) & similar_s]
print("After: %d seqs" % similar_df.shape[0])
# Consider gene pair
## If a genome has a hit for only one gene of a pair, remove that unpaired hit.
## This is done in one vectorised pass instead of dropping rows one by one
## from the frame that is being iterated over.
GENE_PAIRS = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'}
print()
print("Remove unpaired alleles")
print("Before: %d seqs" % similar_df.shape[0])
paired_df = drop_unpaired_alleles(similar_df, GENE_PAIRS)
print("After: %d seqs" % paired_df.shape[0])
# Remove duplicate sequences (the first hit of each distinct subject sequence is kept)
print("Remove duplicates")
print("Before: %d seqs" % paired_df.shape[0])
paired_df = paired_df[~paired_df.duplicated('sseq')]
print("After: %d seqs" % paired_df.shape[0])
print("Number of genomes: %d" % paired_df[~paired_df.duplicated('assembly_barcode')].shape[0])

Remove perfect matches
Before: 84092 seqs
After: 46345 seqs

Remove unpaired alleles
Before: 46345 seqs
After: 44176 seqs
Remove duplicates
Before: 44176 seqs
After: 1254 seqs
Number of genomes: 648


In [20]:
# Read EcOH data: one row per FASTA record with its name, sequence and description
from Bio import SeqIO
EcOH = []
with open('EcOH.fasta') as fasta_handle:
    EcOH.extend(
        {
            'name': fasta_record.name,
            'seq': str(fasta_record.seq),
            'desc': fasta_record.description,
        }
        for fasta_record in SeqIO.parse(fasta_handle, 'fasta')
    )
EcOH_df = pd.DataFrame(EcOH)
EcOH_df

Unnamed: 0,desc,name,seq
0,1__fliC__fliC-H1__1 AB028471.1;flagellin;H1,1__fliC__fliC-H1__1,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
1,1__fliC__fliC-H1__2 L07387.1;flagellin;H1,1__fliC__fliC-H1__2,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
2,1__fliC__fliC-H10__3 AY337482.1;flagellin;H10,1__fliC__fliC-H10__3,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
3,1__fliC__fliC-H10__4 AY249995.1;flagellin;H10,1__fliC__fliC-H10__4,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
4,1__fliC__fliC-H11__5 AY337465.1;flagellin;H11,1__fliC__fliC-H11__5,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...
5,1__fliC__fliC-H11__6 AY337472.1;flagellin;H11,1__fliC__fliC-H11__6,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...
6,1__fliC__fliC-H11__7 AY249996.1;flagellin;H11,1__fliC__fliC-H11__7,AACAAATCTCAGTCTTCTCTGAGCTCCGCCATTGAACGTCTCTCTT...
7,1__fliC__fliC-H12__8 AY337471.1;flagellin;H12,1__fliC__fliC-H12__8,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
8,1__fliC__fliC-H12__9 AY337474.1;flagellin;H12,1__fliC__fliC-H12__9,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...
9,1__fliC__fliC-H12__10 AY249997.1;flagellin;H12,1__fliC__fliC-H12__10,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...


In [65]:
# Parse gene and serotype
## Regex: 'O' or 'H' that is not preceded by an uppercase letter, followed by
## one to three digits and then no further digit; letter and digits are captured.
## The class is '[OH]', not '[O|H]': inside a character class '|' is a literal
## character, so the old pattern also accepted '|' in front of the digits.
## A raw string keeps '\d' from being parsed as an (invalid) string escape.
regex_str = r'(?<![A-Z])([OH]\d{1,3})(?!\d)'
## Extract serotype from the record description
serotype_s = EcOH_df['desc'].str.extract(regex_str, expand=False)

## Inspect invalid alleles -> All 'novel' allele (original author's note)
invalid_serotype_s = serotype_s[serotype_s.isnull()]
allele_df = EcOH_df.copy()
allele_df['serotype'] = serotype_s

## Remove ones with novel serotype (i.e. no serotype could be extracted)
print("Remove novel serotype alleles")
print("Before: %d seqs" % allele_df.shape[0])
# .copy() so that the column assignments below act on an independent frame
# rather than on a filtered view of the previous one
allele_df = allele_df[allele_df['serotype'].notnull()].copy()
print("After: %d seqs" % allele_df.shape[0])

## Extract gene from the description of the remaining alleles
GENE_LIST = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
allele_df['gene'] = allele_df['desc'].str.extract('(' + '|'.join(GENE_LIST) + ')', expand=False)

## Capitalize all sequences for consistency
allele_df['seq'] = allele_df['seq'].str.upper()

allele_df.head()

Remove novel serotype alleles
Before: 665 seqs
After: 597 seqs


Unnamed: 0,desc,name,seq,serotype,gene
0,1__fliC__fliC-H1__1 AB028471.1;flagellin;H1,1__fliC__fliC-H1__1,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC
1,1__fliC__fliC-H1__2 L07387.1;flagellin;H1,1__fliC__fliC-H1__2,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC
2,1__fliC__fliC-H10__3 AY337482.1;flagellin;H10,1__fliC__fliC-H10__3,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC
3,1__fliC__fliC-H10__4 AY249995.1;flagellin;H10,1__fliC__fliC-H10__4,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC
4,1__fliC__fliC-H11__5 AY337465.1;flagellin;H11,1__fliC__fliC-H11__5,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...,H11,fliC


In [84]:
# Combine the alleles extracted from the genomes (paired_df) with the EcOH
# alleles (allele_df), de-duplicate by sequence and give every allele a name.

# Clean results: keep the needed columns and rename them to match allele_df
new_alleles_df = paired_df[
    ['assembly_barcode', 'sseq', 'allele_serotype_O', 'allele_serotype_H', 'allele_gene']
].copy()
new_alleles_df.columns = ['assembly_barcode', 'seq', 'O_serotype', 'H_serotype', 'gene']
new_alleles_df['desc'] = 'from_' + new_alleles_df['assembly_barcode']
# Create general serotype column: 'O<number>' when an O number is present,
# otherwise 'H<number>' (missing when neither is present)
new_alleles_df['serotype'] = (
    ('O' + new_alleles_df['O_serotype']).fillna('H' + new_alleles_df['H_serotype'])
)
new_alleles_df = new_alleles_df.drop(['assembly_barcode', 'O_serotype', 'H_serotype'], axis=1)
# Remove ones without serotype ('novel' alleles)
# NOTE(review): the "New"/"Old" labels are kept exactly as they were; both lines
# report the number of extracted alleles (before / after this filter), not the
# number of EcOH alleles -- confirm the intended wording.
print("New allele count is %d" % new_alleles_df.shape[0])
new_alleles_df = new_alleles_df[new_alleles_df['serotype'].notnull()]
print("Old allele count is %d" % new_alleles_df.shape[0])
# Merge with EcOH; '_merge' records which side each row came from
combined_df = allele_df.merge(new_alleles_df, how='outer', indicator=True)
print("Total allele count is %d" % combined_df.shape[0])
# Remove duplicated sequences (the first occurrence of each sequence is kept)
print()
print("Remove duplicates")
print("Before: %d seqs" % combined_df.shape[0])
# .copy() so that the column assignments below act on an independent frame
combined_df = combined_df[~combined_df.duplicated('seq')].copy()
print("After: %d seqs" % combined_df.shape[0])

# Number the alleles 1, 2, 3, ... within each serotype, in row order.
# cumcount() replaces the per-row DataFrame.set_value() calls, which are not
# available in current pandas.  Every row has a serotype at this point because
# both inputs were filtered on it above / upstream.
combined_df['num'] = combined_df.groupby('serotype').cumcount() + 1
# Add new name for each sequence: <serotype>-<num>-<gene>
combined_df['name'] = (
    combined_df['serotype'] + '-' + combined_df['num'].astype(str) + '-' + combined_df['gene']
)
### Add 'origin' to EcOH alleles (rows that came only from the left frame)
combined_df['name'] = np.where(
    combined_df['_merge'] == 'left_only',
    combined_df['name'] + '-origin',
    combined_df['name'],
)
combined_df = combined_df.drop('_merge', axis=1)
allele_nodp_df = combined_df
display(allele_nodp_df)

New allele count is 1254
Old allele count is 1254
Total allele count is 1851

Remove duplicates
Before: 1851 seqs
After: 1747 seqs


Unnamed: 0,desc,name,seq,serotype,gene,num
0,1__fliC__fliC-H1__1 AB028471.1;flagellin;H1,H1-1-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC,1
1,1__fliC__fliC-H1__2 L07387.1;flagellin;H1,H1-2-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H1,fliC,2
2,1__fliC__fliC-H10__3 AY337482.1;flagellin;H10,H10-1-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC,1
3,1__fliC__fliC-H10__4 AY249995.1;flagellin;H10,H10-2-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H10,fliC,2
4,1__fliC__fliC-H11__5 AY337465.1;flagellin;H11,H11-1-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...,H11,fliC,1
5,1__fliC__fliC-H11__6 AY337472.1;flagellin;H11,H11-2-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTGTCGCTGTTGACCCAGA...,H11,fliC,2
6,1__fliC__fliC-H11__7 AY249996.1;flagellin;H11,H11-3-fliC-origin,AACAAATCTCAGTCTTCTCTGAGCTCCGCCATTGAACGTCTCTCTT...,H11,fliC,3
7,1__fliC__fliC-H12__8 AY337471.1;flagellin;H12,H12-1-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,1
8,1__fliC__fliC-H12__9 AY337474.1;flagellin;H12,H12-2-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,2
9,1__fliC__fliC-H12__10 AY249997.1;flagellin;H12,H12-3-fliC-origin,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAAA...,H12,fliC,3


In [85]:
# Write to ectyper_data.fasta and ectyper_dict.json
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
import json
# Bio.Alphabet is deliberately not imported: the module is absent from current
# Biopython releases, and the sequences are written the same way without it.
ectyper_records = []
# Maps antigen letter (first character of the serotype) -> allele name -> metadata
ectyper_dict = defaultdict(dict)
for index, row in allele_nodp_df.iterrows():
    antigen = row['serotype'][0]
    ectyper_dict[antigen][row['name']] = {
        'allele': row['serotype'],
        'gene': row['gene'],
        'desc': row['desc']
    }
    # The allele name is used as the record id; the description is left empty
    record = SeqRecord(Seq(row['seq']),
                       id=row['name'],
                       description='')
    ectyper_records.append(record)
with open('ectyper_data.fasta', 'w') as fh:
    SeqIO.write(ectyper_records, fh, 'fasta')
with open('ectyper_dict.json', 'w') as fh:
    json.dump(ectyper_dict, fh, indent=4, separators=(',', ': '))

In [96]:
# Show the alleles whose description contains 'gnd'
allele_nodp_df.loc[
    allele_nodp_df['desc'].str.contains('gnd')
]

Unnamed: 0,desc,name,seq,serotype,gene,num


In [12]:
# Show every blast hit recorded for the genome 'ESC_HA8550AA_AS'
blast_df.loc[
    blast_df['assembly_barcode'].eq('ESC_HA8550AA_AS')
]

Unnamed: 0,length,pident,qcovhsp,qlen,qseq,qseqid,sseq,sseqid,assembly_barcode,score
5010,1038,99.33,99,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,1__fliC__fliC-H4__42,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.983367
5335,1050,99.9,100,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,1__fliC__fliC-H4__43,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.999
5438,1038,98.55,99,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,1__fliC__fliC-H4__44,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.975645
5763,1050,99.9,100,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,1__fliC__fliC-H4__45,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.999
56922,1050,99.81,100,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,fliC_54_AJ605766_H17,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.9981
57124,1050,99.24,100,1050,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGAACACTCAGA...,fliC_50_AJ515904_H17,ATGGCACAAGTCATTAATACCAACAGCCTCTCGCTGATCACTCAGA...,ESC_HA8550AA_AS|O161:H4|NODE_8_length_97842_co...,ESC_HA8550AA_AS,0.9924
