# 3.2. Set membership with Bloom filters

## Set membership

In [None]:
import nbimporter
import Background as utils

# Two antimicrobial resistance (AMR) gene sequences, stored as plain nucleotide
# strings. Both were downloaded from the ResFinder database (accessed 05/2017).
# The label above each sequence is kept exactly as given with the data
# (presumably the ResFinder entry name: gene, variant and accession -- confirm).
## blaIMP-1_1_DQ522237
geneA = "ATGAGCAAGTTATCTGTATTCTTTATATTTTTGTTTTGCAGCATTGCTACCGCAGCAGAGTCTTTGCCAGATTTAAAAATTGAAAAGCTTGATGAAGGCGTTTATGTTCATACTTCGTTTAAAGAAGTTAACGGGTGGGGCGTTGTTCCTAAACATGGTTTGGTGGTTCTTGTAAATGCTGAGGCTTACCTAATTGACACTCCATTTACGGCTAAAGATACTGAAAAGTTAGTCACTTGGTTTGTGGAGCGTGGCTATAAAATAAAAGGCAGCATTTCCTCTCATTTTCATAGCGACAGCACGGGCGGAATAGAGTGGCTTAATTCTCGATCTATCCCCACGTATGCATCTGAATTAACAAATGAACTGCTTAAAAAAGACGGTAAGGTTCAAGCCACAAATTCATTTAGCGGAGTTAACTATTGGCTAGTTAAAAATAAAATTGAAGTTTTTTATCCAGGCCCGGGACACACTCCAGATAACGTAGTGGTTTGGTTGCCTGAAAGGAAAATATTATTCGGTGGTTGTTTTATTAAACCGTACGGTTTAGGCAATTTGGGTGACGCAAATATAGAAGCTTGGCCAAAGTCCGCCAAATTATTAAAGTCCAAATATGGTAAGGCAAAACTGGTTGTTCCAAGTCACAGTGAAGTTGGAGACGCATCACTCTTGAAACTTACATTAGAGCAGGCGGTTAAAGGGTTAAACGAAAGTAAAAAACCATCAAAACCAAGCAACTAA"

## blaIMP-2_1_AJ243491
geneB = "ATGAAGAAATTATTTGTTTTATGTGTATGCTTCCTTTGTAGCATTACTGCCGCGGGAGCGCGTTTGCCTGATTTAAAAATCGAGAAGCTTGAAGAAGGTGTTTATGTTCATACATCGTTCGAAGAAGTTAACGGTTGGGGTGTTGTTTCTAAACACGGTTTGGTGGTTCTTGTAAACACTGACGCCTATCTGATTGACACTCCATTTACTGCTACAGATACTGAAAAGTTAGTCAATTGGTTTGTGGAGCGCGGCTATAAAATCAAAGGCACTATTTCCTCACATTTCCATAGCGACAGCACAGGGGGAATAGAGTGGCTTAATTCTCAATCTATTCCCACGTATGCATCTGAATTAACAAATGAACTTCTTAAAAAAGACGGTAAGGTGCAAGCTAAAAACTCATTTAGCGGAGTTAGTTATTGGCTAGTTAAAAATAAAATTGAAGTTTTTTATCCCGGCCCGGGGCACACTCAAGATAACGTAGTGGTTTGGTTACCTGAAAAGAAAATTTTATTCGGTGGTTGTTTTGTTAAACCGGACGGTCTTGGTAATTTGGGTGACGCAAATTTAGAAGCTTGGCCAAAGTCCGCCAAAATATTAATGTCTAAATATGTTAAAGCAAAACTGGTTGTTTCAAGTCATAGTGAAATTGGGGACGCATCACTCTTGAAACGTACATGGGAACAGGCTGTTAAAGGGCTAAATGAAAGTAAAAAACCATCACAGCCAAGTAACTAA"

# Split both genes into k-mers of length 7 with the helper from the Background
# notebook, gene A first and then gene B. (The helper is described here as
# returning canonical k-mers; its implementation lives in that notebook.)
kmersA, kmersB = (utils.getKmers(gene, 7) for gene in (geneA, geneB))

# Report, for each gene, its sequence length and how many k-mers it produced.
# Each line must use that gene's own sequence: the blaIMP-2 line reads
# len(geneB), not len(geneA), so the reported length matches the gene named.
print("blaIMP-1 is {} bases long and has {} k-mers (k=7)".format(len(geneA), len(kmersA)))
print("blaIMP-2 is {} bases long and has {} k-mers (k=7)".format(len(geneB), len(kmersB)))


* create a Bloom filter:

In [None]:
from bloom_filter import BloomFilter

# Build an empty Bloom filter to act as the sketch for the following cells.
# Per the bloom_filter package, max_elements is the number of items the filter
# is sized for and error_rate is the target false-positive rate.
sketch = BloomFilter(
    max_elements=1000,
    error_rate=0.1,
)

* add one element and test the sketch:

In [None]:
# Round-trip one k-mer through the sketch: query, insert, query again.

# Use the first k-mer of gene A as the probe.
testKmer = kmersA[0]

# Nothing has been inserted yet, so a hit at this point is unexpected.
if testKmer in sketch:
    print("error: sketch should be empty")

# Insert the probe into the sketch.
sketch.add(testKmer)

# The same query should now succeed.
if testKmer in sketch:
    print("k-mer {} is in the Bloom filter".format(testKmer))

* now, add all the k-mers from geneA to the sketch:

In [None]:
# Insert every k-mer of gene A into the sketch, one at a time. kmersA[0] may
# already have been added by an earlier cell; it is simply added again here.
for kmer in kmersA:
    sketch.add(kmer)

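* use the Bloom filter to see approximately how many k-mers from geneB are present in geneA (the count is approximate because a Bloom filter can report false positives, so it may overestimate the true number):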

In [None]:
# Query the sketch (built from gene A) with every k-mer of gene B and tally the
# hits. Each membership test yields True/False, and sum() adds them up as 1/0.
count = sum(kmer in sketch for kmer in kmersB)

print("approximately {} k-mers are shared between gene A and gene B".format(count))

* to get the actual number of shared k-mers, we can get the intersection of kmersA and kmersB by looping through one set and checking it against the other.

In [None]:
# Exact counterpart of the Bloom filter query: count the k-mers of gene B that
# really occur in gene A. Iterating over kmersB (the same direction as the
# sketch query) keeps the two counts directly comparable even when a k-mer
# occurs more than once in a gene.
# A set of gene A's k-mers is built once so each lookup is a hash probe rather
# than a scan of the whole k-mer sequence (assumes k-mers are hashable, e.g.
# strings -- confirm against utils.getKmers).
exactA = set(kmersA)

count = 0
for kmer in kmersB:
    if kmer in exactA:
        count += 1

print("actual number of k-mers shared between gene A and gene B is: {}".format(count))