In [None]:
! pip install -U pip setuptools wheel

In [None]:
! pip install -U spacy

In [None]:
! python -m spacy download en_core_web_sm

In [None]:
! pip install bitarray

In [None]:
! pip install mmh3

In [1]:
import spacy    
# Load spaCy's small English pipeline; it is called later to tokenise queries.
nlp = spacy.load("en_core_web_sm")
# The domain here is food queries (a fairly fixed vocabulary).
# Custom words are added to the stop words to strip non-informational content.
# Nobody asks to be sent to a bad food place, so words such as "nice" or "good"
# add no information to a food query. These terms are expected to be very
# frequent in such queries and would therefore have a very low tf-idf value.

# In-place union: this extends the stop-word set shared via nlp.Defaults.
nlp.Defaults.stop_words |= {"nice","good","like", "love", "i", "we", "me", "where", "how", "which", "when", "you", "u", "tell", "can"}
# A query is treated as food-related only if one of its tokens is in this set.
# NOTE(review): tokens are compared by exact lower-cased text, so singular forms
# such as "restaurant" or "hotel" will not trigger - confirm this is intended.
# NOTE(review): spaCy presumably splits "food-places" into several tokens, in
# which case this entry can never match a single token - verify.
required_pattern = {"food", "restaurants", "food-places", "hotels", "takeaway", "eatery", "pub", "eat", "drink"}

In [2]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [11]:
def generate_n_grams(n, word_list):
    """Return every run of ``n`` consecutive words as a space-joined string.

    Args:
        n: Number of words in each n-gram.
        word_list: Words to slide the window over; must be a ``list``.

    Returns:
        The n-grams in input order. The list is empty when ``n`` is not
        positive or is larger than ``len(word_list)``.

    Raises:
        ValueError: If ``word_list`` is not a ``list``.
    """
    if not isinstance(word_list, list):
        raise ValueError("Word list must be a list of words over which ngrams have to generated")
    # One view of the list per offset 0..n-1; zipping them lines up each
    # window's words and stops at the shortest view, i.e. the last full window.
    shifted_views = (word_list[offset:] for offset in range(n))
    windows = zip(*shifted_views)
    return list(map(" ".join, windows))

In [9]:
def extract_relevant_terms(input_string, max_ngram=2):
    """Turn a free-text query into candidate lookup phrases.

    The query is tokenised with the module-level spaCy pipeline ``nlp``,
    each token is lower-cased, and tokens found in
    ``nlp.Defaults.stop_words`` are dropped. If none of the remaining tokens
    is in ``required_pattern`` the query is not considered food-related.

    Candidates are generated for every n-gram size from ``max_ngram`` down
    to 1. Single words are included because the lookup sets contain
    one-word entries (e.g. "thai", "pub") that bigrams alone can never
    match. Longer phrases come first so that a caller taking the first hit
    prefers "west indian" over "indian".

    Args:
        input_string: Raw user query.
        max_ngram: Largest n-gram size to generate. Defaults to 2.

    Returns:
        A list of candidate phrases (longest n-grams first), or ``None``
        when the query contains no token from ``required_pattern``.
    """
    stop_words = nlp.Defaults.stop_words
    # O(n) in the number of tokens; each token is lower-cased only once.
    lowered = (token.text.lower() for token in nlp(input_string))
    terms = [text for text in lowered if text not in stop_words]

    # O(n): bail out unless at least one trigger word is present.
    if not any(term in required_pattern for term in terms):
        return None

    queries = []
    for size in range(max_ngram, 0, -1):
        queries.extend(generate_n_grams(n=size, word_list=terms))
    return queries

### Using Set storage

In [2]:
# Known cuisine / venue phrases, stored lower-case because queries are
# lower-cased before lookup. Despite the name this is a set, not a mapping.
# Spellings fixed ("caribbean", "portuguese") so that correctly spelled
# queries can match.
location_cuisine_map = {
    "indian",
    "west indian",
    "thai",
    "sushi",
    "chinese",
    "caribbean",
    "italian",
    "pub",
    "bbq",
    "portuguese",
    "spanish",
    "french",
    "east european"
}

In [6]:

def get_match(input_string):
    """Look a query up in the ``location_cuisine_map`` set.

    Args:
        input_string: Raw user query.

    Returns:
        ``(input_string, phrase)`` for the first candidate phrase found in
        ``location_cuisine_map``, or ``None`` when the query yields no
        candidates or none of them is in the set.
    """
    candidates = extract_relevant_terms(input_string)
    if candidates:
        # Worst case O(n): every candidate is probed once against the set.
        hit = next(
            (phrase for phrase in candidates if phrase in location_cuisine_map),
            None,
        )
        if hit is not None:
            return input_string, hit
    return None

In [7]:
%%timeit
get_match("Which restaurants do West Indian food")

3.11 ms ± 85.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
get_match("Which restaurants do West Indian food")

('Which restaurants do West Indian food', 'west indian')

In [9]:
%%timeit
get_match("What is the weather today")

3.08 ms ± 274 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
get_match("What is the weather today")

In [11]:
import sys

In [12]:
sys.getsizeof(location_cuisine_map)

744

# Bloom Filter

1. Once uttered, you generally want to remember the utterances. Elements, once added to the Bloom filter, cannot be removed unless we use an invertible Bloom filter (this can be implemented more easily using a list).
2. A Bloom filter can have false positives but no false negatives. In this situation we can afford false positives because, for a food query, it is generally acceptable to return all related options.
3. Once added, elements cannot be removed from the Bloom filter, which again serves our purpose: you generally wouldn't remove data added to the cuisine map.


#### Trying to optimize space a little

In [1]:
import sys

In [2]:
set_data = set()
dict_data = dict()
list_data = list()
tuple_data = tuple()

In [3]:
sys.getsizeof(list_data)

72

In [4]:
sys.getsizeof(set_data)

232

In [5]:
sys.getsizeof(dict_data)

248

In [6]:
sys.getsizeof(tuple_data)

56

### Memory Details for a bloom filter and choice of data structure

- Traditionally a Bloom filter is a bit array. We can also use a list here to construct a Bloom filter. The risk of collision depends on the size of the filter (the number of slots) and the number of elements stored in it: it rises as more elements are inserted and falls as the filter gets larger.
- I assume here that space is not a limitation. As shown below, a list takes about 8 MB for 1 million strings (`sys.getsizeof` counts only the list's own array of references, not the string objects it points to). Since a list lives in heap memory, using it to implement a Bloom filter should be okay.

- Both list and tuple use roughly the same memory when storing a large number of strings. We use a list for the ease of adding elements to the Bloom filter.

In [7]:

list_data = [f'arandondomverylongsensentencefor memory check {i}' for i in range(0, 1000000)]
tuple_data = tuple(list_data)

In [8]:
sys.getsizeof(tuple_data)*0.000001 # 8MB - original value of getszeof is in bytes

8.000055999999999

In [9]:
sys.getsizeof(list_data)*0.000001 

8.697472

In [4]:
# Notebook cells run as top-level code with no parent package, so a relative
# import (`from .bloom_filter import ...`) raises ImportError here.
# Assumes bloom_filter.py is importable from the notebook's working
# directory - TODO confirm.
from bloom_filter import BloomFilter

In [11]:
n = 1000000
p = 0.09

In [12]:
blf = BloomFilter(n=n, p=p, data_type="list")

In [13]:
for i in range(0, 1000000):
    blf.insert(f'arandondomverylongsensentencefor memory check {i}')

In [14]:
blf.get_size_of_filter() # value heavily dependent on p and n
# look up time is o(1)
# for n == 1000000, p=0.09 the actual length of the filter is 5011823

40.094656

In [15]:
# comparison with set
for i in range(0, 1000000):
    set_data.add(f'arandondomverylongsensentencefor memory check {i}')

In [16]:
sys.getsizeof(set_data) * 0.000001

33.554663999999995

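However, if we use a bitarray as the data type, the memory usage becomes negligible. This is because CPython uses additional memory for its objects, as can be seen from the memory associated with empty objects: the list-based filter above costs about 8 bytes per slot, whereas a bitarray needs only one bit per slot.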

In [17]:
# BloomFilter uses a bitarray by default when no data_type is passed.
# NOTE(review): p is 0.01 here but 0.09 in the list-based run above, so the two
# size readings are not a like-for-like comparison.
# NOTE(review): a later cell with the same n and p reports about 1.198 MB, so a
# reading of about 0.63 MB for this cell likely comes from a run with a
# different p - re-run to confirm.
n = 1000000
p = 0.01
blf = BloomFilter(n=n, p=p)
# Insert the same one million synthetic strings used in the list and set tests.
for i in range(0, 1000000):
    blf.insert(f'arandondomverylongsensentencefor memory check {i}')

In [18]:
blf.get_size_of_filter()

0.6265419999999999


From the above comparison, it can be seen that a bitarray is a great choice if we want to search for elements in a huge group. It is memory-efficient and gives O(1) lookups.

### Other details about BloomFilter


1. False Positive Probability - Given the size of the filter m, the number of inserted elements n and the number of hash functions k, the false positive probability is given by:

$$P = (1 - [1 - \frac 1 m]^{kn})^k$$

2. Optimal Size of the filter: Given the desired probability of false positives p, and n being the number of elements that are to be placed in the filter, we have:

$$m = -\frac{n\ln p}{(\ln 2)^2}$$

3. No. of hash functions: Given a filter of size m and n elements to be inserted, the optimal number of hash functions is:

$$k = \frac{m} {n}\ln2$$

4. We need fast, independent hash functions which are uniformly distributed. Here I am using `mmh3` (MurmurHash3). We can also create multiple hash functions using md5, sha1 and sha224; however, these can be computationally expensive.

In [5]:
# Known cuisine / venue phrases, stored lower-case because queries are
# lower-cased before lookup. Despite the name this is a set, not a mapping.
# Spellings fixed ("caribbean", "portuguese") so that correctly spelled
# queries can match.
location_cuisine_map = {
    "indian",
    "west indian",
    "thai",
    "sushi",
    "chinese",
    "caribbean",
    "italian",
    "pub",
    "bbq",
    "portuguese",
    "spanish",
    "french",
    "east european"
}

# n: number of elements the filter is sized for; p: target false-positive
# probability (see the sizing formulas in the notes above).
# NOTE(review): the filter is sized for 1,000,000 elements although only the 13
# cuisine phrases are inserted; the size reported below therefore reflects n,
# not the data actually stored.
n = 1000000
p = 0.01
blf = BloomFilter(n=n, p=p)
# Load every known cuisine phrase into the filter.
for item in location_cuisine_map:
    blf.insert(item)

In [6]:
blf.get_size_of_filter() # value in MB

1.198197

In [7]:
def check_filter(input_string):
    """Look a query up in the module-level Bloom filter ``blf``.

    Args:
        input_string: Raw user query.

    Returns:
        ``(input_string, phrase)`` for the first candidate phrase that
        ``blf.is_present`` reports as present, or ``None`` when the query
        yields no candidates or none of them is reported present.
    """
    # `or ()` turns both None and an empty candidate list into "no iterations".
    # Worst case O(n), where n is the number of candidate phrases.
    for phrase in extract_relevant_terms(input_string) or ():
        if blf.is_present(phrase):
            return input_string, phrase
    return None

In [12]:
%%timeit
result = check_filter("Which restaurants do West Indian food")

2.88 ms ± 34.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
check_filter("Which restaurants do West Indian food")

('Which restaurants do West Indian food', 'west indian')

In [14]:
%%timeit
check_filter("What is the weather today")

3.21 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
check_filter("What is the weather today")

I think the Bloom filter gives benefits in memory, with speed comparable to a hash-map lookup.