# Inverted Index

Associate a collection of terms (lexicon) with the documents that contain those terms.

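The data structure is much more compact than a Document Term Matrix: it stores only the (term, document) pairs that actually occur, instead of a mostly-zero matrix with one cell per term and document.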


In [66]:
# Toy corpus: three short product reviews, each treated as one document.

review_1, review_2, review_3 = (
    "The Glider II is a great soccer ball",
    "What a bad soccer ball",
    "I am happy with The glider",
)

In [67]:
# Gather the reviews into one list; a document's position in this list
# serves as its ID.
docs = [
    review_1,
    review_2,
    review_3,
]
docs

['The Glider II is a great soccer ball',
 'What a bad soccer ball',
 'I am happy with The glider']

In [68]:
# Vocabulary: every distinct whitespace-delimited token in the corpus.
# Tokens are compared case-sensitively, so 'Glider' and 'glider' are
# separate terms.

unique_terms = set().union(*map(str.split, docs))
unique_terms

{'Glider',
 'I',
 'II',
 'The',
 'What',
 'a',
 'am',
 'bad',
 'ball',
 'glider',
 'great',
 'happy',
 'is',
 'soccer',
 'with'}

In [69]:
# Build the inverted index: each term maps to the set of IDs of the
# documents containing it (a document's ID is its position in `docs`).
# A plain dict of sets keeps the result easy to read.

inverted_index = {}

for i, doc in enumerate(docs):
    for term in doc.split():
        # setdefault creates the empty posting set the first time a term
        # is seen and returns the existing set on every later occurrence.
        inverted_index.setdefault(term, set()).add(i)

inverted_index

{'The': {0, 2},
 'Glider': {0},
 'II': {0},
 'is': {0},
 'a': {0, 1},
 'great': {0},
 'soccer': {0, 1},
 'ball': {0, 1},
 'What': {1},
 'bad': {1},
 'I': {2},
 'am': {2},
 'happy': {2},
 'with': {2},
 'glider': {2}}

In [70]:
# Now we can get posting lists for any term

In [71]:
# The posting list for a term is the set of IDs of the documents containing it.
# Lookup is an exact, case-sensitive match; a term absent from the index raises KeyError.
posting_list = inverted_index['soccer']
posting_list

{0, 1}

In [72]:
# We can use the posting lists to locate the documents by ID (here simply each document's position in the `docs` list)
# Think of this as a hash table, or a distributed hash table for much larger scenarios

In [73]:
from operator import itemgetter

# Map each document ID in the posting list back to the document text.
# `docs` is indexed one ID at a time instead of via
# itemgetter(*posting_list): with a single ID, itemgetter returns the bare
# string rather than a 1-tuple, so set() would split it into individual
# characters; with an empty posting list, itemgetter() raises TypeError.
res_list = {docs[doc_id] for doc_id in posting_list}
res_list

{'The Glider II is a great soccer ball', 'What a bad soccer ball'}

In [74]:
# Notice that we can now combine posting lists with set operations to implement Boolean search (OR, AND)

In [75]:
posting_list_1 = inverted_index['soccer']
# NOTE: terms are case-sensitive, so 'glider' does not match 'Glider'.
posting_list_2 = inverted_index['glider']

print(posting_list_1)
print(posting_list_2)

# union the posting lists (OR operation)
posting_list = posting_list_1 | posting_list_2

# Resolve the document IDs to document texts. Indexing `docs` per ID stays
# correct for any result size; set(itemgetter(*ids)(docs)) would split a
# lone matching document into characters and fail on an empty result.
search_result = {docs[doc_id] for doc_id in posting_list}

# likewise, posting_list_1 & posting_list_2 gives the intersection (AND operation)

{0, 1}
{2}


In [76]:
# Display the set of documents matched by the OR query.
search_result

{'I am happy with The glider',
 'The Glider II is a great soccer ball',
 'What a bad soccer ball'}