# Document-Term Matrix

Sparse representation of the occurrence of terms in each document of a corpus.

Used in Latent Semantic Analysis (LSA)

In [None]:
# Collection of documents (corpus): three short product reviews,
# one plain string per document.

review_1, review_2, review_3 = (
    "The Glider II is a great soccer ball",
    "What a bad soccer ball",
    "I am happy with The glider",
)

In [None]:
# Ordered corpus: a document's position in this list is its index in every
# per-term 0/1 vector computed from it.
docs = [
    review_1,
    review_2,
    review_3,
]
docs

In [None]:
# Gather the set of all unique terms: every distinct whitespace-separated
# token in the corpus. Tokens are case-sensitive, so "Glider" and "glider"
# are two different terms.

unique_terms = set()
for document in docs:
    unique_terms.update(document.split())
unique_terms

In [None]:
# Construct a term-document matrix
# here as a Python dictionary for ease of interpretability:
# each key is a term, each value is a 0/1 list with one entry per document
# in `docs` (1 = the term occurs in that document).

# Tokenise every document once, using the same whitespace split that produced
# the vocabulary. Membership must be tested against whole tokens: `term in doc`
# on the raw string is a substring check, which wrongly reports "a" as present
# in any document containing "am" or "happy", and "I" as present in "II".
doc_tokens = [set(doc.split()) for doc in docs]

doc_term_matrix = {
    term: [int(term in tokens) for tokens in doc_tokens]
    for term in unique_terms
}

doc_term_matrix

In [None]:
# Query: which documents contain "Glider" AND "soccer"?
# Each term's row is a 0/1 vector over the documents, so the conjunction
# is just an element-wise bitwise AND of the two rows.

import numpy as np

# Object-dtype array of the corpus so it can be combined with result vectors.
docs_array = np.asarray(docs, dtype=object)

v1 = np.asarray(doc_term_matrix['Glider'])
v2 = np.asarray(doc_term_matrix['soccer'])
v3 = np.bitwise_and(v1, v2)

print(v1, v2, '-------', v3, sep='\n')

In [None]:
# We can now get the matching documents from our corpus with the result:
# the 0/1 result vector is used as a boolean mask over the corpus array,
# rather than multiplying each string by 0/1 and discarding the empty ones.
docs_array[v3.astype(bool)].tolist()

In [None]:
# Query: documents containing both "a" AND "ball"
# (element-wise AND of the two term rows).
v1 = np.asarray(doc_term_matrix['a'])
v2 = np.asarray(doc_term_matrix['ball'])
v3 = np.bitwise_and(v1, v2)

print(v1, v2, '-------', v3, sep='\n')

In [None]:
# Select the matching documents by using the 0/1 result vector as a boolean
# mask over the corpus array.
docs_array[v3.astype(bool)].tolist()

In [None]:
# Bitwise OR to construct 'this' or 'that' queries.

In [None]:
# Query: documents containing "great" OR "bad"
# (element-wise OR of the two term rows).
v1 = np.asarray(doc_term_matrix['great'])
v2 = np.asarray(doc_term_matrix['bad'])
v3 = np.bitwise_or(v1, v2)

print(v1, v2, '-------', v3, sep='\n')

In [None]:
# Select the matching documents by using the 0/1 result vector as a boolean
# mask over the corpus array.
docs_array[v3.astype(bool)].tolist()