# Term-Document Matrix

In [1]:
import os, sys, spacy

import pandas as pd
# import gensim.downloader as api

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

from data_processing import DataProcessing

Source: [Speech and Language Processing (3rd ed. draft)](https://web.stanford.edu/~jurafsky/slp3/), Chapter 6.
![Term Document Matrix](term_doc.png)

In [2]:
def extract_terms(doc):
    """Split a document string into its whitespace-separated terms.

    Parameters:
    -----------
    doc: `str`
        The document text to tokenize

    Returns:
    --------
    words: `list`
        The terms of `doc` in order of appearance, duplicates included
    """
    # No separator argument: any run of whitespace delimits a term, and
    # punctuation stays attached to its word (e.g. "I'm" is one term).
    words = doc.split()
    return words

In [3]:
# Two toy documents that use the same four terms ("I'm", "a", "Kingdom",
# "Man") but repeat them a different number of times.
document_1 = "I'm a Kingdom Man Man Kingdom"
document_2 = "I'm Kingdom Man I'm a Kingdom Man I'm a Kingdom Man"


In [4]:
# Tokenize each document, then collect the per-document term lists in
# document order (index 0 -> document 1, index 1 -> document 2).
d1_words = extract_terms(document_1)
d2_words = extract_terms(document_2)
docs_words = [d1_words, d2_words]

In [5]:
def get_word_counts(words: list[str], doc_name: str):
    """Count how often each word occurs and return the counts as a table.

    Parameters:
    -----------
    words: `list[str]`
        The words of one document, duplicates included
    doc_name: `str`
        Label for the document; the count column is named "<doc_name> Count"

    Returns:
    --------
    word_count_df: `pd.DataFrame`
        One row per distinct word, in order of first appearance, with the
        columns "Word" and "<doc_name> Count"
    """
    tally = {}
    for token in words:
        # First sighting starts from 0; dict insertion order keeps the
        # words in the order they were first seen.
        tally[token] = tally.get(token, 0) + 1

    count_column = f"{doc_name} Count"
    counts_df = pd.DataFrame(list(tally), columns=["Word"])
    counts_df[count_column] = list(tally.values())
    return counts_df

In [6]:
d1_word_count_df = get_word_counts(d1_words, doc_name="Doc 1")
d1_word_count_df

Unnamed: 0,Word,Doc 1 Count
0,I'm,1
1,a,1
2,Kingdom,2
3,Man,2


In [7]:
d2_word_count_df = get_word_counts(d2_words, "Doc 2")
d2_word_count_df

Unnamed: 0,Word,Doc 2 Count
0,I'm,3
1,Kingdom,3
2,Man,3
3,a,2


In [8]:
def create_term_doc_matrix(docs_words: list, doc_name: str, group_by_col_name: str) -> pd.DataFrame:
    """Build a term-document matrix from the word lists of several documents.

    Parameters:
    -----------
    docs_words: `list`
        One list of words per document
    doc_name: `str`
        Prefix for the document labels; documents are numbered from 1, so
        the count columns are named "<doc_name> 1 Count", "<doc_name> 2 Count", ...
    group_by_col_name: `str`
        Column to group the combined counts by (the word counts tables
        store their terms in the "Word" column)

    Returns:
    --------
    term_doc_matrix: `pd.DataFrame`
        The combined per-document counts, grouped by `group_by_col_name`
        and summed within each group
    """
    # One word-count table per document, labelled with its 1-based position.
    per_doc_counts = [
        get_word_counts(words, f"{doc_name} {doc_number}")
        for doc_number, words in enumerate(docs_words, start=1)
    ]

    # NOTE(review): concat_dfs is a project helper not visible here;
    # presumably it stacks the tables row-wise -- confirm.
    combined = DataProcessing.concat_dfs(per_doc_counts)
    return combined.groupby(group_by_col_name).sum()