https://en.wikipedia.org/wiki/Tf%E2%80%93idf

In [1]:
import math
import os
import fnmatch # https://docs.python.org/3/library/fnmatch.html
import glob
import doctest

Source https://stevenloria.com/tf-idf/ <BR>

Caveat: this post now uses TextBlob for breaking up the text into words and getting the word counts.

In [3]:
def term_freq(term, list_of_words_in_document):
    """
    computes "term frequency" which is the number of times a word appears in a document, 
    normalized by dividing by the total number of words in document. 
    
    >>> term_freq('asdf',['asdf','lmka','mignasf'])
    0.3333333333333333
    """
    return list_of_words_in_document.count(term)/(len(list_of_words_in_document)*1.0)

In [4]:
doctest.testmod()

TestResults(failed=0, attempted=1)

In [5]:
def number_of_documents_containing(term,all_documents):
    """
    Returns the number of documents containing word. 
    
    >>> all_doc = [['asdf','ionasd'],['igag'],['ngi','adfmig','mgiaf']]
    >>> number_of_documents_containing('asdf',all_doc)
    1
    """
    countr=0
    for this_doc in all_documents:
        if (term in this_doc):
            countr+=1
    return countr

In [6]:
doctest.testmod()

TestResults(failed=0, attempted=3)

In [7]:
def inverse_doc_freq(term, all_documents):
    """
    computes "inverse document frequency" which measures 
    how common a word is among all documents in corpus. 
    The more common a word is, the lower its idf. 
    Take the ratio of the total number of documents to
    the number of documents containing word, 
    then take the log of that. Add 1 to the divisor to prevent division by zero.
    
    >>> all_doc = [['asdf','ionasd'],['igag'],['ngi','adfmig','mgiaf']]
    >>> number_of_documents_containing('asdf',all_doc)
    1
    """
    return math.log(len(all_documents) / ( 1.0 + number_of_documents_containing(term, all_documents)))

In [8]:
doctest.testmod()

TestResults(failed=0, attempted=5)

In [10]:
def tfidf(term, list_of_words_in_document, all_documents):
    """
    computes the TF-IDF score. It's the product of tf and idf.
    
    >>> all_doc = [['asdf','ionasd'],['igag']]
    >>> tfidf('asdf',all_doc[0],all_doc)
    0.0
    
    >>> all_doc = [['asdf','ionasd'],['igag','geaag'],['ngi','adfmig','mgiaf']]
    >>> tfidf('igag',all_doc[1],all_doc)
    0.2027325540540822
    """
    return term_freq(term, list_of_words_in_document) * inverse_doc_freq(term, all_documents)

In [11]:
doctest.testmod()

TestResults(failed=0, attempted=9)

The \*.dat files in the directory have only key words from each file

Convert the .dat contents to lists per document

In [12]:
all_documents={}
all_words_from_all_docs=[]
all_terms=[]
foldr='essays/'
fname='*.dat'

for file_name in glob.glob('essays_word_per_file/*.dat'):
    with open(file_name,'r') as fil:
        words_in_file=fil.read().split("\n")
    # remove empty strings from list of words
    while "" in words_in_file:
        words_in_file.remove("")
    # save the words per file as value in a dictionary
    all_documents[file_name]=words_in_file
    print(len(words_in_file),'words in',file_name)
    # also save all the words to a list
    for this_word in words_in_file:
        all_words_from_all_docs.append(this_word)
            

66 words in essays_word_per_file/week2 summary .docx.dat
141 words in essays_word_per_file/week1_reading Summary.docx.dat
140 words in essays_word_per_file/week1_Data 601- Summary of The History of Data Science .docx.dat
78 words in essays_word_per_file/week2_summary-Data Wrangling with Python  ch2 p17 to 40.docx.dat
100 words in essays_word_per_file/week1_a History of Data Science.docx.dat
80 words in essays_word_per_file/week1_Data Wrangling Chap 2.docx.dat
96 words in essays_word_per_file/week1_a Very Short History Of Data Science_1.docx.dat
115 words in essays_word_per_file/week1_ Summary.docx.dat
140 words in essays_word_per_file/week1_summary.docx.dat
49 words in essays_word_per_file/week1_1.txt.dat
119 words in essays_word_per_file/week2_Lists and Dictionaries Summary.docx.dat
130 words in essays_word_per_file/week2_Data Wrangling with Python page 17 to 40.docx.dat
81 words in essays_word_per_file/week2_Week 2 Reading Summary.docx.dat
109 words in essays_word_per_file/Week2_summ

In [13]:
len(all_documents)

24

In [14]:
len(set(all_words_from_all_docs))

1265

In [15]:
all_words_from_all_docs = list(set(all_words_from_all_docs))
len(all_words_from_all_docs)

1265

Sample sizes are small, so results are not reliable representations of the document

In [16]:
dict_of_docs={}
for doc_name, word_list_in_this_doc in all_documents.items():
    if (len(word_list_in_this_doc)==0):
        print("error: empty input file"+doc_name)
    else:
        #print('\n'+doc_name)
        dic_of_terms={}
        for this_term in word_list_in_this_doc:
            dic_of_terms[this_term] = tfidf(this_term, word_list_in_this_doc, all_words_from_all_docs)
        #print(dic_of_terms)
        dict_of_docs[doc_name]=dic_of_terms

since "data" appears in almost every document (these are data science essays), the word "data" is ranked low by TF-IDF

In [17]:
for doc_name, dic_of_terms in dict_of_docs.items():
    print(doc_name)
    terms_in_doc_sorted_by_score = sorted(dic_of_terms.items(), key=lambda x: x[1], reverse=True)
    print('  top 5:')
    for word_and_score in terms_in_doc_sorted_by_score[0:5]:
        print(word_and_score)
    print('  bottom 5:')
    for word_and_score in terms_in_doc_sorted_by_score[-5:]:
        print(word_and_score)

essays_word_per_file/week2 summary .docx.dat
  top 5:
('fix', 0.09772242758487387)
('naming', 0.09772242758487387)
('learning', 0.09772242758487387)
('entry', 0.09772242758487387)
('various', 0.09772242758487387)
  bottom 5:
('end', 0.08107678684747827)
('type', 0.08107678684747827)
('one', 0.07333700466920569)
('data', 0.07189291103580682)
('go', 0.0671935939402941)
essays_word_per_file/week1_reading Summary.docx.dat
  top 5:
('constraint', 0.04574241291206862)
('idea', 0.04574241291206862)
('suggesting', 0.04574241291206862)
('using', 0.04574241291206862)
('perspective', 0.04574241291206862)
  bottom 5:
('ever', 0.03795083639669195)
('line', 0.03507519733209504)
('ten', 0.03507519733209504)
('data', 0.03365200091037766)
('act', 0.030159259881315285)
essays_word_per_file/week1_Data 601- Summary of The History of Data Science .docx.dat
  top 5:
('exploring', 0.04606914443286911)
('storing', 0.04606914443286911)
('done', 0.04606914443286911)
('using', 0.04606914443286911)
('requiring', 