In [66]:
! pip install scikit-learn pandas tabulate -q

In [67]:
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='all')
target_names = newsgroups['target_names']

In [68]:
import pandas as pd

df = pd.DataFrame({
    'documents': newsgroups['data'],
    # 'target_names': newsgroups['target_names'],
    'target': [ target_names[_] for _ in newsgroups['target'] ]
})

In [69]:
import random
random.seed(20)

emails = df['documents'].to_list()
shuffled_emails = random.sample(emails, len(emails))[:20]

# print(df.iloc[1, 0])

In [84]:
from math import log
import re
from typing import List
from collections import Counter
from tabulate import tabulate

NON_WORDS = re.compile("[^a-z' ]")

class Corpus:

    def __init__(self, corpus: List[str]):
        self.corpus = [ NON_WORDS.sub(' ', document.lower()).split() for document in corpus ]
        self.document_set = [ set(document) for document in self.corpus ]

    def show_tf(self, document_id: int):
        count_dict = Counter(self.corpus[document_id])

        word_list = []
        tf = []
        df = []
        idf = []
        freq = []
        tf_idf = []

        for word, value in count_dict.items():
            word_list.append(word)

            _tf = value / len(count_dict.items())
            _df = len([ document for document in self.document_set if word in document ])
            _idf = log(len(self.corpus) / _df)

            tf.append(_tf)
            idf.append(_idf)
            freq.append(value)
            tf_idf.append(_tf * _idf)
            df.append(_df)

        table = zip(word_list, freq, tf, idf, tf_idf, df)
        print(tabulate(table, headers=["Word", "Frequency", "TF", "IDF", "TF*IDF", "DF"], tablefmt="pretty"))

corpus = Corpus(shuffled_emails)
corpus.show_tf(3)

+-----------------+-----------+----------------------+---------------------+-----------------------+----+
|      Word       | Frequency |          TF          |         IDF         |        TF*IDF         | DF |
+-----------------+-----------+----------------------+---------------------+-----------------------+----+
|      from       |     1     | 0.014492753623188406 |         0.0         |          0.0          | 20 |
|     glover      |     3     | 0.043478260869565216 |  2.995732273553991  |  0.1302492292849561   | 1  |
|      tafs       |     1     | 0.014492753623188406 |  2.995732273553991  |  0.04341640976165204  | 1  |
|      mitre      |     3     | 0.043478260869565216 |  2.995732273553991  |  0.1302492292849561   | 1  |
|       org       |     2     | 0.028985507246376812 | 1.8971199848858813  | 0.054988985069155984  | 3  |
|     graham      |     2     | 0.028985507246376812 |  2.995732273553991  |  0.08683281952330409  | 1  |
|        k        |     2     | 0.028985507246