In [77]:
! pip install scikit-learn pandas tabulate -q

In [78]:
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='all')
target_names = newsgroups['target_names']

In [79]:
import pandas as pd

df = pd.DataFrame({
    'documents': newsgroups['data'],
    # 'target_names': newsgroups['target_names'],
    'target': [ target_names[_] for _ in newsgroups['target'] ]
})

In [80]:
import random
random.seed(20)

emails = df['documents'].to_list()
shuffled_emails = random.sample(emails, len(emails))[:20]

# print(df.iloc[1, 0])

In [96]:
from math import log
import re
from typing import Dict, List
from collections import Counter
from tabulate import tabulate

NON_WORDS = re.compile("[^a-z' ]")

class Corpus:

    def __init__(self, corpus: List[str]):
        self.corpus = [ NON_WORDS.sub(' ', document.lower()).split() for document in corpus ]
        self.document_set = [ set(document) for document in self.corpus ]

    def get_rank(self, document_id: int):
        count_dict = Counter(self.corpus[document_id])

        word_list = []
        tf = []
        df = []
        idf = []
        freq = []
        tf_idf = []

        for word, value in count_dict.items():
            word_list.append(word)

            _tf = value / len(count_dict.items())
            _df = len([ document for document in self.document_set if word in document ])
            _idf = log(len(self.corpus) /  _df)

            tf.append(_tf)
            idf.append(_idf)
            freq.append(value)
            tf_idf.append(_tf * _idf)
            df.append(_df)

        return {
            'words': word_list,
            'freq': freq,
            'tf': tf,
            'idf': idf,
            'tf-idf': tf_idf,
            'df': df
        }
    
    def show_df(self, table: Dict[str, List[float]], limit: int = 3):
        table_data = list(zip(table['words'], table['freq'], table['tf'], table['idf'], table['tf-idf'], table['df']))

        # Sort by TF*IDF in descending order
        table_data.sort(key=lambda x: x[4], reverse=True)

        print(tabulate(table_data[:limit], headers=["Word", "Frequency", "TF", "IDF", "TF*IDF", "DF"], tablefmt="pretty"))

# Assuming shuffled_emails is your corpus data
corpus = Corpus(shuffled_emails)
table = corpus.get_rank(2)
corpus.show_df(table, 5)


+-----------+-----------+---------------------+-------------------+---------------------+----+
|   Word    | Frequency |         TF          |        IDF        |       TF*IDF        | DF |
+-----------+-----------+---------------------+-------------------+---------------------+----+
| colorado  |     5     | 0.06756756756756757 | 2.302585092994046 | 0.15558007385094905 | 2  |
|   spot    |     3     | 0.04054054054054054 | 2.995732273553991 | 0.12144860568462126 | 1  |
| dickhead  |     3     | 0.04054054054054054 | 2.995732273553991 | 0.12144860568462126 | 1  |
| franjion  |     2     | 0.02702702702702703 | 2.995732273553991 | 0.08096573712308083 | 1  |
| franjione |     2     | 0.02702702702702703 | 2.995732273553991 | 0.08096573712308083 | 1  |
+-----------+-----------+---------------------+-------------------+---------------------+----+
