The task is to calculate the TF-IDF measure using Python.

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [2]:
# Fetch the tokenizer data that word_tokenize needs. Recent NLTK releases load
# the 'punkt_tab' resource, while older releases load the pickled 'punkt'
# models, so both are requested to keep the notebook runnable on either.
# 'punkt' stays last so the value displayed by this cell is unchanged.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/valentine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download the BBC text classification dataset as a CSV file into the working directory.
# `-nc` (no-clobber) makes wget skip the download when the file is already there.
# Original dataset on Kaggle: https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [4]:
# Load the downloaded CSV into a pandas DataFrame; `df` is the source of the
# document texts and topic labels used by the following cells.
df = pd.read_csv(filepath_or_buffer='bbc_text_cls.csv')

In [5]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# List the distinct topic labels present in the dataset.
df.loc[:, 'labels'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [7]:
# Build the vocabulary and encode the corpus in a single pass:
#   word2idx       maps each lowercased token to an integer id, assigned in
#                  order of first appearance (0, 1, 2, ...)
#   tokenized_docs holds, for every document, the list of ids of its tokens
word2idx = {}
tokenized_docs = []
for doc in df['text']:
    # setdefault returns the existing id, or registers the token under the
    # next free id (the current vocabulary size) and returns that
    doc_as_int = [
        word2idx.setdefault(token, len(word2idx))
        for token in word_tokenize(doc.lower())
    ]
    tokenized_docs.append(doc_as_int)

# next unused id, i.e. the vocabulary size
idx = len(word2idx)

In [8]:
# Invert the vocabulary: map every integer id back to its word
# (ids are unique, so no entries are lost in the inversion).
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [9]:
# Allocate the term-frequency matrix, initialised to zeros.
# Shape is (N, W): one row per document, one column per vocabulary word.
N = len(df['text'])   # number of documents
W = len(word2idx)     # number of distinct words

tf = np.zeros(shape=(N, W))

In [10]:
# Fill the TF matrix with raw term counts: tf[i, j] is the number of times
# word j occurs in document i.
# The matrix is zeroed first so that re-running this cell on its own does not
# add the counts a second time on top of the previous run.
tf.fill(0)
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

In [11]:
# Inverse document frequency, one value per vocabulary word:
#   idf[j] = log(N / doc_freq[j])
# where doc_freq[j] counts the documents in which word j occurs at least once.
doc_freq = (tf > 0).sum(axis=0)
idf = np.log(N / doc_freq)

In [12]:
# TF-IDF matrix: scale every term count by the IDF of its word.
# idf has one entry per column of tf and is broadcast across all rows.
tf_idf = np.multiply(tf, idf)

In [47]:
# For every topic, draw one random document and print its label, its first
# line, and the five terms with the highest TF-IDF score in that document.
# `.sample()` is called without a seed, so a different document may be drawn
# on each run.
for topic in df['labels'].unique():
    sampled = df[df['labels'] == topic].sample()
    # the DataFrame index label doubles as the row number in tf_idf
    # (assumes df still has the default 0..N-1 index from read_csv)
    doc_id = sampled.index[0]
    headline = sampled['text'].str.split('\n').str[0].values[0]

    print('Label:', topic)
    print('Text:', headline)
    print('Top 5 terms:')
    # negate the scores so that an ascending argsort yields the best terms first
    top_term_ids = np.argsort(-tf_idf[doc_id])[:5]
    for term_id in top_term_ids:
        print(idx2word[term_id])
    print()

Label: business
Text: Parmalat sues 45 banks over crash
Top 5 terms:
parmalat
banks
italian
institutions
sued

Label: entertainment
Text: Rapper films music video in jail
Top 5 terms:
c-murder
filmed
jail
rapper
video

Label: politics
Text: Donor attacks Blair-Brown 'feud'
Top 5 terms:
bannatyne
labour
cook
blair-brown
brown

Label: sport
Text: Davies favours Gloucester future
Top 5 terms:
davies
ospreys
gloucester
hooker
wales

Label: tech
Text: Europe backs digital TV lifestyle
Top 5 terms:
nem
drm
digital
content
media

