In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize

In [2]:
# Load the text-classification CSV; the columns used below are 'text' and 'labels'
csv_path = 'bbc_text_cls.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Build the vocabulary (word -> integer id) and, in the same pass,
# encode every document as the list of its word ids.

word2idx = {}
tokenized_docs = []

for doc in df['text']:
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        # A word seen for the first time gets the next free id; because ids are
        # handed out densely from 0, that id is the current vocabulary size.
        token_id = word2idx.setdefault(word, len(word2idx))
        # Repeated words are kept, so the encoded document preserves length and order
        doc_as_int.append(token_id)
    tokenized_docs.append(doc_as_int)

# Next unused id (equals the number of distinct words seen)
idx = len(word2idx)

In [5]:
# Preview the first few (word, id) pairs rather than dumping the whole vocabulary
list(word2idx.items())[:10]



In [6]:
# Reverse mapping: integer id -> word (each word in word2idx has its own id,
# so inverting the dictionary loses no entries)
idx2word = {index: wrd for wrd, index in word2idx.items()}
# Print a small sample only; the full dictionary has one entry per vocabulary word
print(list(idx2word.items())[:10])



In [10]:
# Sorted word ids of the last document. Index tokenized_docs explicitly instead of
# relying on the `doc_as_int` loop variable left behind by another cell.
print(sorted(tokenized_docs[-1]))

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,

In [12]:
# Show only the first encoded document instead of dumping every document
tokenized_docs[:1]

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  16,
  26,
  27,
  28,
  17,
  29,
  30,
  31,
  23,
  32,
  27,
  33,
  34,
  35,
  36,
  37,
  23,
  38,
  39,
  40,
  41,
  27,
  42,
  28,
  1,
  37,
  43,
  44,
  45,
  46,
  47,
  48,
  1,
  31,
  12,
  49,
  50,
  51,
  1,
  52,
  53,
  15,
  16,
  17,
  54,
  28,
  17,
  55,
  31,
  56,
  7,
  57,
  58,
  59,
  60,
  61,
  33,
  62,
  63,
  5,
  64,
  8,
  4,
  65,
  27,
  46,
  66,
  67,
  22,
  68,
  31,
  3,
  4,
  49,
  69,
  70,
  71,
  72,
  35,
  73,
  74,
  15,
  37,
  75,
  41,
  31,
  76,
  56,
  77,
  44,
  78,
  27,
  68,
  27,
  79,
  80,
  81,
  82,
  31,
  72,
  83,
  84,
  85,
  40,
  23,
  50,
  51,
  7,
  57,
  86,
  87,
  40,
  23,
  88,
  24,
  89,
  31,
  90,
  27,
  23,
  91,
  49,
  68,
  92,
  93,
  5,
  94,
  95,
  96,
  52,
  74,
  15,
  69,
  23,
  97,
  37,
  98,
  44,
  99,
  100,
  31,
  72,
  101,
  16,

In [15]:
# Number of documents, i.e. number of rows in the dataframe
N = len(df['text'])
# Number of unique words (vocabulary size)
V = len(word2idx)
# Number of tokens in the last document; read it from tokenized_docs rather than
# from the `doc_as_int` loop variable leaked by another cell
len_doc = len(tokenized_docs[-1])
print("Length of documents: ", N)
print("Length of words: ", V)
print("Length of doc as int: ", len_doc)

Length of documents:  2225
Length of words:  34762
Length of doc as int:  490


In [16]:
# Allocate the dense term-frequency matrix: one row per document, one column
# per vocabulary word, all counts starting at zero.
# Note: CountVectorizer could have been used instead.
tf = np.zeros(shape=(N, V), dtype=np.float64)

In [17]:
# Populate term-frequency counts: tf[i, j] = number of times word id j occurs in document i.
# Each row is assigned (not incremented), so re-running this cell cannot double the counts,
# and np.bincount does the per-document counting without a Python loop over tokens.
for i, doc_as_int in enumerate(tokenized_docs):
    # Explicit integer dtype keeps an empty document from becoming a float array
    token_ids = np.asarray(doc_as_int, dtype=np.intp)
    tf[i] = np.bincount(token_ids, minlength=tf.shape[1])

In [18]:
# Inverse document frequency, one value per vocabulary word:
#   idf[j] = log(N / number of documents containing word j)
document_freq = (tf > 0).sum(axis=0)  # per-word document count, shape (V,)
idf = np.log(np.divide(N, document_freq))

In [19]:
# TF-IDF: scale every column of the count matrix by that word's idf
# (idf is broadcast across the document rows)
tf_idf = np.multiply(tf, idf)

In [20]:
# Fix NumPy's global random seed so the randomly chosen document is the same on every run
SEED = 123
np.random.seed(SEED)

In [22]:
# Sample one document at random and list its five highest-scoring TF-IDF terms
i = np.random.choice(N)
row = df.iloc[i]
# Everything before the first newline of the text is shown as the document's headline
headline = row['text'].split("\n", 1)[0]
print("Label: ", row['labels'])
print("Text: ", headline)
print("Top 5 terms: ")

scores = tf_idf[i]
# Negating the scores makes the ascending argsort return the largest scores first
indices = np.argsort(-scores)

for j in indices[:5]:
    print(idx2word[j])

Label:  politics
Text:  Clarke faces ID cards rebellion
Top 5 terms: 
cards
clarke
rebellion
id
bill


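In the result above, "Text" is the article's headline (the first line of its text),
and the top 5 terms are the words with the highest TF-IDF scores in that document, not simply its most frequent words.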

In [23]:
# Exercise: Use CountVectorizer to form the counts instead

# Exercise (hard): use scipy's csr_matrix instead
# You cannot use X[i, j] += here