In [386]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Fetch the CSV used below; -nc ("no clobber") skips the download when the file is already present.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [387]:
import pandas as pd
import numpy as np
import nltk
import math
from nltk import word_tokenize
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.preprocessing import normalize

In [388]:
# Download the 'punkt' tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may need the 'punkt_tab' resource instead - confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [389]:
# Load the CSV fetched by the wget cell into a DataFrame.
df = pd.read_csv('bbc_text_cls.csv')

In [390]:
# Preview the first rows (columns shown: text and labels).
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [391]:
# Shape of the text column, i.e. how many documents were loaded.
df['text'].shape

(2225,)

In [392]:
def IDF(corpus, unique_words):
  """Compute the inverse document frequency of every token in a corpus.

  Each document is lower-cased and split with ``word_tokenize``. A token's
  document frequency is the number of documents that contain it at least
  once, and its weight is ``log(N / document_frequency)`` with ``N`` the
  number of documents.

  Parameters
  ----------
  corpus : sized, iterable collection of str
      The documents to scan.
  unique_words : any
      Not used by the computation; accepted only so existing callers
      keep working.

  Returns
  -------
  dict
      Maps token -> IDF weight.
  """
  total_docs = len(corpus)
  doc_freq = {}
  for document in corpus:
    # dict.fromkeys drops repeated tokens while keeping first-seen order,
    # so every token is counted at most once per document.
    for token in dict.fromkeys(word_tokenize(document.lower())):
      doc_freq[token] = doc_freq.get(token, 0) + 1
  return {token: np.log(total_docs / freq) for token, freq in doc_freq.items()}

In [393]:
def fit(whole_data):
    """Learn the vocabulary index and the IDF weights from a corpus.

    Parameters
    ----------
    whole_data : sized, re-iterable collection of str
        Raw documents (for example a list or a pandas Series). Each one is
        lower-cased and tokenized with ``word_tokenize``. The collection is
        traversed here and again inside ``IDF``.

    Returns
    -------
    vocab : dict
        Maps every distinct token to a column index in ``0..len(vocab)-1``.
        Tokens are numbered in sorted order, so the mapping is the same on
        every run.
    Idf_values_of_all_unique_words : dict
        Maps token -> IDF weight, as returned by ``IDF``.
    """
    unique_words = set()
    for x in whole_data:
        unique_words.update(word_tokenize(x.lower()))
    # Iterating a set of strings yields an order that can change from one
    # interpreter session to the next (string hashing is randomized), which
    # would silently permute the matrix columns. Sorting pins the layout.
    vocab = {word: index for index, word in enumerate(sorted(unique_words))}
    Idf_values_of_all_unique_words = IDF(whole_data, unique_words)
    return vocab, Idf_values_of_all_unique_words
# Learn the token -> column mapping and the IDF weights from the whole text column.
Vocabulary, idf_of_vocabulary=fit(df['text'])

In [394]:
def transform(dataset,vocabulary,idf_values):
  """Turn documents into an L2-normalized sparse tf-idf matrix.

  Parameters
  ----------
  dataset : sized, iterable collection of str
      Documents to vectorize (for example a list or a pandas Series).
      Rows of the result follow iteration order, so a Series with an
      arbitrary index is handled correctly.
  vocabulary : dict
      Maps token -> column index.
  idf_values : dict
      Maps token -> IDF weight.

  Returns
  -------
  scipy.sparse matrix of shape (len(dataset), len(vocabulary))
      Entry (r, c) is ``raw count of token c in document r * idf`` before
      each row is scaled to unit L2 norm. Tokens missing from
      ``vocabulary`` or ``idf_values`` are ignored.
  """
  # Collect (row, column, value) triplets and build the matrix once.
  # Assigning single elements into a csr_matrix forces a structure rebuild
  # per write and emits SparseEfficiencyWarning.
  row_indices, col_indices, weights = [], [], []
  for row, document in enumerate(dataset):
    term_counts = Counter(word_tokenize(document.lower()))
    for word, term_count in term_counts.items():
      column = vocabulary.get(word)
      # Column 0 is a valid index, so test against None explicitly.
      if column is None or word not in idf_values:
        continue  # token not seen during fit
      row_indices.append(row)
      col_indices.append(column)
      weights.append(term_count * idf_values[word])

  sparse_matrix = csr_matrix(
      (weights, (row_indices, col_indices)),
      shape=(len(dataset), len(vocabulary)),
      dtype=np.float64,
  )
  output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
  return output

# Vectorize every article: one L2-normalized tf-idf row per document.
final_output = transform(df['text'],Vocabulary,idf_of_vocabulary)

  self._set_intXint(row, col, x.flat[0])


In [395]:
# Inverse lookup: column index -> token.
# The indices are contiguous integers starting at 0, so a plain list
# ordered by index would serve equally well.
idx2word = dict(zip(Vocabulary.values(), Vocabulary.keys()))

In [396]:
# Total number of documents in the corpus.
N = df['text'].shape[0]

In [397]:
# Seed NumPy's global RNG so the np.random.choice call below picks the same document on every run.
np.random.seed(123)

In [398]:
# Draw one document at random and list its five highest-weighted tf-idf terms.
i = np.random.choice(N)
row = df.iloc[i]

# Everything before the first newline of the article is shown as its headline.
print("Label:", row['labels'])
print("Text:", row['text'].partition("\n")[0])
print("Top 5 terms:")

# Densify the chosen row, then argsort the negated scores so that column
# indices come out ordered from largest to smallest weight.
scores = final_output[i].toarray().flatten()
indices = np.argsort(-scores)

for j in indices[:5]:
  print(idx2word[j])

Label: sport
Text: Athens memories soar above lows
Top 5 terms:
paula
athens
1500m
her
kelly


In [399]:
# Exercise: use CountVectorizer to form the counts instead

# Exercise (hard): use SciPy's csr_matrix instead
# You cannot use X[i, j] += 1 here