In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from time import time

import numpy as np
import pandas as pd


# #############################################################################
# Load the 20 newsgroups corpus (subset='all', i.e. the train and test splits)
#
# `categories` controls which newsgroups are fetched:
#   * None               -> every newsgroup (what this run uses)
#   * subset_categories  -> the small four-group subset below, for quick runs
subset_categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Analyse all the categories; assign `subset_categories` here instead to
# restrict the run to the four groups listed above.
categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

# shuffle + fixed random_state keep the document order reproducible across runs.
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

# Integer class label per document, and the number of distinct labels present.
labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")

# Fit a TF-IDF model on the whole corpus; X is a sparse (documents x terms)
# matrix of TF-IDF weights.
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
X = vectorizer.fit_transform(dataset.data)
print(vectorizer.idf_)

# Vocabulary terms, aligned with the columns of X. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older installations.
# Looked up once here because it does not change between loop iterations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the two highest-weighted terms for each of the first 10 documents
# (or fewer, if the corpus is smaller than that).
# NOTE: the column is labelled 'idf' to match the recorded output, but the
# values are the per-document TF-IDF weights taken from X, not raw IDF scores.
for i in range(min(10, X.shape[0])):
    # .toarray() yields a plain (n_terms, 1) ndarray rather than np.matrix.
    df = pd.DataFrame(X[i].T.toarray(), index=feature_names, columns=['idf'])
    print(df.sort_values(by=['idf'], ascending=False)[:2])


Loading 20 newsgroups dataset for categories:
None
18846 documents
20 categories

Extracting features from the training dataset using a sparse vectorizer
[ 4.22537005  4.30596921  8.35920238 ... 10.15096185  9.45781467
 10.15096185]
           idf
pens  0.552562
jagr  0.220593
               idf
vlb       0.344905
mblawson  0.275926
               idf
hilmi     0.291648
elchibey  0.240976
          idf
bus  0.399065
dma  0.394777
                  idf
jasmine      0.319158
inexpensive  0.241459
            idf
myers  0.312964
unc    0.276462
             idf
tamuts  0.404392
tamu    0.319555
           idf
ists  0.361157
stpl  0.270868
              idf
ists     0.349628
buffalo  0.255924
               idf
arromdee  0.405885
turkey    0.345344
