# Sample Notebook

This notebook implements scikit-learn's [Clustering text documents using k-means](https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html) tutorial in order to demonstrate program slicing with Python. The tutorial was written by Peter Prettenhofer and shared under a BSD 3 clause license. 


**Important** This code cell needs to be run before any of the others, else nothing will work! Thanks to Stack Overflow user Andrei Iatsuk [this solution](https://stackoverflow.com/a/60658965).

In [1]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def write_and_run(line, cell):
    argz = line.split()
    file = argz[-1]
    mode = 'w'
    if len(argz) == 2 and argz[0] == '-a':
        mode = 'a'
    with open(file, mode) as f:
        f.write(cell)
    get_ipython().run_cell(cell)

Each time a cell is executed, the source code it contains is written to ```execution_log.py``` for analysis. Execute the first cell below to start over with an empty execution log. 

In [2]:
%%write_and_run execution_log.py
###########
# File execution log
###########

In [15]:
%%write_and_run -a execution_log.py

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np

In [4]:
%%write_and_run -a execution_log.py

# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

print("Loading 20 newsgroups dataset for categories:")
print(categories)
print()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']



In [5]:
%%write_and_run -a execution_log.py

dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

3387 documents
4 categories



In [7]:
%%write_and_run -a execution_log.py

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")
print()

Extracting features from the training dataset using a sparse vectorizer



In [13]:
%%write_and_run -a execution_log.py

vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

X = vectorizer.fit_transform(dataset.data)

print("n_samples: %d, n_features: %d" % X.shape)
print()

n_samples: 3387, n_features: 10000



In [14]:
%%write_and_run -a execution_log.py

print("Performing dimensionality reduction using LSA")

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Performing dimensionality reduction using LSA
Explained variance of the SVD step: 24%



In [16]:
%%write_and_run -a execution_log.py

# Cluster with MiniBatchKMeans
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000, verbose=False)

In [22]:
%%write_and_run -a execution_log.py

# Cluster with regular KMeans
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=False)

In [20]:
km.fit(X)
print()




In [21]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Homogeneity: 0.600
Completeness: 0.598
V-measure: 0.599
Adjusted Rand-Index: 0.606
Silhouette Coefficient: 0.038



In [None]:
%%write_and_run -a execution_log.py

# Inverse transform from SVD
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
print()

In [13]:
%%write_and_run -a execution_log.py

# No SVD
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [14]:
%%write_and_run -a execution_log.py

print("Top terms per cluster:")

terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:


AttributeError: 'Pipeline' object has no attribute 'get_feature_names_out'

In [23]:
%whos

Variable              Type               Data/Info
--------------------------------------------------
HashingVectorizer     type               <class 'sklearn.feature_e<...>.text.HashingVectorizer'>
KMeans                type               <class 'sklearn.cluster._kmeans.KMeans'>
MiniBatchKMeans       type               <class 'sklearn.cluster._kmeans.MiniBatchKMeans'>
Normalizer            type               <class 'sklearn.preprocessing._data.Normalizer'>
TfidfTransformer      type               <class 'sklearn.feature_e<...>n.text.TfidfTransformer'>
TfidfVectorizer       type               <class 'sklearn.feature_e<...>on.text.TfidfVectorizer'>
TruncatedSVD          type               <class 'sklearn.decomposi<...>ncated_svd.TruncatedSVD'>
X                     ndarray            3387x100: 338700 elems, type `float64`, 2709600 bytes (2.584075927734375 Mb)
categories            list               n=4
dataset               Bunch              {'data': ['From: healta@s<...>tion_20newsgr