https://umap-learn.readthedocs.io/en/latest/document_embedding.html

In [None]:
import pandas as pd
import umap
import umap.plot

# Used to get the data
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Some plotting libraries
# import matplotlib.pyplot as plt
# %matplotlib notebook
# from bokeh.plotting import show, save, output_notebook, output_file
# from bokeh.resources import INLINE
# output_notebook(resources=INLINE)


In [None]:
%%time
# Load every post of the 20 newsgroups corpus (subset='all'); the fixed
# random_state keeps the shuffled order the same from run to run.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

In [None]:
# Report the corpus size: number of posts and number of newsgroup categories.
print(
    f'{len(dataset.data)} documents',
    f'{len(dataset.target_names)} categories',
    sep='\n',
)

In [None]:
# Display the first raw post (as a one-element list) to see what the text looks like.
dataset.data[:1]

In [None]:
# Display the category names; dataset.target values index into this list.
dataset.target_names

In [None]:
# Preview the first three posts, each preceded by its newsgroup category.
for document, target_id in zip(dataset.data[:3], dataset.target[:3]):
    print(f'Category: {dataset.target_names[target_id]}')
    print('---------------------------')
    # Only the first 500 characters, to keep the output readable.
    print(document[:500])
    print('---------------------------')

In [None]:
# Map each document's numeric target to its category name, then wrap the names
# in a one-column DataFrame that is later passed as hover_data to the plot.
category_labels = [
    dataset.target_names[target_id] for target_id in dataset.target
]
hover_df = pd.DataFrame({'category': category_labels})


In [None]:
# Bag-of-words counts: English stop words are removed and, per scikit-learn's
# min_df semantics, terms found in fewer than 5 documents are dropped.
vectorizer = CountVectorizer(
    min_df=5,
    stop_words='english',
)
# One row per document in dataset.data.
word_doc_matrix = vectorizer.fit_transform(dataset.data)


In [None]:
# Display the matrix returned by vectorizer.fit_transform to inspect it.
word_doc_matrix

In [None]:
%%time
# Fit a 2-component UMAP model on the word-count matrix with the Hellinger
# metric. NOTE: `embedding` is the fitted UMAP object returned by .fit();
# the coordinates themselves are read from `embedding.embedding_`.
embedding = umap.UMAP(
    n_components=2,
    metric='hellinger',
).fit(word_doc_matrix)

In [None]:
# Display the shape of the fitted coordinates held in embedding.embedding_.
embedding.embedding_.shape

In [None]:
# Build an interactive scatter plot of the fitted model: points are coloured by
# numeric target and hover_df supplies the category name shown on hover.
f = umap.plot.interactive(
    embedding,
    labels=dataset.target,
    hover_data=hover_df,
    point_size=1,
)
# The bare `show` name is never imported (the bokeh import above is commented
# out), so calling it raised NameError. umap.plot ships its own `show`, which
# is already in scope through `import umap.plot`.
# NOTE(review): for inline rendering in a notebook you may also need
# `umap.plot.output_notebook()` before this call -- confirm in your environment.
umap.plot.show(f)