# UMAP Plot

Use the body of text that describes each feed to create a UMAP plot.

## Load Features

Load all information for all feeds or posts in the database that have features.

Store the results in the *feeds* or *posts* dicts.

In [None]:
import sqlite3
import umap, umap.plot
from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import scipy

# stop words lists
# NLTK's English stop words
from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
# spaCy's default stop words, read from the 'en_core_web_sm' pipeline
import spacy
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words
# gensim's predefined stop words
from gensim.parsing.preprocessing import STOPWORDS
# scikit-learn's predefined English stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# maps each feed URL to a dict holding its text, title and description
feeds = {}

print('Loading feed info...')

# connect to the db
conn = sqlite3.connect('feeds.db')
try:
    c = conn.cursor()

    # select only the feeds for which the body of text
    # has already been generated
    c.execute('SELECT url, text, title, description FROM feeds WHERE text IS NOT NULL AND title IS NOT NULL AND description IS NOT NULL;')
    for url, text, title, description in c.fetchall():
        feeds[url] = {
            'text': text,
            'title': title,
            'description': description
        }
finally:
    # release the database handle even if the query fails
    conn.close()

print('Loaded info for ' + str(len(feeds)) + ' feeds')

In [None]:
# number of whitespace-separated tokens in the text of every feed
lengths = [len(feed['text'].split()) for feed in feeds.values()]

# summary statistics over all feeds
shortest = min(lengths)
mean_length = np.average(lengths)
longest = max(lengths)

print('Minimum length of text:', shortest)
print('Average length of text:', mean_length)
print('Maximum length of text:', longest)


## Vectorize Documents

### Custom Pre-processor

Define custom pre-processor for the vectorizers.

In [None]:
def custom_preprocessor(text):
    """Pre-process a document before it is vectorized.

    The text is lower-cased, runs of digits are removed, and every token
    returned by ``word_tokenize`` is replaced by its WordNet lemma.

    Args:
        text: the raw document text.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # first letter of the tagger's tag -> part-of-speech code given to lemmatize()
    tag_to_wordnet_pos = {'V': 'v', 'N': 'n', 'R': 'r', 'J': 'a'}

    # lower-case the text and drop the numbers
    cleaned = re.sub(r'\d+', '', text.lower())

    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in word_tokenize(cleaned):
        # every token is tagged on its own; only the first letter of the tag matters
        tag_initial = nltk.pos_tag([token])[0][1][0]

        # tags outside the table are treated as nouns
        wordnet_pos = tag_to_wordnet_pos.get(tag_initial, 'n')

        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    return ' '.join(lemmas)

### Binary Matrix

Generate a doc-word matrix with binary counts for words.

In [None]:
# one document per feed, in the iteration order of the feeds dict
corpus = [f['text'] for f in feeds.values()]

# generate a stop words list by joining multiple predefined lists
stopwords_list = list(set(sw_nltk) | set(sw_spacy) | set(STOPWORDS) | set(ENGLISH_STOP_WORDS))

# custom stop words (was given warning that these are not included in stop_words)
stopwords_list.extend(['doe', 'ha', 'le', 'need', 'sha', 'wa', 'wo'])

# binary=True records only whether a word occurs in a document, not how often
vectorizer = CountVectorizer(max_features=20000, preprocessor=custom_preprocessor, binary=True, stop_words=stopwords_list)
doc_word = vectorizer.fit_transform(corpus)

# get the words (column labels)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that do not provide it yet.
# list() turns the returned object array into plain strings so the array
# saved below has a string dtype and can be read back by np.load().
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# get the docs/feeds (keys of the feeds dict)
docs = list(feeds.keys())

print('Shape of doc-word matrix: ', str(doc_word.shape))
print('Number of documents (posts): ', str(len(docs)))
print('Number of words (features): ', str(len(words)))

# save the binary vectorized data
scipy.sparse.save_npz('binary_matrix', doc_word)
np.save('binary_words', np.asarray(words))

### TF-IDF Matrix

Generate a doc-word matrix with TF-IDF values for words.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# same pre-processing, vocabulary cap and stop words as the binary vectorizer,
# but the cell values are TF-IDF weights
tfIdfVectorizer = TfidfVectorizer(max_features=20000, preprocessor=custom_preprocessor, stop_words=stopwords_list)
doc_word_tfidf = tfIdfVectorizer.fit_transform(corpus)

# get the words (column labels)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that do not provide it yet.
# list() turns the returned object array into plain strings so the array
# saved below has a string dtype and can be read back by np.load().
if hasattr(tfIdfVectorizer, 'get_feature_names_out'):
    words_tfidf = list(tfIdfVectorizer.get_feature_names_out())
else:
    words_tfidf = tfIdfVectorizer.get_feature_names()

# save the tf-idf vectorized data
scipy.sparse.save_npz('tfidf_matrix', doc_word_tfidf)
np.save('tfidf_words', np.asarray(words_tfidf))

### Load Matrices

Load the vectorized data from files.

In [None]:
# document identifiers: the keys of the feeds dict
docs = list(feeds)

# binary doc-word matrix and its vocabulary
doc_word = scipy.sparse.load_npz('binary_matrix.npz')
words = np.load('binary_words.npy')

# tf-idf doc-word matrix and its vocabulary
doc_word_tfidf = scipy.sparse.load_npz('tfidf_matrix.npz')
words_tfidf = np.load('tfidf_words.npy')

In [None]:
# vocabulary entries that are at most three characters long
small_words = [word for word in words if len(word) <= 3]

print(len(small_words))

In [None]:
# display the vocabulary of the binary matrix
print(words)

## CorEx Topic Modelling

Use the CorEx library to infer topics from the bodies of text of the feeds.

### Train the Model

In [None]:
# train the CorEx topic model
# n_hidden=25 asks for 25 topics; seed=1 fixes the random seed
topic_model = ct.Corex(n_hidden=25, words=words, docs=docs, max_iter=200, verbose=False, seed=1)
# fit on the doc-word matrix, passing the column labels (words) and row labels (docs)
topic_model.fit(doc_word, words=words, docs=docs)

### Plot the Distribution of TCs For Each Topic

Use the plot to select an appropriate number of topics. Keep adding topics until additional ones do not significantly contribute to the overall TC.

In [None]:
print('Total Correlation of the model:', str(topic_model.tc))

# one bar per topic, its height being that topic's total correlation
topic_indices = range(topic_model.tcs.shape[0])

fig, ax = plt.subplots(figsize=(10,5))
ax.bar(topic_indices, topic_model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16)
plt.show()

### Display All Topics Generated

In [None]:
# request up to 10 words for every topic
topics = topic_model.get_topics(n_words=10)
for n, topic in enumerate(topics):
    # each entry is a tuple whose first item is the word; indexing it
    # (instead of unpacking zip(*topic) into three names) also works for a
    # topic with no words and does not depend on the tuple length
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

In [None]:
# NOTE: vt is imported here but not referenced in this cell
import corextopic.vis_topic as vt

# second CorEx model, fitted on the labels produced by the first model
# instead of on the doc-word matrix
tm_layer2 = ct.Corex(n_hidden=25, docs=docs, max_iter=200, verbose=False, seed=1)
tm_layer2.fit(topic_model.labels, docs=docs)



### Get the Topic and Top Feeds for a Given Word

In [None]:
# Get the index of the word in the vocabulary
# (list.index raises ValueError if the word is not in the vocabulary)
word_index = list(words).index('motherhood')

# Get the topic associated with that word
word_topic = topic_model.clusters[word_index]

# Get the top 15 documents for the topic, sorted by log probability
top_docs = topic_model.get_top_docs(topic=word_topic, n_docs=15, sort_by='log_prob')

print('Topic', word_topic)
for doc in top_docs:
    print(doc)

### Reduce Using PCA

In [None]:
# shape of the matrix of topic probabilities that is reduced with PCA below
print(topic_model.p_y_given_x.shape)

In [None]:
from sklearn.decomposition import PCA
import math

# reduce the topic probabilities to three components, one per colour channel
pca = PCA(n_components=3)
rgb = pca.fit_transform(topic_model.p_y_given_x)

# rescale every component independently so that it spans 0..255
shifted = rgb - np.min(rgb, axis=0)
rgb = shifted / np.max(shifted, axis=0) * 255

# one '#rrggbb' string per row; int() truncates each channel value
# NOTE: the name `hex` shadows the builtin of the same name
hex = [
    '#%02x%02x%02x' % (int(red), int(green), int(blue))
    for red, green, blue in rgb
]

### Assign a Topic to Each Feed

Assign each feed the topic to which it has the highest probability of belonging.

In [None]:
# Hard assignment: for every row of the doc-word matrix, the index of the
# topic with the largest probability, stored as a float array
n_feeds = doc_word.shape[0]
hard_labels = np.fromiter(
    (np.argmax(topic_model.p_y_given_x[row]) for row in range(n_feeds)),
    dtype=float,
    count=n_feeds,
)

## UMAP Plots

### Topic Probabilities

Create an interactive UMAP plot based on the probabilities that each feed belongs to a topic. 

In [None]:
# fit UMAP model with default parameters on the per-feed topic probabilities
mapper = umap.UMAP().fit(topic_model.p_y_given_x)

In [None]:
# hover tooltips of the interactive plot: title, description and assigned
# topic of every feed
feed_infos = list(feeds.values())
hover_data = pd.DataFrame({
    'title': [info['title'] for info in feed_infos],
    'description': [info['description'] for info in feed_infos],
    'label': hard_labels
})

# render in the notebook, colouring the points by their assigned topic
umap.plot.output_notebook()
p = umap.plot.interactive(mapper, labels=hard_labels, hover_data=hover_data, point_size=4)
umap.plot.show(p)

### TF-IDF

Use TF-IDF values without topic modelling in the UMAP plot.

In [None]:
# fit umap with tf-idf doc word matrix (without topic modelling),
# using UMAP's default parameters
tfidf_raw_mapper = umap.UMAP().fit(doc_word_tfidf)

In [None]:
# hover tooltips of the interactive plot: title and description of every feed
feed_infos = list(feeds.values())
hover_data = pd.DataFrame({
    'title': [info['title'] for info in feed_infos],
    'description': [info['description'] for info in feed_infos],
})

# feed titles; not passed to the plot below
labels = [info['title'] for info in feed_infos]

# render in the notebook without colouring by label
umap.plot.output_notebook()
p = umap.plot.interactive(tfidf_raw_mapper, hover_data=hover_data, point_size=4)
umap.plot.show(p)