# Building a graph from generated topics.

The following generates JSON data for building interactive topic graphs in JavaScript & HTML. The bulk of the code written here has been condensed into the file topic_graph for easier use.

In [81]:
import json
import time

import numpy as np
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Load the tweet collection. na_filter=False turns off pandas' missing-value
# detection, so empty fields are read as "" rather than NaN.
df = pd.read_csv("../data/twitter/tweets_large_train", na_filter=False)

## Generating topics

We will use data from a previous collection. Scikit-learn's Latent Dirichlet Allocation (LDA) will be used to generate the topics.

In [82]:
n_features = 5000  # upper bound on the vocabulary size kept by the vectorizer
n_topics = 5       # number of LDA components (topics) to fit
n_samples = 40     # top-weighted terms kept per topic when the graph is built


# Raw term counts; max_df / min_df drop terms that are too common or too rare.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10,
                                max_features=n_features,
                                stop_words='english')

tf_features = tf_vectorizer.fit_transform(df.text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back for older
# releases; .tolist() keeps the result a plain list of str, as before.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

print("Fitting LDA models with tf features, "
      "\nnumber of topics=%d\nn_samples=%d and n_features=%d..."
      % (n_topics, n_samples, n_features))

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
# this will take a while. reduce max_iter to reduce this.
t0 = time.time()
lda.fit(tf_features)
print("time: {:.2f} secs".format(time.time() - t0))

Fitting LDA models with tf features, 
number of topics=5
n_samples=40 and n_features=5000...
time: 686.82 secs


Here are the topics and their scores for reference.

In [84]:
# Print the 25 highest-weighted terms of every topic together with their scores.
for topic_idx, topic in enumerate(lda.components_):
    print("#### topic:", topic_idx)
    # argsort() is ascending, so this reversed slice walks the last 25
    # indices from the largest weight downwards.
    for i in topic.argsort()[:-25 - 1:-1]:
        print("{} {:.2f}".format(
            tf_feature_names[i], topic[i]), end=", ")
    print()

#### topic: 0
citizenship 10840.53, vote 4121.55, illeg 3711.21, immigr 3312.97, trump 2014.66, legal 1855.68, american 1809.00, daca 1588.47, path 1477.78, want 1362.27, democrat 1316.33, citizen 1309.40, ask 1232.94, voter 1189.20, deport 1163.20, million 1162.46, tax 1154.44, dreamer 1141.60, obama 1023.22, presid 975.45, parti 967.29, appli 963.35, russian 903.45, card 902.72, dem 898.76, 
#### topic: 1
citizen 5721.96, state 3449.32, pleas 1948.39, trump 1720.42, today 1688.45, new 1605.87, privat 1576.76, offic 1446.70, general 1439.44, report 1346.30, presid 1311.44, unit 1292.26, attorney 1205.19, lawsuit 1202.77, major 1175.97, target 886.75, repres 840.77, polic 782.24, pull 755.54, join 736.55, district 733.31, press 723.57, associ 702.04, order 694.20, polici 653.45, 
#### topic: 2
citizen 20187.92, gun 8951.24, right 7337.69, everi 4178.07, need 3909.13, weapon 3237.75, peopl 3102.05, protect 2908.98, arm 2716.46, govern 2522.37, american 2498.63, nra 2206.42, elect 2122.0

## How to format the data?

I want this graph to be interactive in a browser, so I will export it to JSON for use with `d3.js`. Looking at other people's examples of graphs made with d3.js, a common structure is to use two arrays: one containing the nodes and the other containing the links.

It is going to be a bipartite graph: topic nodes are connected to each other only through the term nodes they share. This can be visualised as follows, where `t` is a topic node and `O` is a term node:
            
         O   O   O
         |    \ /
    O -- t     t - O
          \   /     \  
           \ /       \
            O ------- t --- O
                     / |
                    /  |
                   O   O


Here is a pseudo-struct representing each node object. The `root` property designates whether it is a topic node (`True`) or a term node (`False`).

    Node {
        name:  String
        root:  Bool
    }
    
Here is a pseudo-struct representing each link object.

    Link {
        source:     Int    
        target:     Int
        weight:     Float
        group:      Int
    }

In [165]:
def build_graph(model, names, n_samples):
    """Build the node and link lists of the topic/term graph.

    Every row of ``model.components_`` becomes a root node named after the
    matching character of the module-level ``letters`` string and is linked
    to its ``n_samples`` highest-weighted terms. A term gets a node only the
    first time it is seen, so a term shared by several topics has a single
    node with one link per topic.

    Parameters
    ----------
    model : object exposing ``components_``, iterated as one weight row
        per topic.
    names : sequence mapping a column index of a weight row to its term.
    n_samples : int
        Number of top-weighted terms linked to each topic.

    Returns
    -------
    tuple
        ``(nodes, links)``: a list of ``Node`` dicts and a list of ``Link``
        dicts whose weights have been passed through ``norm_scale``.

    Raises
    ------
    IndexError
        If there are more topics than characters in ``letters``.
    """
    links, nodes = [], []
    index = {}   # node name -> position of that node in `nodes`
    index_n = 0  # keep track of the node indexes.
    for i, topic in enumerate(model.components_):
        topic_root = letters[i]
        nodes.append(Node(topic_root, True))
        index[topic_root] = index_n
        index_n += 1
        # Reversed slice of the ascending argsort: top n_samples terms,
        # highest weight first.
        for j in topic.argsort()[:-n_samples - 1:-1]:
            term = names[j]
            if term not in index:
                nodes.append(Node(term, False))
                # Was `max_n`, a name that is never defined (NameError on a
                # fresh kernel); the running node counter is what is meant.
                index[term] = index_n
                index_n += 1
            link = Link(index[topic_root], index[term], topic[j], i)
            links.append(link)
    return (nodes, norm_scale(links))
        
def Node(name, root):
    """Pseudo-struct constructor: return a graph node as a plain dict.

    The dict carries the node's ``name`` and its ``root`` flag.
    """
    node = {"name": name, "root": root}
    return node

def Link(source, target, weight, group):
    """Pseudo-struct constructor: return a graph link as a plain dict.

    The dict stores the ``source`` and ``target`` values together with the
    link's ``weight`` and ``group``.
    """
    link = {
        "source": source,
        "target": target,
        "weight": weight,
        "group": group,
    }
    return link

def norm_scale(x):
    """Log-scale the link weights, then min-max normalise them into [0, 1].

    The ``"weight"`` entry of every dict in ``x`` is overwritten in place and
    the same list is returned.

    Parameters
    ----------
    x : list of dict
        Link dicts, each with a numeric ``"weight"`` entry.
        NOTE(review): the log assumes strictly positive weights - confirm
        for inputs other than LDA component weights.

    Returns
    -------
    list of dict
        ``x`` itself, with rescaled weights. An empty list is returned
        unchanged; when all weights are equal every weight becomes 0.0
        instead of NaN.
    """
    if len(x) == 0:
        # Nothing to scale; min()/max() of an empty array would raise.
        return x
    scaled = np.array([v['weight'] for v in x], dtype=float)
    # Sub-linear scaling. The constant offset cancels out in the min-max
    # step below; it is kept to mirror the usual 1 + log(tf) form.
    scaled = 1 + np.log(scaled)
    low = scaled.min()
    span = scaled.max() - low
    if span == 0:
        # All weights identical: avoid 0/0 -> NaN, which json.dump would
        # write as the invalid JSON token NaN.
        scaled = np.zeros_like(scaled)
    else:
        scaled = (scaled - low) / span
    for v, w in zip(x, scaled):
        v["weight"] = float(w)
    return x

# Names for the topic root nodes: build_graph uses letters[i] for topic i.
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [166]:
# Build the graph from the fitted model and the vectorizer's vocabulary.
nodes, links = build_graph(lda, tf_feature_names, n_samples)

In [160]:
# Peek at ten of the generated nodes.
for node in nodes[20:30]:
    print(node)

{'root': False, 'name': 'presid'}
{'root': False, 'name': 'parti'}
{'root': False, 'name': 'appli'}
{'root': False, 'name': 'russian'}
{'root': False, 'name': 'card'}
{'root': False, 'name': 'dem'}
{'root': False, 'name': 'question'}
{'root': False, 'name': 'resid'}
{'root': False, 'name': 'year'}
{'root': False, 'name': 'provid'}


In [161]:
# Peek at ten of the generated links.
for link in links[20:30]:
    print(link)

{'weight': 0.18311055383608046, 'source': 0, 'target': 21}
{'weight': 0.18201426987995756, 'source': 0, 'target': 22}
{'weight': 0.16475428002390025, 'source': 0, 'target': 23}
{'weight': 0.16453694885339476, 'source': 0, 'target': 24}
{'weight': 0.16335475354630319, 'source': 0, 'target': 25}
{'weight': 0.16191834618213768, 'source': 0, 'target': 26}
{'weight': 0.15762542530571419, 'source': 0, 'target': 27}
{'weight': 0.15623092820349119, 'source': 0, 'target': 28}
{'weight': 0.15251215849975455, 'source': 0, 'target': 29}
{'weight': 0.13812132903575133, 'source': 0, 'target': 30}


In [162]:
# Size of the graph: number of links first, then number of nodes.
m, n = len(links), len(nodes)

print(m, n)

200 159


In [167]:
# Write the graph as {"nodes": [...], "links": [...]}.
# `json` is imported in the notebook's top import cell.
out = "../data/word_graph.json"

with open(out, "w") as f:
    json.dump({"nodes": nodes,  "links": links},
              f, indent=4, sort_keys=True)
print("saved result to", out)

saved result to ../data/word_graph.json
