In [160]:
import sys
import numpy as np
import pandas as pd
import seaborn as sbs
import json
sys.path.append("./lib")
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [161]:
with open("data/pinboard.json") as file:
    data = json.load(file)

In [162]:
tagged_data = [ entry for entry in data if len(entry["tags"]) > 0 and entry["tags"] != "pocket-read" and entry["tags"] != "pocket-unread"]

import itertools
import operator
tags = [ entry["tags"] for entry in tagged_data]
all_tags = list(itertools.chain.from_iterable(tag.split(" ") for tag in tags))
tag_counts = {}
for tag in all_tags:
    if tag not in tag_counts:
        tag_counts[tag] = 0
    tag_counts[tag] += 1
sorted_tag_counts = sorted(tag_counts.items(), key=operator.itemgetter(1), reverse=True)
sorted_tag_counts

[('programming', 1357),
 ('politics', 780),
 ('computerscience', 660),
 ('development', 616),
 ('culture', 416),
 ('functional', 394),
 ('web', 392),
 ('ruby', 345),
 ('mathematics', 322),
 ('economics', 306),
 ('design', 304),
 ('statistics', 281),
 ('society', 275),
 ('art', 269),
 ('technology', 268),
 ('scala', 256),
 ('machinelearning', 254),
 ('haskell', 253),
 ('data', 241),
 ('software', 235),
 ('music', 202),
 ('internet', 197),
 ('business', 182),
 ('architecture', 151),
 ('learning', 150),
 ('linux', 146),
 ('rails', 138),
 ('tutorial', 134),
 ('engineering', 126),
 ('database', 126),
 ('education', 125),
 ('socialism', 118),
 ('science', 117),
 ('history', 115),
 ('systems', 108),
 ('reference', 101),
 ('testing', 100),
 ('copyright', 98),
 ('javascript', 98),
 ('types', 96),
 ('capitalism', 93),
 ('theory', 91),
 ('criticism', 91),
 ('analysis', 91),
 ('pocket-read', 89),
 ('finance', 85),
 ('language', 81),
 ('sysadmin', 78),
 ('java', 74),
 ('distributed', 73),
 ('python

In [163]:
from sklearn.feature_extraction.text import CountVectorizer

In [164]:
vectorizer = CountVectorizer(stop_words=["pocket-read", "pocket-unread"])

In [165]:
vectorizer.fit(tags)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['pocket-read', 'pocket-unread'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [166]:
for entry in tagged_data:
    entry["vector"] = vectorizer.transform([entry["tags"]])

In [167]:
vectors = vectorizer.transform(tags)

In [168]:
vectors

<4210x5737 sparse matrix of type '<class 'numpy.int64'>'
	with 31906 stored elements in Compressed Sparse Row format>

In [169]:
from sklearn.manifold import TSNE

In [170]:
tsne = TSNE()

In [171]:
coords = tsne.fit_transform(vectors.toarray())
for (entry, coord) in zip(tagged_data, coords):
    entry["coords"] = coord

In [230]:
from sklearn.mixture import GaussianMixture
clustering = GaussianMixture(n_components=20)
vectorsA = vectors.toarray()
clustering.fit(vectorsA)
clusters = clustering.predict(vectorsA)
for (cluster, entry) in zip(clusters, tagged_data):
    entry["cluster"]=cluster

In [231]:
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
from bokeh.models import HoverTool, BoxZoomTool
from bokeh.palettes import Category20
palette = Category20[20]
def plot(data):
    source = ColumnDataSource(
            data=dict(
                x=[entry["coords"][0] for entry in data],
                y=[entry["coords"][1] for entry in data],
                title=[entry["description"] for entry in data],
                tags=[entry["tags"] for entry in data],
                fill_color=[(palette[entry["cluster"]] if "cluster" in entry else palette[0]) for entry in data]
            )
        )

    output_notebook()
    TOOLS = [BoxZoomTool(), HoverTool(tooltips=[("tags:", "@tags"), ("Title", "@title")])]
    p = figure(plot_width=800, plot_height=800, title=None, tools=TOOLS)
    p.circle("x", "y", size=10, source=source, fill_color="fill_color")
    show(p)

In [232]:
plot(tagged_data)

In [233]:
palette

['#1f77b4',
 '#aec7e8',
 '#ff7f0e',
 '#ffbb78',
 '#2ca02c',
 '#98df8a',
 '#d62728',
 '#ff9896',
 '#9467bd',
 '#c5b0d5',
 '#8c564b',
 '#c49c94',
 '#e377c2',
 '#f7b6d2',
 '#7f7f7f',
 '#c7c7c7',
 '#bcbd22',
 '#dbdb8d',
 '#17becf',
 '#9edae5']

In [234]:
clusters = {}
for entry in tagged_data:
    if entry["cluster"] not in clusters:
        clusters[entry["cluster"]] = { "tag_counts": {}, "entries":[]}
    cluster = clusters[entry["cluster"]]
    cluster["entries"].append(entry)
    for tag in entry["tags"].split(" "):
        if tag not in cluster["tag_counts"]:
            cluster["tag_counts"][tag] = 0
        cluster["tag_counts"][tag] += 1

In [235]:
clusters

{0: {'entries': [{'cluster': 0,
    'coords': array([ 9.56163001,  5.51656914]),
    'description': 'Green-tinted glasses - Weekly Worker',
    'extended': '',
    'hash': 'f977b262f96105325e6aa3c4c97abcca',
    'href': 'http://weeklyworker.co.uk/worker/1043/green-tinted-glasses/',
    'meta': '44c3ee555cbe912db4222c88e86cc5c3',
    'shared': 'yes',
    'tags': 'socialism politics greenparty greens uk election',
    'time': '2015-01-29T21:39:14Z',
    'toread': 'no',
    'vector': <1x5737 sparse matrix of type '<class 'numpy.int64'>'
    	with 6 stored elements in Compressed Sparse Row format>},
   {'cluster': 0,
    'coords': array([ 10.44176199,   5.59289268]),
    'description': 'Liberal Democrat Voice: All our parties have more in common than we’d like to admit',
    'extended': 'Some look at any deal between any two parties (and there are a lot of them across the UK) as selling out and betraying principles. I don’t think that’s right.\n\nThe reality is that different parties and p

In [236]:
clusters["tag_counts"]

KeyError: 'tag_counts'

In [245]:
cluster_summary = {}
for id in clusters:
    tag_counts = clusters[id]["tag_counts"]
    entry_count = len(clusters[id]["entries"])
    sorted_tag_counts = sorted(tag_counts.items(), key=operator.itemgetter(1), reverse=True)
    top_tags = sorted_tag_counts[:5]
    cluster_summary[id]= { "entries": entry_count, "top_tags": top_tags }

In [246]:
cluster_summary

{0: {'entries': 18,
  'top_tags': [('politics', 18),
   ('election', 16),
   ('parliament', 14),
   ('uk', 13),
   ('coalition', 8)]},
 1: {'entries': 852,
  'top_tags': [('mathematics', 130),
   ('music', 73),
   ('computerscience', 57),
   ('data', 53),
   ('learning', 50)]},
 2: {'entries': 40,
  'top_tags': [('testing', 39),
   ('programming', 27),
   ('development', 27),
   ('tdd', 24),
   ('bdd', 23)]},
 3: {'entries': 189,
  'top_tags': [('computerscience', 189),
   ('programming', 181),
   ('functional', 172),
   ('haskell', 129),
   ('types', 34)]},
 4: {'entries': 144,
  'top_tags': [('technology', 141),
   ('art', 72),
   ('culture', 60),
   ('music', 23),
   ('internet', 16)]},
 5: {'entries': 350,
  'top_tags': [('development', 345),
   ('programming', 282),
   ('web', 180),
   ('ruby', 155),
   ('rails', 78)]},
 6: {'entries': 132,
  'top_tags': [('programming', 130),
   ('functional', 124),
   ('haskell', 102),
   ('development', 23),
   ('ghc', 16)]},
 7: {'entries': 13

In [247]:
cluster_summary_sorted = sorted(cluster_summary.items(), key=lambda x: x[1]["entries"], reverse=True)

In [248]:
cluster_summary_sorted

[(1,
  {'entries': 852,
   'top_tags': [('mathematics', 130),
    ('music', 73),
    ('computerscience', 57),
    ('data', 53),
    ('learning', 50)]}),
 (8,
  {'entries': 498,
   'top_tags': [('politics', 498),
    ('society', 138),
    ('culture', 123),
    ('socialism', 85),
    ('capitalism', 56)]}),
 (15,
  {'entries': 364,
   'top_tags': [('programming', 363),
    ('computerscience', 168),
    ('ruby', 69),
    ('mathematics', 53),
    ('database', 41)]}),
 (5,
  {'entries': 350,
   'top_tags': [('development', 345),
    ('programming', 282),
    ('web', 180),
    ('ruby', 155),
    ('rails', 78)]}),
 (19,
  {'entries': 273,
   'top_tags': [('economics', 273),
    ('politics', 208),
    ('finance', 45),
    ('business', 44),
    ('society', 36)]}),
 (9,
  {'entries': 224,
   'top_tags': [('machinelearning', 221),
    ('computerscience', 124),
    ('statistics', 84),
    ('programming', 59),
    ('data', 37)]}),
 (17,
  {'entries': 205,
   'top_tags': [('scala', 203),
    ('progra

In [249]:
tagged_data

[{'cluster': 15,
  'coords': array([ 6.78567105, -2.3708237 ]),
  'description': 'Metaphors We Compute By',
  'extended': 'Charles Baker in a letter to Donald Knuth said that to program is to write to another programmer about our solution to a problem.',
  'hash': '81693f6118387b539115aa92651ca5fe',
  'href': 'http://alvaro-videla.com/2017/01/metaphors-we-code-by.html',
  'meta': '0bcb11aebeebd387944a3bc7f1f5b8a3',
  'shared': 'yes',
  'tags': 'programming metaphor storytelling language computerscience culture knowledge communication',
  'time': '2017-02-11T17:16:04Z',
  'toread': 'no',
  'vector': <1x5737 sparse matrix of type '<class 'numpy.int64'>'
  	with 8 stored elements in Compressed Sparse Row format>},
 {'cluster': 8,
  'coords': array([ 7.75531133,  2.43581868]),
  'description': 'Selfwork — Real Life',
  'extended': 'Because artists’ work is not always seen as work, they are accustomed to exposure to potentially exploitative labor conditions and practices. They often know by

In [250]:
outputable = []
for entry in tagged_data:
    outputable.append({
        'cluster': entry["cluster"].item(),
        'description': entry["description"],
        'extended': entry["extended"],
        'href': entry['href'],
        'time': entry['time'],
        'tags': entry["tags"].split(" "),
        "x": entry["coords"][0].item(),
        "y": entry["coords"][1].item(),
    })

In [251]:
with open("data/clustered.json", "w") as file:
    json.dump(outputable, file)

In [252]:
ype(outputable[0]['cluster'].astype(int))

NameError: name 'ype' is not defined

In [253]:
cluster_summary

{0: {'entries': 18,
  'top_tags': [('politics', 18),
   ('election', 16),
   ('parliament', 14),
   ('uk', 13),
   ('coalition', 8)]},
 1: {'entries': 852,
  'top_tags': [('mathematics', 130),
   ('music', 73),
   ('computerscience', 57),
   ('data', 53),
   ('learning', 50)]},
 2: {'entries': 40,
  'top_tags': [('testing', 39),
   ('programming', 27),
   ('development', 27),
   ('tdd', 24),
   ('bdd', 23)]},
 3: {'entries': 189,
  'top_tags': [('computerscience', 189),
   ('programming', 181),
   ('functional', 172),
   ('haskell', 129),
   ('types', 34)]},
 4: {'entries': 144,
  'top_tags': [('technology', 141),
   ('art', 72),
   ('culture', 60),
   ('music', 23),
   ('internet', 16)]},
 5: {'entries': 350,
  'top_tags': [('development', 345),
   ('programming', 282),
   ('web', 180),
   ('ruby', 155),
   ('rails', 78)]},
 6: {'entries': 132,
  'top_tags': [('programming', 130),
   ('functional', 124),
   ('haskell', 102),
   ('development', 23),
   ('ghc', 16)]},
 7: {'entries': 13

In [254]:
with open("data/clusters.json", "w") as file:
    json.dump([cluster_summary[key] for key in cluster_summary], file)