In [1]:
import pandas as pd
import s3fs
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from bokeh.io import output_notebook, show
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, BoxZoomTool, ResetTool
from bokeh.models.graphs import from_networkx
from bokeh.palettes import Spectral4
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
%matplotlib inline

In [3]:
output_notebook()

In [4]:
SUBS = {
    "politics"
}

In [5]:
fs = s3fs.S3FileSystem()

path = fs.glob('s3://zigdata.org/reddit-headlines/*')[-1]
titles = pd.read_parquet(f"s3://{path}")
titles['created'] = pd.to_datetime(titles.created_epoch, unit='s')
titles['extracted'] = pd.to_datetime(titles.extracted_epoch, unit='s')

In [6]:
titles['words'] = titles.title.str.split(' ').apply(len)
titles.groupby('subreddit').words.mean().sort_values(ascending=True).head()

subreddit
me_irl        1.011236
nsfw          3.068966
creepy        4.086957
Unexpected    4.492323
NSFW_GIF      4.505820
Name: words, dtype: float64

In [7]:
filtered = titles.loc[titles.subreddit.isin(SUBS)]

In [8]:
vectorizer = CountVectorizer(stop_words="english", max_features=100)
X = vectorizer.fit_transform(filtered.title)
Xc = (X.T * X)
Xc.setdiag(0)
matrix_dense = Xc.todense()
G = nx.from_numpy_matrix(matrix_dense)

In [9]:
words = pd.DataFrame(vectorizer.get_feature_names(), columns=['word'])
words['titles'] = np.squeeze(np.array(X.sum(axis=0)).ravel())
words['size'] = (5 + words.titles / words.titles.quantile(0.2)).clip(5, 20)
nx.set_node_attributes(G, words.word,  "word")
nx.set_node_attributes(G, words.titles, "titles")
nx.set_node_attributes(G, words['size'],  "size")

In [10]:
plot = Plot(plot_width=800, plot_height=800,
            x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
node_hover_tool = HoverTool(tooltips=[("index", "@index"), ("word", "@word"), ("titles", "@titles"), ("size", "@size")])
plot.add_tools(node_hover_tool, BoxZoomTool(), ResetTool())

graph_renderer = from_networkx(G, nx.spring_layout, scale=1, center=(0, 0))

graph_renderer.node_renderer.glyph = Circle(size="size", fill_color=Spectral4[0])
graph_renderer.edge_renderer.glyph = MultiLine(line_color="black", line_alpha=0.8, line_width=0.25)

plot.renderers.append(graph_renderer)

In [11]:
def build_documents(corpus, stop_words="english", max_words=1000, tfidf=False):
    if tfidf:
        vectorizer = TfidfVectorizer
    else:
        vectorizer = CountVectorizer
    vectorizer = vectorizer(stop_words=stop_words, max_features=max_words)
    docs = vectorizer.fit_transform(corpus)
    return {'vectorizer': vectorizer, 'docs': docs, 'features': vectorizer.get_feature_names()}

def count_cooccurrences(docs):
    coocs = docs.T * docs
    coocs.setdiag(0)
    return coocs

def token_counts(docs):
    features = docs.get('features')
    docs = docs.get('docs')
    return pd.DataFrame({'token': features, 'count': np.squeeze(np.asarray(docs.sum(axis=0)))})

def _index_to_records(co, idx):
    words = co.indices[co.indptr[idx]: co.indptr[idx + 1]]
    scores = co.data[co.indptr[idx]: co.indptr[idx + 1]]
    return words, scores

def sparse_cooccurrence_to_df(sp_cooc, features):
    lookup = dict(zip(range(len(features)), features))
    rows = {'token': [], 'coword': [], 'score': []}

    for index, word in enumerate(features):
        words, scores = _index_to_records(sp_cooc, index)
        rows['token'].extend([word] * len(words))
        rows['coword'].extend(words)
        rows['score'].extend(scores)
    df = pd.DataFrame(rows)
    df['coword'] = df['coword'].map(lookup)
    # to speed up lookup/filtering times
    df['token'] = df.token.astype('category')
    df['coword'] = df.coword.astype('category')
    return df

def vectorized_contingency(df, total, count_map):
    df['yb'] = df.coword.map(count_map)
    df['ya'] = df.token.map(count_map)
    df['nb'] = total - df.yb
    df['na'] = total - df.ya
    df['yn'] = df.ya - df.score
    df['ny'] = df.yb - df.score
    df['nn'] = total - df.ny - df.yn - df.score
    return df


def chisq(row):
    contingency = np.array([[row.score, row.yn], [row.ny, row.nn]])
    chi2, p, dof, ex = stats.chi2_contingency(contingency)
    return chi2, p


def add_significance_columns(df, total, count_lookup):
    df = vectorized_contingency(df, total, count_lookup)
    df = df.loc[(df._get_numeric_data() > 0).all(1)].reset_index(drop=True)
    _df = df.apply(lambda row: chisq(row), axis=1).tolist()
    _df = pd.DataFrame(_df, index=df.index, columns=['chi2', 'p'])
    df['chi2'] = _df['chi2']
    df['p'] = _df['p']
    return df

In [24]:
docs = build_documents(filtered.title)
counts = token_counts(docs)
sp_coocs = count_cooccurrences(docs['docs'])
coocs = sparse_cooccurrence_to_df(sp_coocs, docs['features'])

In [35]:
MIN_SIZE = 10
MAX_NORMALIZED_BOOST = 10
NCLUS = 8
layout_cache = {}

def update_clusters(graph, nclus):
    # now cluster
    if nclus is not None:
        graph = largest_connected(graph)
        mappings, cluster_sizes = network_to_cluster(graph, nclus)
    else:
        mappings = get_color(graph)
        cluster_sizes = pd.DataFrame(columns=['cluster', 'cluster_size'])

    for key, mapping in mappings.items():
        nx.set_node_attributes(graph, mapping, key)
    return graph, cluster_sizes

def build_graph_from_allowed(all_nodes, edges, allowed_nodes, nclus):
    G = nx.Graph()

    for index, row in all_nodes.iterrows():
        if row.token in allowed_nodes:
            G.add_node(row.token, count=row['count'], size=row['size'])

    for index, row in edges.iterrows():
        G.add_edge(row.token, row.coword, weight=row.score)
    subgraph, cluster_sizes = update_clusters(G, nclus)
    return G, subgraph, cluster_sizes

def filter_below_min(coocs, _min=0):
    if _min > 0:
        threshold = coocs.score.quantile(_min)
        mask = coocs.score >= threshold
        return coocs.loc[mask].reset_index(drop=True)
    else:
        return coocs
    
def build(counts, coocs):
    counts['size'] = MIN_SIZE + (MAX_NORMALIZED_BOOST * counts['count'] / counts['count'].quantile(0.90))
    counts['size'] = counts['size'].clip(MIN_SIZE, MAX_NORMALIZED_BOOST + MIN_SIZE)
    count_lookup = counts.set_index('token')['count'].to_dict()
    coocs = filter_below_min(coocs, _min=0.9)
    coocs = add_significance_columns(coocs, len(docs), count_lookup)
    filtered_data = filter_insignificant(coocs, counts)
    valid_nodes = get_valid_nodes(filtered_data)
    nodes = counts.loc[counts.token.isin(valid_nodes)].reset_index(drop=True)
    raw_graph, base_subgraph, cluster_sizes = build_graph_from_allowed(nodes, filtered_data, valid_nodes,
                                                                       nclus=NCLUS)
    base_layout = compute_layout(base_subgraph, layout_type='spring')
    layout_cache[0] = base_layout
    return raw_graph, base_subgraph, base_layout, cluster_sizes

In [36]:
raw_graph, base_subgraph, base_layout, cluster_sizes = build(counts, coocs)

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [19]:
show(plot)