In [None]:
# %matplotlib inline
import matplotlib
import seaborn as sns
import onlineldavb
# Double the DPI used when saving figures. The new value is derived from the
# current setting, so re-running this cell doubles it again.
# NOTE(review): assumes 'savefig.dpi' currently holds a number; if it holds a
# string (e.g. 'figure') the multiplication would repeat the string instead —
# confirm against the installed matplotlib version.
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']

import simplejson
import sys
import requests
from requests_oauthlib import OAuth1
from collections import Counter
import heapq
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from itertools import islice, chain
import numpy as np
import scipy as sp
from bokeh import charts, plotting
import pandas as pd
import matplotlib.pylab as plt

def batch(iterable, size):
    """Split ``iterable`` into consecutive lazy batches of at most ``size`` items.

    Each yielded batch is itself an iterator that draws from the same
    underlying source, so a batch must be fully consumed before the next one
    is requested; otherwise the next batch starts wherever the source
    currently is.

    Args:
        iterable: any iterable (may be an endless generator).
        size: maximum number of items per batch. A size of 0 yields no batches.

    Yields:
        An iterator over the next ``size`` items; the final batch may be
        shorter. Nothing is yielded for an exhausted/empty source.
    """
    sourceiter = iter(iterable)
    while True:
        batchiter = islice(sourceiter, size)
        # Pull the first item eagerly to detect exhaustion. The builtin next()
        # works on both Python 2 and 3, and the explicit return avoids leaking
        # StopIteration out of the generator (a RuntimeError under PEP 479).
        try:
            first = next(batchiter)
        except StopIteration:
            return
        yield chain([first], batchiter)
        
def nlargest(n, word_scores):
    """Return the ``n`` pairs from ``word_scores`` with the highest second element.

    Args:
        n: how many pairs to return.
        word_scores: iterable of indexable pairs; element ``[1]`` is the
            value the pairs are ranked by.

    Returns:
        A list of at most ``n`` pairs, highest-ranked first.
    """
    def score_of(pair):
        return pair[1]

    return heapq.nlargest(n, word_scores, key=score_of)

# Configure bokeh's plotting module to send its output to the notebook.
plotting.output_notebook()

In [None]:
# Read the Twitter API credentials from a local JSON file.
with open("twitter_secrets.json.nogit") as fh:
    secrets = simplejson.load(fh)

# Build the OAuth1 signer used for every request; the credentials are passed
# positionally in exactly this order.
auth = OAuth1(*(
    secrets[name]
    for name in ("api_key", "api_secret", "access_token", "access_token_secret")
))

def tweet_generator():
    """Yield the text of tweets from Twitter's streaming ``statuses/filter`` endpoint.

    Opens a streaming POST request (signed with the module-level ``auth``)
    using a fixed ``locations`` filter, then parses each non-empty line as
    JSON and yields its ``text`` field. Messages without a ``text`` key and
    blank keep-alive lines are skipped.

    Yields:
        The ``text`` value of each streamed message that has one.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status
            (e.g. rejected credentials), instead of trying to parse the
            error body as if it were the stream.
    """
    stream = requests.post('https://stream.twitter.com/1.1/statuses/filter.json',
                         auth=auth,
                         stream=True,
                         data={"locations" : "-125.00,24.94,-66.93,49.59"})
    try:
        # Fail loudly on 4xx/5xx rather than iterating over an error payload.
        stream.raise_for_status()
        for line in stream.iter_lines():
            # filter out keep-alive new lines
            if not line:
                continue
            tweet = simplejson.loads(line)
            if 'text' in tweet:
                yield tweet['text']
    finally:
        # Release the connection when the generator finishes, errors out, or
        # is closed/garbage-collected by the consumer.
        stream.close()

In [None]:
# English stop words, as a set for fast membership tests.
stop = set(stopwords.words('english'))

# Vocabulary: one entry per line of the dictionary file, surrounding
# whitespace stripped.
with open("dictnostops.txt") as fh:
    words = [line.strip() for line in fh]

# Map each vocabulary word to its position in ``words``; if a word occurs more
# than once, the last position wins.
word_to_index = dict(zip(words, range(len(words))))

In [None]:
DISPLAY_EVERY = 20

# Running word counts over the tweet stream, ignoring stop words.
tweets = 0
counter = Counter()
for tweet in tweet_generator():
    counter.update(
        token for token in tweet.lower().split() if token not in stop
    )
    tweets += 1
    # Refresh the display whenever the tweet count is one short of a multiple
    # of DISPLAY_EVERY; "\r" rewrites the same output line.
    if (tweets + 1) % DISPLAY_EVERY == 0:
        top_words = counter.most_common(10)
        sys.stdout.write("\r" + str(top_words))

In [None]:
BATCH_SIZE = 20
CLUSTER_SIZE = 2

# Incrementally fitted k-means model; updated once per batch of tweets.
cluster = MiniBatchKMeans(
    n_clusters=CLUSTER_SIZE,
)

for tweets in batch(tweet_generator(), BATCH_SIZE):
    # Materialise the batch so the matrix gets exactly one row per tweet.
    # Sizing it by BATCH_SIZE would feed all-zero rows to the model as if
    # they were real tweets whenever a batch comes up short.
    tweets = list(tweets)
    # Binary bag-of-words matrix: rows are tweets, columns are vocabulary words.
    mat = sp.sparse.dok_matrix((len(tweets), len(words)))
    for row, tweet in enumerate(tweets):
        for word in tweet.lower().split():
            if word in word_to_index:
                mat[row, word_to_index[word]] = 1.
    cluster.partial_fit(mat.tocsr())
    # For every cluster centre, the 5 vocabulary words with the largest
    # centre coordinate. Iterating the centre rows directly avoids the
    # Python-2-only xrange.
    result = [
        nlargest(5, zip(words, center))
        for center in cluster.cluster_centers_
    ]
    sys.stdout.write("\r" + str(result))

In [None]:
K = 2
D = 1e9
BATCH_SIZE = 20

# NOTE(review): the arguments are passed positionally; their meaning is
# defined by onlineldavb.OnlineLDA, which is not visible here — confirm
# against that module before changing any of them.
prior = 1. / K
olda = onlineldavb.OnlineLDA(words, K, D, prior, prior, 1024., 0.7)

# Feed the tweet stream to the model one batch at a time and show the model's
# topic_words(5) result after each update; "\r" rewrites the same output line.
for tweets in batch(tweet_generator(), BATCH_SIZE):
    docs = list(tweets)
    olda.update_lambda(docs)
    top_words = olda.topic_words(5)
    sys.stdout.write("\r" + str(top_words))