In [1]:
import pickle
import pandas as pd
import spacy

# Load the spaCy English pipeline once, at notebook start; get_tokens() below
# reuses this module-level object to lemmatize every tweet.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases appear to
# require the full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed spaCy version before changing it.
nlp = spacy.load('en')

import re
import pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer 
import numpy as np
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
# Stop words removed by the vectorizer: scikit-learn's English list plus a few
# extra fragments (contraction pieces and "pron").
# NOTE(review): "pron" presumably targets spaCy's pronoun lemma placeholder
# after the vectorizer lowercases/strips it -- confirm.
STOPLIST = set(ENGLISH_STOP_WORDS).union(
    ["n't", "'s", "'m", "ca", "'", "'re", "pron"]
)

In [2]:
# Load the pickled object from disk into `df`; later cells use it as a
# DataFrame with a `text` column.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only run this against a pickle produced by a trusted source.
with open('../data/clean_tweets.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)

In [3]:
# Work on a random subset of at most 100,000 tweets to keep lemmatization and
# LDA fitting tractable. The sample size is capped at len(df) so a smaller
# dataset does not raise, and random_state is fixed so that Restart & Run All
# reproduces the same subset (and therefore the same topics).
text = df.sample(n=min(100000, len(df)), random_state=0).text.values

In [4]:
def get_tokens(text, pipeline=None):
    """Return the lemmatized tokens of ``text`` joined into one string.

    Processing steps:
      1. decode ``bytes`` input as UTF-8 (undecodable bytes are replaced; they
         would be stripped as non-letters in step 3 anyway),
      2. remove URLs (anything starting with ``http`` up to the next space),
      3. replace every character that is not an ASCII letter with a space,
      4. lowercase and collapse runs of whitespace,
      5. lemmatize with ``pipeline`` and drop empty / whitespace-only lemmas.

    Parameters
    ----------
    text : str or bytes
        Raw tweet text.
    pipeline : callable, optional
        Callable that takes a string and returns an iterable of objects with a
        ``lemma_`` attribute. Defaults to the module-level ``nlp`` pipeline.

    Returns
    -------
    str
        Space-separated lemmas (an empty string if nothing survives cleaning).
    """
    # Decode up front instead of retrying inside a bare ``except``: the old
    # fallback called ``str.decode``, which does not exist on Python 3 and
    # therefore replaced any real pipeline error with an AttributeError.
    if isinstance(text, bytes):
        text = text.decode('utf8', 'replace')

    text = re.sub(r"http\S+", "", text)
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = ' '.join(letters_only.lower().split())

    if pipeline is None:
        pipeline = nlp

    lemmas = (token.lemma_ for token in pipeline(words))
    # strip() is falsy for '', ' ', '\n', '\n\n' and any other blank lemma.
    return ' '.join(lemma for lemma in lemmas if lemma.strip())

In [5]:
# Clean and lemmatize every sampled tweet; each entry is one space-joined
# string of lemmas, in the same order as `text`.
tokens = list(map(get_tokens, text))

In [6]:
def _row_normalize(data):
    """Return ``data`` as a DataFrame in which every row sums to 1."""
    return pd.DataFrame(data).div(data.sum(axis = 1), axis = 0)


def prep_pylda(docs, n_components = 5):
    """Fit an LDA topic model on ``docs`` and build the pyLDAvis payload.

    Parameters
    ----------
    docs : iterable of str
        Pre-tokenized documents, each a single whitespace-separated string
        (the format produced by ``get_tokens``).
    n_components : int, default 5
        Number of LDA topics to fit.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``, ready to be passed to
    ``pyLDAvis.display``.

    Notes
    -----
    The document-term matrix is TF-IDF weighted (1- to 3-grams, vocabulary
    capped at 10,000 terms), so ``term_frequency`` below is a sum of TF-IDF
    weights rather than raw counts.
    """
    # Materialize once: docs is traversed twice (vectorizer + doc lengths),
    # which would silently yield no lengths for a one-shot iterator.
    docs = list(docs)

    # stop_words is passed as a list because recent scikit-learn releases
    # validate the parameter type and reject a set.
    vect = TfidfVectorizer(max_df = 0.5, max_features = 10000,
                           min_df = 5, stop_words = sorted(STOPLIST),
                           use_idf = True, tokenizer = None, ngram_range=(1, 3))
    matrix = vect.fit_transform(docs)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn; prefer the new API and fall back for old installations.
    if hasattr(vect, 'get_feature_names_out'):
        vocab = list(vect.get_feature_names_out())
    else:
        vocab = vect.get_feature_names()

    # fit transform lda
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                learning_method='online', learning_offset=50.,
                random_state=0, doc_topic_prior = .001)
    doc_topic_dists = lda.fit_transform(matrix)

    # prepare pyLDAvis stuff
    # Document length = number of tokens. The previous len(doc) measured the
    # number of characters in the string, inflating every document's weight.
    doc_lengths = [len(doc.split()) for doc in docs]
    prepared = pyLDAvis.prepare(
            doc_lengths = doc_lengths,
            vocab = vocab,
            term_frequency = np.asarray(matrix.sum(axis = 0)).ravel().tolist(),
            topic_term_dists = _row_normalize(lda.components_),
            doc_topic_dists = _row_normalize(doc_topic_dists))

    return prepared

In [9]:
# Fit a 20-topic model on the lemmatized tweets and build the pyLDAvis data.
prepared = prep_pylda(docs=tokens, n_components=20)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [10]:
# Render the interactive pyLDAvis topic view inline in the notebook.
# This must stay the last bare expression of the cell so its value is shown.
pyLDAvis.display(prepared)