In [1]:
!pip install nltk
!pip install gensim
!pip install pyLDAvis

Collecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/a5/3a/af82e070a8a96e13217c8f362f9a73e82d61ac8fff3a2561946a97f96266/pyLDAvis-2.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.7MB/s 
Collecting funcy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/6ffa76544e46614123de31574ad95758c421aae391a1764921b8a81e1eae/funcy-1.14.tar.gz (548kB)
[K     |████████████████████████████████| 552kB 22.2MB/s 
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97711 sha256=7276290d94f9a650a56837b73aef59c14225105f7b8c0354abedb3c67b8711ae
  Stored in directory: /root/.cache/pip/wheels/98/71/24/513a99e58bb6b8465bae4d2d5e9dba8f0bef8179e3051ac414
  Building wheel for funcy (setup.py) ... [?25l[?25hdone
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32042 sha256=5a6a3430

In [0]:
# Sample headlines returned by searching "bitcoin" on the New York Times site.
dataset = [
    'The Coder and the Dictator',
    'Bitcoin Has Lost Steam. But Criminals Still Love It.',
    'China’s Cryptocurrency Plan Has a Powerful Partner: Big Brother',
    'China Gives Digital Currencies a Reprieve as Beijing Warms to Blockchain',
    'Bitcoin is a protocol. Bitcoin is a brand.',
]

In [3]:

# Third-party NLP dependencies for tokenizing, lemmatizing and topic modelling.
import spacy
# The loaded pipeline is not bound to a name; tokenization below goes through `parser` instead.
# NOTE(review): the 'en' shortcut name may not resolve on newer spaCy releases — confirm.
spacy.load('en')
from spacy.lang.en import English
import nltk
# Fetch the WordNet corpus that the lemmatizer imports below rely on.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
import pickle
import gensim


# spaCy English language object; tokenize() calls it on raw text to obtain tokens.
parser = English()
def tokenize(text):
    """Split ``text`` into a list of normalized token strings.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is emitted in lower case.
    """
    lda_tokens = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            # Pure whitespace carries no topical signal.
            continue
        if tok.like_url:
            normalized = 'URL'
        elif surface.startswith('@'):
            normalized = 'SCREEN_NAME'
        else:
            normalized = tok.lower_
        lda_tokens.append(normalized)
    return lda_tokens

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
# Quick check of tokenize(): tokens come back lower-cased.
tokenize('Holy guacamole')

['holy', 'guacamole']

In [0]:
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def get_lemma2(word, pos='n'):
    """Lemmatize ``word`` with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    word : str
        The word to lemmatize.
    pos : str, optional
        WordNet part-of-speech tag handed to the lemmatizer. The default
        ``'n'`` (noun) matches calling ``lemmatize(word)`` with no tag, which
        is why verb forms such as ``'ran'`` come back unchanged; pass ``'v'``
        to lemmatize verbs.

    Returns
    -------
    str
        The lemma returned by the lemmatizer.
    """
    return WordNetLemmatizer().lemmatize(word, pos)

In [6]:
# Both lemmatizers on the same plural noun, one result per line.
print(get_lemma('notebooks'), get_lemma2('notebooks'), sep='\n')

notebook
notebook


In [7]:
# Side-by-side comparison: original word, get_lemma result, get_lemma2 result.
for word in ('dogs', 'ran', 'discouraged'):
    morphy_lemma = get_lemma(word)
    wordnet_lemma = get_lemma2(word)
    print(word, morphy_lemma, wordnet_lemma)

dogs dog dog
ran run ran
discouraged discourage discouraged


In [8]:
# Download NLTK's stop-word corpus, then collect the English list into a set
# so later membership tests are cheap.
nltk.download('stopwords')
en_stop = set()
en_stop.update(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Peek at five stop words; en_stop is a set, so which five appear is not a stable order.
list(en_stop)[:5]

['against', 'an', 'off', "weren't", "isn't"]

In [0]:
def prepare_text_for_lda(text, min_token_len=5):
    """Turn raw text into the cleaned token list used to build the LDA corpus.

    The text is tokenized with ``tokenize``; tokens shorter than
    ``min_token_len`` characters or present in ``en_stop`` are discarded, and
    each surviving token is lemmatized with ``get_lemma``.

    Parameters
    ----------
    text : str
        Raw document text (e.g. a headline or one line of a file).
    min_token_len : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 5 reproduces the previous hard-coded ``len(token) > 4``.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # Filter on the raw token first and lemmatize last, so the stop-word and
    # length checks see the same strings they always did.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_len and token not in en_stop
    ]

In [11]:
# Example: 'is' and 'a' are dropped for being four characters or fewer.
prepare_text_for_lda('Harry Potter is a silly little wizard')

['harry', 'potter', 'silly', 'little', 'wizard']

In [0]:
# Cleaned token lists for the sample headlines, one list per headline.
text_data = list(map(prepare_text_for_lda, dataset))

In [14]:
import random

# Fixed seed so the same lines are sampled (and the same corpus is built) on every run.
SAMPLE_SEED = 42
# A line is kept only when the random draw exceeds this value, i.e. roughly 1% of lines.
SAMPLE_THRESHOLD = .99

sampler = random.Random(SAMPLE_SEED)
text_data2 = []
with open('dataset.csv') as f:
    for line in f:
        # Draw first and tokenize only the lines that are kept, so the
        # discarded majority never goes through the tokenizer/lemmatizer.
        if sampler.random() > SAMPLE_THRESHOLD:
            text_data2.append(prepare_text_for_lda(line))
text_data2[:5]

[['probability', 'distribution', 'blackout', 'complex', 'power', 'network'],
 ['base',
  'design',
  'scale',
  'offset',
  'base',
  'decoding',
  'algorithm',
  'rayleigh',
  'fading',
  'channel'],
 ['temporal', 'management'],
 ['unify', 'logging', 'infrastructure', 'analytics', 'twitter'],
 ['optimality', 'scalability', 'lattice', 'histogram', 'construction']]

In [0]:
# Build a gensim Dictionary from the sampled token lists; it is used below
# (doc2bow, id2word) to translate between words and integer ids.
dictionary = corpora.Dictionary(text_data2)


In [17]:
# creates a list of lists of tuples, with index for each word in bag of words
corpus = [dictionary.doc2bow(text) for text in text_data2]
corpus[:2]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]

In [18]:
# Persist the bag-of-words corpus (pickle) and the dictionary (gensim format) to save progress.
# The context manager closes the file handle as soon as the dump finishes.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [19]:
# Train an LDA topic model: `corpus` is the bag-of-words list and `dictionary` maps the ids back to words.

NUM_TOPICS = 5 # arbitrary
# random_state fixes the model's random initialisation so the topics are reproducible between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15, random_state=42)
ldamodel.save('model5.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
# Show the 4 highest-weighted words of each topic to get a feel for what the topics are about.

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.053*"base" + 0.028*"using" + 0.028*"scale" + 0.027*"network"')
(1, '0.021*"sensor" + 0.021*"complex" + 0.021*"documentary" + 0.021*"telling"')
(2, '0.027*"construction" + 0.015*"flexible" + 0.015*"sheaf" + 0.015*"paper"')
(3, '0.034*"database" + 0.034*"approach" + 0.034*"group" + 0.018*"system"')
(4, '0.044*"network" + 0.030*"power" + 0.016*"efficient" + 0.016*"algorithm"')


In [21]:
# The last sample headline, used next as an unseen document.
dataset[-1]

'Bitcoin is a protocol. Bitcoin is a brand.'

In [22]:
# Infer the topic mixture of a headline that was not part of the training corpus.
new_doc = dataset[-1]
new_doc = prepare_text_for_lda(new_doc)  # new_doc is now a list of tokens, no longer a string
new_doc_bow = dictionary.doc2bow(new_doc)
# The recorded run printed an empty bag-of-words and an even 0.2 weight for each of the five topics.
# NOTE(review): presumably none of the headline's tokens occur in `dictionary` (built from dataset.csv) — confirm.
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]


In [23]:
# Retrain with 3 topics for comparison; this rebinds `ldamodel` to the new model.
# random_state fixes the model's random initialisation so the topics are reproducible between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15, random_state=42)
ldamodel.save('model3.gensim')
# Top 4 words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"network" + 0.016*"using" + 0.016*"sensor" + 0.016*"approach"')
(1, '0.040*"base" + 0.018*"algorithm" + 0.018*"scale" + 0.018*"power"')
(2, '0.018*"delta" + 0.018*"constant" + 0.018*"inductor" + 0.018*"propose"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [24]:
# Retrain with 10 topics for comparison; this rebinds `ldamodel` to the new model.
# random_state fixes the model's random initialisation so the topics are reproducible between runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15, random_state=42)
ldamodel.save('model10.gensim')
# Top 4 words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.036*"construction" + 0.036*"scale" + 0.036*"base" + 0.036*"scalability"')
(1, '0.067*"network" + 0.035*"complex" + 0.035*"using" + 0.035*"efficient"')
(2, '0.051*"datapath" + 0.051*"error" + 0.051*"overclocking" + 0.051*"tradeoff"')
(3, '0.056*"sensor" + 0.029*"wireless" + 0.029*"story" + 0.029*"telling"')
(4, '0.023*"flexible" + 0.023*"parallel" + 0.023*"tangible" + 0.023*"sheet"')
(5, '0.042*"base" + 0.042*"network" + 0.022*"design" + 0.022*"decoding"')
(6, '0.025*"using" + 0.025*"directional" + 0.025*"coding" + 0.025*"broadcast"')
(7, '0.054*"group" + 0.054*"efficiency" + 0.054*"semantics" + 0.054*"recommendation"')
(8, '0.025*"efficient" + 0.025*"algorithm" + 0.025*"approach" + 0.025*"topology"')
(9, '0.039*"speech" + 0.039*"inner" + 0.039*"cepstral" + 0.039*"noise"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [25]:
# Reload the saved dictionary, corpus and 5-topic model from disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The context manager closes the file handle once the corpus is loaded.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [26]:
# Interactive visualisation of the model held in `lda`, together with its corpus and dictionary.
# NOTE(review): `pyLDAvis.gensim` is the module path in the pyLDAvis 2.1.2 shown in the install log;
# newer releases may expose it under a different name — confirm before upgrading.
import pyLDAvis.gensim
# NOTE(review): sort_topics=False presumably keeps the model's own topic numbering — confirm.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
