### Clustering - LSA and LDA
Algorithms to infer topic distribution across a set of documents.

In [35]:
from gensim import corpora, models, similarities
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import json

In [36]:
corpus = CategorizedPlaintextCorpusReader("corpus/lyrics/tokenized/", r".*\.txt", cat_pattern=r"(\w+)/*", encoding="utf8")

In [37]:
tracks_all = []
for x in range(0, len(corpus.fileids())):
    tracks_all.append(corpus.raw(corpus.fileids()[x]))

In [38]:
texts = [text.split(' ') for text in tracks_all] 

In [39]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [40]:
corpus_doc2bow = [dictionary.doc2bow(text) for text in texts]

### Latent semantic analysis
Analyzes relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms. LSA assumes that words that are close in meaning will occur in similar pieces of text.
https://en.wikipedia.org/wiki/Latent_semantic_analysis

In [41]:
%time lsi = models.lsimodel.LsiModel(corpus=corpus_doc2bow, id2word=dictionary, num_topics=50)

CPU times: user 82.3 ms, sys: 7.33 ms, total: 89.7 ms
Wall time: 102 ms


In [42]:
lsi.print_topics(5)

[(0,
  u'0.348*"got" + 0.311*"get" + 0.267*"line" + 0.241*"rock" + 0.239*"well" + 0.230*"island" + 0.205*"time" + 0.155*"good" + 0.153*"road" + 0.152*"said"'),
 (1,
  u'0.349*"cry" + 0.269*"tell" + -0.267*"island" + -0.260*"rock" + -0.246*"line" + 0.209*"time" + -0.168*"ride" + -0.166*"road" + -0.155*"got" + 0.154*"town"'),
 (2,
  u'0.644*"high" + 0.399*"rising" + 0.396*"feet" + 0.309*"water" + 0.144*"two" + 0.123*"five" + 0.123*"mama" + 0.122*"papa" + -0.117*"get" + -0.098*"cry"'),
 (3,
  u'-0.600*"get" + -0.276*"blues" + 0.276*"cry" + -0.272*"rhythm" + 0.268*"tell" + 0.135*"guns" + -0.126*"time" + 0.121*"line" + 0.116*"island" + 0.110*"take"'),
 (4,
  u'0.571*"run" + 0.402*"softly" + 0.391*"river" + 0.383*"blue" + 0.169*"cool" + 0.169*"deep" + 0.167*"darlin" + 0.167*"asleep" + -0.150*"cry" + -0.091*"tell"')]

In [43]:
topics_matrix = lsi.show_topics(formatted=False, num_words=10)
topics_matrix

[(0,
  [(u'got', 0.34752773991406433),
   (u'get', 0.31097541704509069),
   (u'line', 0.26711794424678126),
   (u'rock', 0.24071101094141703),
   (u'well', 0.23948183236937387),
   (u'island', 0.22982140855641484),
   (u'time', 0.20486679021930854),
   (u'good', 0.15502785269628513),
   (u'road', 0.15288514696012148),
   (u'said', 0.15181569094664693)]),
 (1,
  [(u'cry', 0.34864357636293997),
   (u'tell', 0.26878872861872843),
   (u'island', -0.26707967681043315),
   (u'rock', -0.26045609497267469),
   (u'line', -0.2461362626302907),
   (u'time', 0.20915395732049635),
   (u'ride', -0.16835451154099848),
   (u'road', -0.16642676334023965),
   (u'got', -0.15459607045045556),
   (u'town', 0.15354216559579018)]),
 (2,
  [(u'high', 0.64352240169291275),
   (u'rising', 0.3988995255810856),
   (u'feet', 0.39648956834024901),
   (u'water', 0.30921542090188286),
   (u'two', 0.14418482982574898),
   (u'five', 0.12274965792544419),
   (u'mama', 0.12256504482692511),
   (u'papa', 0.122083093129313

In [44]:
jsonified = json.dumps(topics_matrix)
with open('./corpus/lyrics_lsa.json', 'w') as handle:
    handle.write(jsonified)

### Latent dirichlet allocation
Allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's creation is attributable to one of the document's topics.
https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [45]:
%time lda = models.LdaModel(corpus_doc2bow, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 20.7 s, sys: 144 ms, total: 20.8 s
Wall time: 22.2 s


In [46]:
print(lda[corpus_doc2bow[1]])

[(4, 0.99051658090152517)]


In [47]:
topics = lda.print_topics(50, num_words=10)

In [48]:
topics

[(0,
  u'0.018*played + 0.014*got + 0.014*country + 0.013*boy + 0.013*boogie + 0.013*luther + 0.011*good + 0.010*old + 0.010*man + 0.010*woogie'),
 (1,
  u'0.012*guns + 0.012*home + 0.011*town + 0.010*still + 0.010*someone + 0.010*miss + 0.008*long + 0.008*take + 0.008*heart + 0.008*love'),
 (2,
  u'0.018*got + 0.017*line + 0.014*drink + 0.014*well + 0.013*know + 0.013*rock + 0.013*island + 0.012*heart + 0.011*love + 0.009*mine'),
 (3,
  u'0.021*high + 0.017*run + 0.014*guess + 0.013*feet + 0.013*rising + 0.013*dream + 0.013*softly + 0.013*blue + 0.011*well + 0.011*river'),
 (4,
  u'0.025*time + 0.022*get + 0.018*tell + 0.016*cry + 0.011*love + 0.010*got + 0.009*come + 0.009*man + 0.009*heart + 0.009*going')]

In [49]:
# This is a format you can parse more easily to use elsewhere.
topics_matrix = lda.show_topics(formatted=False, num_words=10)
topics_matrix

[(0,
  [(u'played', 0.017603008167115092),
   (u'got', 0.014344431044358544),
   (u'country', 0.014341141682969214),
   (u'boy', 0.013257531127486623),
   (u'boogie', 0.013254691503679052),
   (u'luther', 0.013254691499524849),
   (u'good', 0.011083007012542431),
   (u'old', 0.0099983980882499251),
   (u'man', 0.009997680820432512),
   (u'woogie', 0.0099953409263772268)]),
 (1,
  [(u'guns', 0.012019847623269334),
   (u'home', 0.011852985412296268),
   (u'town', 0.011254011796242348),
   (u'still', 0.0098001096441339419),
   (u'someone', 0.0097951681971523152),
   (u'miss', 0.0097939837549156378),
   (u'long', 0.008310830180967085),
   (u'take', 0.0083049893562906658),
   (u'heart', 0.0080824445470123081),
   (u'love', 0.0080141150900709807)]),
 (2,
  [(u'got', 0.018000692198410079),
   (u'line', 0.017107655423963912),
   (u'drink', 0.014432685403469318),
   (u'well', 0.013543657615898139),
   (u'know', 0.012653344951246256),
   (u'rock', 0.012652365330307778),
   (u'island', 0.01265087

In [50]:
jsonified = json.dumps(topics_matrix)
with open('./corpus/lyrics_lda.json', 'w') as handle:
    handle.write(jsonified)

In [51]:
from nltk.data import BufferedGzipFile