In [69]:
import pandas as pd
import nltk
from nltk import word_tokenize
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

In [71]:
import ast
text_file = open("anarchism_sentences.txt", "r")
df = pd.DataFrame(ast.literal_eval(text_file.read()), columns=['sentences'])
df['idx'] = df.index
df

Unnamed: 0,sentences,idx
0,'''Anarchism''' is an and that r...,0
1,"These institutions are often described as , a...",1
2,Anarchism's central disagreement with other id...,2
3,"Anarchism is usually placed on the of the ,...",3
4,As anarchism does not offer a fixed body of do...,4
...,...,...
268,"It also states that ""some sort of 'court' syst...",268
269,=== Industrial civilisation === In his essay '...,269
270,=== Tacit authoritarianism === The anarchist t...,270
271,== List of anarchist societies == * (Fejuv...,271


In [72]:
df['tokens'] = df['sentences'].apply(word_tokenize)
df

Unnamed: 0,sentences,idx,tokens
0,'''Anarchism''' is an and that r...,0,"[``, 'Anarchism, '', ', is, an, and, that, rej..."
1,"These institutions are often described as , a...",1,"[These, institutions, are, often, described, a..."
2,Anarchism's central disagreement with other id...,2,"[Anarchism, 's, central, disagreement, with, o..."
3,"Anarchism is usually placed on the of the ,...",3,"[Anarchism, is, usually, placed, on, the, of, ..."
4,As anarchism does not offer a fixed body of do...,4,"[As, anarchism, does, not, offer, a, fixed, bo..."
...,...,...,...
268,"It also states that ""some sort of 'court' syst...",268,"[It, also, states, that, ``, some, sort, of, '..."
269,=== Industrial civilisation === In his essay '...,269,"[===, Industrial, civilisation, ===, In, his, ..."
270,=== Tacit authoritarianism === The anarchist t...,270,"[===, Tacit, authoritarianism, ===, The, anarc..."
271,== List of anarchist societies == * (Fejuv...,271,"[==, List, of, anarchist, societies, ==, *, (,..."


In [73]:
def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
  print('Building dictionary...')
  dictionary = Dictionary(docs)
  stopwords = nltk_stopwords().union(additional_stopwords)
  stopword_ids = map(dictionary.token2id.get, stopwords)
  dictionary.filter_tokens(stopword_ids)
  dictionary.compactify()
  dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  dictionary.compactify()

  print('Building corpus...')
  corpus = [dictionary.doc2bow(doc) for doc in docs]

  return dictionary, corpus

In [74]:
dictionary, corpus = prep_corpus(df['tokens'])

Building dictionary...
Building corpus...


In [75]:
MmCorpus.serialize('wiki_articles.mm', corpus)
dictionary.save('wiki_articles.dict')

In [78]:
%%time
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, passes=50)
                                      
lda.save('anarchism.model')

CPU times: user 5.37 s, sys: 50.7 ms, total: 5.42 s
Wall time: 5.39 s


In [79]:
lda.show_topics(formatted=False)

[(0,
  [('anarchism', 0.08977785),
   ('anarchist', 0.056576528),
   ('anarchists', 0.054472953),
   ('===', 0.05154044),
   ('==', 0.040628247),
   (')', 0.039753832),
   ('(', 0.037455518),
   ('In', 0.029284198),
   ('within', 0.027041629),
   ('thought', 0.020607496)]),
 (1,
  [('``', 0.14375609),
   ("''", 0.0865471),
   ('(', 0.045218855),
   (')', 0.042639136),
   ('based', 0.031342953),
   ('====', 0.029287579),
   ('state', 0.02899939),
   ('also', 0.02685136),
   ('society', 0.025332574),
   ('anarchism', 0.025321981)]),
 (2,
  [('The', 0.059738137),
   ('anarchist', 0.05948165),
   ("'s", 0.053630676),
   ('state', 0.041375745),
   ('social', 0.034264117),
   ('workers', 0.032887902),
   ('many', 0.032784242),
   ('movement', 0.028750008),
   ('anarchism', 0.027595628),
   ('violence', 0.026789036)])]