In [5]:
import glob
import re

INPUT_FOLDER = '../../../travelogues-corpus/17th_century/books'

def read_file(f):
  with open(f, 'r') as file:
    str = file.read()
    ascii_only = re.sub('[^A-Za-z0-9 ]+', '', str)
    return re.sub('\\s+', ' ', ascii_only)

# List names of all .txt files in the folder
filenames = [f for f in glob.glob(INPUT_FOLDER + '**/*.txt')]
texts = [ read_file(f) for f in filenames ]

f'Loaded {len(texts)} texts'

'Loaded 204 texts'

In [6]:
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

stop_words = stopwords.words('german')
stop_words.extend(['allein', 'auff', 'fich', 'feinen', 'gleich', 'vnd', 'vnder', 'mehr', 'ganz', 'kamen', 'kommen', 'wann', 'vber', 'vnnd', 'fr', 'vi', 'vn', 'vns', 'daher', 'danach', 'darnach', 'denen', 'deren', 'statt', 'war', 'wegen', 'wider', 'worden', 'gleich', 'wenig'])

def clean(text):
  preprocessed = [ ''.join(simple_preprocess(str(token))) for token in text.split() ]
  return [ token for token in preprocessed if token not in stop_words and len(token) > 3 ]

cleaned = [ clean(text) for text in texts ]

[nltk_data] Downloading package stopwords to /home/simonr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

dictionary = Dictionary(cleaned)
corpus = [dictionary.doc2bow(text) for text in cleaned]

# Train the model on the corpus.
# lda = LdaModel(common_corpus, num_topics=20)
lda = LdaModel(corpus, num_topics=5, id2word=dictionary)
# lda.save('lda_model.16C.gensim')

In [8]:
lda.top_topics(corpus)

[([(0.0019230013, 'knig'),
   (0.001787934, 'stadt'),
   (0.0017269207, 'zeit'),
   (0.0015985918, 'etliche'),
   (0.0013244621, 'groen'),
   (0.0012451018, 'land'),
   (0.0012030047, 'groe'),
   (0.0011408206, 'schiff'),
   (0.0010884621, 'weit'),
   (0.0010447559, 'jahr'),
   (0.0009693332, 'knnen'),
   (0.0009433851, 'meer'),
   (0.0008643173, 'eben'),
   (0.0008494, 'berg'),
   (0.0008400375, 'mann'),
   (0.00082407426, 'ganze'),
   (0.0008144192, 'wrde'),
   (0.0007794815, 'hernach'),
   (0.00077237905, 'bald'),
   (0.0007703979, 'nacht')],
  -0.059694653977874886),
 ([(0.0024045776, 'etliche'),
   (0.0019293507, 'stadt'),
   (0.0018215909, 'knig'),
   (0.00135332, 'groe'),
   (0.0013246046, 'zeit'),
   (0.0012826705, 'land'),
   (0.0012655357, 'groen'),
   (0.001154905, 'nachdem'),
   (0.0010148024, 'dieer'),
   (0.0010028678, 'schiff'),
   (0.0009986707, 'diee'),
   (0.0009794005, 'jahr'),
   (0.0009687943, 'davon'),
   (0.00092161336, 'meer'),
   (0.00091000245, 'knnen'),
   (0