In [1]:
import glob
import re

INPUT_FOLDER = '../../../travelogues-corpus/17th_century/books'

def read_file(f):
  with open(f, 'r') as file:
    str = file.read()
    ascii_only = re.sub('[^A-Za-z0-9 ]+', '', str)
    return re.sub('\\s+', ' ', ascii_only)

filenames = [f for f in glob.glob(INPUT_FOLDER + '**/*.txt')]
texts = [ read_file(f) for f in filenames ]

f'Loaded {len(texts)} texts'

'Loaded 204 texts'

In [2]:
extra_stopwords = []

with open('stopwords.txt') as f:
  extra_stopwords = f.readlines()
  extra_stopwords = [ x.strip() for x in extra_stopwords ]

In [3]:
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = stopwords.words('german')
stop_words.extend(extra_stopwords)
stop_words.sort()

[nltk_data] Downloading package stopwords to /home/simonr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import itertools
import matplotlib.pyplot as plt
import string

# We'll do stemming to avoid duplicate variants
porter = PorterStemmer()

def stem_message(text):
  words = word_tokenize(text)

  stemmed_words = []
  for word in words:
    stemmed_words.append(porter.stem(word))
    
  # Remove stopwords here
  clean_tokens = [w for w in stemmed_words if not w in stop_words and len(w) > 3] 
  return ' '.join(clean_tokens)

cleaned_texts = []
for text in texts:
  # Remove punctutation
  text = text.translate(str.maketrans('', '', string.punctuation))
  cleaned_texts.append(stem_message(text))

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')
tf = vectorizer.fit_transform(cleaned_texts).toarray()

tf_feature_names = vectorizer.get_feature_names()

number_of_topics = 8
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
model.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=8, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [6]:
import pandas as pd

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

no_top_words = 20
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights
0,excel,43.2,stadt,5649.4,knigreich,630.5,stadt,3743.0,admir,1278.9,berg,5702.8,stadt,2711.0,stadt,1202.7
1,stadt,16.8,herrn,2018.3,glauben,468.7,laen,2564.3,hollnder,1161.0,trcken,3618.9,theil,2071.2,tage,860.1
2,trcken,12.3,tage,1727.3,christen,405.4,ondern,2478.0,indien,1074.2,kirchen,3217.9,dieselb,1778.9,menschen,775.9
3,stund,11.3,waffer,1680.1,herrn,361.1,tage,2101.5,volck,825.8,heiligen,3156.8,tage,1671.4,feind,699.1
4,tage,10.9,sehen,1642.6,patr,338.8,inel,1887.7,nachen,774.6,herrn,2734.1,fiel,1599.0,waffer,608.4
5,allhier,10.7,zwei,1524.1,kirchen,323.8,ehen,1801.2,wind,737.7,herr,2722.8,seyn,1574.5,welt,603.7
6,angelangt,10.2,fast,1517.8,namen,323.1,meilen,1759.0,theil,640.4,jerusalem,2486.4,feyn,1531.6,insel,579.5
7,theil,9.6,herzog,1477.3,christlichen,311.4,elbt,1717.5,mittag,583.0,christen,2424.5,thun,1525.6,sagt,559.6
8,berg,9.3,reis,1470.6,diser,290.6,berg,1640.2,insel,552.4,stadt,2285.2,indien,1420.3,sehen,482.3
9,heut,8.9,herr,1437.5,pater,280.3,wolt,1581.6,bekommen,538.3,heilig,2256.6,sachen,1408.9,sollt,474.5
