In [1]:
import glob
import re

INPUT_FOLDER = '../../../travelogues-corpus/17th_century/books'

def read_file(f):
  with open(f, 'r') as file:
    str = file.read()
    ascii_only = re.sub('[^A-Za-z0-9 ]+', '', str)
    return re.sub('\\s+', ' ', ascii_only)

# List names of all .txt files in the folder
filenames = [f for f in glob.glob(INPUT_FOLDER + '**/*.txt')]
texts = [ read_file(f) for f in filenames ]

f'Loaded {len(texts)} texts'

'Loaded 204 texts'

In [3]:
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import itertools
import matplotlib.pyplot as plt
import string

stop_words = stopwords.words('german')
stop_words.extend(['allein', 'auff', 'fich', 'feinen', 'gleich', 'vnd', 'vnder', 'mehr', 'ganz', 'kamen', 'kommen', 'wann', 'vber', 'vnnd', 'fr', 'vi', 'vn', 'vns', 'daher', 'danach', 'darnach', 'denen', 'deren', 'statt', 'war', 'wegen', 'wider', 'worden', 'gleich', 'wenig'])

# We'll do stemming to avoid duplicate variants
porter = PorterStemmer()

def stem_message(text):
  words = word_tokenize(text)

  stemmed_words = []
  for word in words:
    stemmed_words.append(porter.stem(word))
    
  # Remove links and stopwords here
  clean_tokens = [w for w in stemmed_words if not w in stop_words and len(w) > 3] 
  return ' '.join(clean_tokens)

cleaned_texts = []
for text in texts:
  # Remove punctutation
  text = text.translate(str.maketrans('', '', string.punctuation))
  cleaned_texts.append(stem_message(text))

[nltk_data] Downloading package stopwords to /home/simonr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(cleaned_texts).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

number_of_topics = 8
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
model.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=8, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [6]:
import pandas as pd

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

no_top_words = 20
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights
0,stadt,6932.6,diee,7935.0,diee,4077.7,solch,6495.3,dito,110.2,berg,759.9,dief,1802.6,excel,40.0
1,solch,6322.5,dieer,4182.9,olch,2879.9,berg,4815.4,bekamen,46.2,stadt,741.4,fein,1686.1,diee,19.6
2,wollt,3937.9,stadt,3109.6,dieer,2810.9,fein,3835.2,insel,43.1,kirchen,538.6,diefer,995.9,dieer,17.4
3,wurd,3736.3,uner,2870.5,waer,2106.6,stadt,3360.2,windt,38.2,fein,524.3,folch,979.1,stadt,14.6
4,wrde,3655.3,olch,2285.2,berg,1936.8,waffer,3278.9,admir,37.8,oben,409.2,stadt,905.4,solch,10.4
5,htte,3649.3,dieem,2161.2,dieem,1853.5,christen,3194.7,nachen,36.9,kirch,370.4,fehr,891.8,fein,10.0
6,fein,3374.4,dieen,2047.6,geween,1632.0,herrn,3174.1,abendt,32.7,zeiten,303.2,ware,805.6,olch,9.5
7,tage,3261.4,tage,1745.4,daelbt,1400.3,heiligen,2936.9,tage,24.6,item,300.8,wurd,752.0,dieem,9.3
8,davon,3227.4,waer,1496.5,dieen,1355.4,herr,2881.9,bantam,24.5,anno,297.9,alfo,696.9,berg,8.6
9,find,3197.9,japan,1492.6,trcken,1292.9,kirchen,2879.7,diee,24.0,hand,293.3,diefem,651.0,inel,8.5
