# Art DATIS: Data Analysis

In [22]:
# 1. load OCRed texts
import glob
txts_path = '../artdatis/tagging/OCRed/typed/'
# Collect the content of every '*_text.txt' file into a corpus of documents.
# sorted() keeps the document order reproducible between runs and machines,
# because glob.glob() returns matches in arbitrary order.
text_corpus = []
for file_path in sorted(glob.glob(txts_path+'*_text.txt')):
    # Decode explicitly instead of relying on the platform default encoding,
    # which differs between systems and can garble accented characters.
    # NOTE(review): assumes the OCR output files are UTF-8 -- confirm.
    with open(file_path, encoding='utf-8') as file:
        text_corpus.append(file.read())
print("Loaded %d documents"%len(text_corpus))

Loaded 8308 documents


In [53]:
# 2. pre-processing: remove stopwords, split into words
import urllib.request
from pprint import pprint

def load_word_list(lang='en'):
    """Download the stopwords-iso stopword list for one language.

    The language code is substituted into the raw GitHub URL of the
    ``stopwords-iso/stopwords-<lang>`` repository. The URL and the first
    two stopwords are printed as a quick sanity check.

    Args:
        lang: language code used to build the download URL (e.g. 'en').

    Returns:
        A set with the whitespace-separated entries of the downloaded file.

    Any error raised by ``urllib.request.urlopen`` (network failure,
    missing list for the language, ...) propagates to the caller.
    """
    url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-%s/master/stopwords-%s.txt' % (lang, lang)
    print(url)
    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read, even if decoding fails.
    with urllib.request.urlopen(url) as response:
        stopwords = response.read().decode('UTF-8').split()
    print("Loaded %s stopwords, e.g. %s" % (lang, ", ".join(stopwords[:2])))
    return set(stopwords)


# Fetch one stopword set per language (downloaded in the order en, de, nl, fr)
en_stoplist, de_stoplist, nl_stoplist, fr_stoplist = (
    load_word_list(code) for code in ('en', 'de', 'nl', 'fr')
)

# Merge the per-language sets into a single multilingual stoplist
stoplist = set().union(en_stoplist, de_stoplist, nl_stoplist, fr_stoplist)

# Tokenize: lowercase every document, split it on whitespace and keep only
# the tokens that are not stopwords (one token list per document)
texts = [
    [token for token in raw_text.lower().split() if token not in stoplist]
    for raw_text in text_corpus
]

https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt
Loaded en stopwords, e.g. 'll, 'tis
https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/stopwords-de.txt
Loaded de stopwords, e.g. a, ab
https://raw.githubusercontent.com/stopwords-iso/stopwords-nl/master/stopwords-nl.txt
Loaded nl stopwords, e.g. aan, aangaande
https://raw.githubusercontent.com/stopwords-iso/stopwords-fr/master/stopwords-fr.txt
Loaded fr stopwords, e.g. a, abord


In [59]:
# Count word frequencies
from collections import Counter

# Flatten the per-document token lists into one list, keeping only the
# purely alphabetic tokens (drops numbers and tokens with punctuation)
word_list = list(filter(
    str.isalpha,
    (token for doc_tokens in texts for token in doc_tokens),
))

# Left as the final expression so the notebook displays the
# (word, count) pairs, most frequent first
Counter(word_list).most_common()

[('glass', 3414),
 ('glas', 3286),
 ('museum', 2222),
 ('heer', 2032),
 ('jaar', 2016),
 ('kunst', 1964),
 ('werk', 1845),
 ('nieuwe', 1762),
 ('grote', 1721),
 ('vormgeving', 1683),
 ('art', 1672),
 ('amsterdam', 1610),
 ('aantal', 1535),
 ('oe', 1283),
 ('uur', 1275),
 ('zeer', 1260),
 ('ter', 1139),
 ('crafts', 1132),
 ('goed', 1097),
 ('tijd', 1086),
 ('school', 1076),
 ('leerdam', 1067),
 ('mogelijk', 1004),
 ('beeldende', 995),
 ('di', 987),
 ('valkema', 977),
 ('vorm', 961),
 ('plaats', 952),
 ('enkele', 887),
 ('eee', 868),
 ('gaat', 860),
 ('waarin', 846),
 ('kunstenaars', 834),
 ('jaren', 822),
 ('council', 819),
 ('juni', 818),
 ('werken', 806),
 ('bestuur', 798),
 ('dagen', 798),
 ('april', 793),
 ('gaan', 792),
 ('rietveld', 789),
 ('september', 789),
 ('groot', 786),
 ('academie', 782),
 ('design', 773),
 ('ontwerpen', 768),
 ('oo', 762),
 ('staat', 756),
 ('komt', 745),
 ('leden', 733),
 ('lessen', 732),
 ('vereniging', 730),
 ('komen', 729),
 ('nederlandse', 729),
 ('co