In [1]:
import wikipedia
import sys
import datetime
import os
from multiprocessing import Process, Queue

# Use the German-language Wikipedia for all subsequent queries.
wikipedia.set_lang('de')
# Enable the library's rate limiting with a min_wait of 50 ms
# (timedelta(0, 0, 50000) == 50000 microseconds).
wikipedia.set_rate_limiting(
    True,
    min_wait=datetime.timedelta(milliseconds=50),
)

In [2]:
# Recursively loads all linked articles until the maximum depth is reached
def load_recursive(page, queue, depth=2, with_summary=False, scanned=None):
    """Fetch `page`, recurse into its links and push its text onto `queue`.

    Args:
        page: Article title handed to ``wikipedia.page``.
        queue: Object with a ``put(item, block=..., timeout=...)`` method. It
            receives dicts with the keys ``"depth"``, ``"page"`` and
            ``"content"``.
        depth: Remaining recursion depth; nothing is loaded once it is <= 0.
        with_summary: When true, a second item carrying the page summary in
            ``"content"`` is queued after the full text.
        scanned: Set of titles already visited. It is mutated in place; a
            fresh set is created when omitted.

    Returns:
        The set of visited titles (the same object as `scanned` if one was
        passed in).
    """
    # A literal `set()` default would be created once at definition time and
    # shared by every call that omits `scanned`, leaking state between crawls.
    if scanned is None:
        scanned = set()

    # A known page is only skipped on the last level (depth == 1); at greater
    # depth it is loaded again so that its links can still be followed.
    if depth <= 0 or (page in scanned and depth == 1):
        return scanned
    scanned.add(page)

    try:
        loaded_page = wikipedia.page(page)
        # Pages in this category are skipped entirely (presumably calendar-day
        # articles -- TODO confirm the intent).
        if 'Kategorie:Tag' in loaded_page.categories:
            print("skipped %s"%(page))
            return scanned

        # Linked pages are processed first, so they are queued before `page`.
        for sub_page in loaded_page.links:
            scanned = load_recursive(sub_page, queue, depth - 1, with_summary, scanned)

        content = loaded_page.content
        if content:
            queue.put({"depth":depth, "page":page, "content":content}, block=True, timeout=None)
        if with_summary:
            queue.put({"depth":depth, "page":page, "content":loaded_page.summary}, block=True, timeout=None)

    except wikipedia.DisambiguationError:
        # Ambiguous title: fall back to a search and crawl each hit one level
        # lower. With no depth left every hit would be rejected immediately,
        # so the search request is skipped in that case.
        if depth > 1:
            scanned = load_recursive_by_search(page, queue, depth - 1, with_summary, scanned)
    except wikipedia.PageError:
        # The page could not be loaded; it stays marked as scanned.
        return scanned

    return scanned

In [3]:
# Searches for the given term and runs the recursive load on each of the results
def load_recursive_by_search(search, queue, depth=1, with_summary=False, scanned=None):
    """Run ``wikipedia.search`` for `search` and crawl every hit.

    Args:
        search: Search term handed to ``wikipedia.search``.
        queue: Passed through unchanged to ``load_recursive``.
        depth: Recursion depth used for each search hit.
        with_summary: Passed through unchanged to ``load_recursive``.
        scanned: Set of titles already visited; a fresh set is created when
            omitted.

    Returns:
        The set of visited titles after all hits have been processed.
    """
    # Avoid a shared mutable default: a `set()` in the signature would be
    # reused by every call that does not pass `scanned`.
    if scanned is None:
        scanned = set()

    for result in wikipedia.search(search):
        scanned = load_recursive(result, queue, depth, with_summary, scanned)
    return scanned

In [10]:
def search_process(search_list, queue, depth, with_summary):
    """Crawl every term in `search_list`, then signal completion on `queue`.

    The names of the entries in ``collected_articles/`` seed the set of
    already-scanned titles. After the last term, or if anything raises,
    ``None`` is put on `queue` as the end-of-stream marker.

    Args:
        search_list: Iterable of search terms.
        queue: Queue that receives the loaded documents and the final ``None``.
        depth: Recursion depth passed to ``load_recursive_by_search``.
        with_summary: Passed through to ``load_recursive_by_search``.
    """
    try:
        scanned = set(os.listdir('collected_articles/'))
        for search in search_list:
            print("starting with %s"%(search))
            scanned = load_recursive_by_search(search, queue, depth, with_summary, scanned)
            print("%i sites have been scanned"%(len(scanned)))
    finally:
        # Always send the sentinel: a consumer iterating with
        # iter(queue.get, None) would otherwise block forever when the crawl
        # dies with an exception. The exception itself still propagates.
        queue.put(None, block=True, timeout=None)

In [11]:
def start_async_scraping(search_list, max_queue_size = 100, depth = 3, with_summary=False):
    """Start `search_process` in a separate process and return it with its queue.

    Args:
        search_list: Search terms handed to the worker.
        max_queue_size: ``maxsize`` of the queue created for the worker.
        depth: Recursion depth handed to the worker.
        with_summary: Handed to the worker unchanged.

    Returns:
        Tuple ``(process, queue)``: the already started ``Process`` and the
        ``Queue`` it was given.
    """
    results = Queue(maxsize=max_queue_size)
    worker_args = (search_list, results, depth, with_summary)
    worker = Process(target=search_process, args=worker_args)
    worker.start()
    return worker, results

In [14]:
# First version of the search-term list. It is not referenced by the other
# cells visible in this notebook.
original_categories = [
    'freizeit', 'unterhaltung', 'shopping', 'einkaufen',
    'essen', 'lebensmittel', 'bar', 'restaurant',
    'gesundheit', 'drogerie', 'abonnement', 'spende',
    'berufsausgaben', 'einzelhändler', 'bildung', 'familie',
    'freunde', 'gehalt', 'haushalt', 'nebenkosten',
    'medien', 'elektronik', 'reisen', 'urlaub',
    'sparen', 'investieren', 'steuern', 'abgaben',
    'transport', 'auto', 'versicherungen', 'finanzen',
]
# Search terms used for the crawl, in the order in which they are processed.
categories = [
    'bildung', 'abonnement', 'elektronik', 'reisen', 'urlaub',
    'versicherungen', 'einzelhändler',
    'Ernährung des Menschen', 'Nahrung', 'Gericht (Speise)',
    'lebensmittel', 'bar', 'restaurant', 'gesundheit', 'drogerie',
    'spende', 'berufsausgaben', 'familie', 'freunde', 'gehalt',
    'haushalt', 'nebenkosten', 'medien', 'sparen', 'investieren',
    'steuern', 'abgaben', 'transport', 'auto', 'finanzen',
]

In [None]:
# The worker lists this directory on start-up and the loop below writes into
# it, so make sure it exists before the process is launched.
os.makedirs('collected_articles/', exist_ok=True)
process, queue = start_async_scraping(categories)
# queue.get blocks until the next item arrives; the None sentinel put by the
# worker ends the iteration.
for document in iter(queue.get, None):
    print("depth %i loaded page %s with length %i"%(document['depth'], document['page'], len(document['content'])))
    # "/" is removed from the title so it is not interpreted as a sub-directory.
    # The context manager closes the file after every article, and the explicit
    # encoding keeps the output independent of the platform default.
    with open('collected_articles/'+document['page'].replace("/",""), "w", encoding="utf-8") as f:
        f.write(document['content'])
# The sentinel has been received, so the worker is finishing; reap it.
process.join()