This script downloads a given Wikipedia dump (if it doesn't exist locally yet) and extracts the article text from its compressed XML data into a single plain-text file.

In [1]:
import os
import requests
import subprocess
from gensim.corpora import WikiCorpus
from IPython.utils import io

## config

In [2]:
# Remote location of the compressed dump and local target of the extracted text.
WIKI_DUMP_URL = "https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2"
OUTPUT_TEXT_PATH = "/veld/input/dewiki.txt"

# The dump is stored locally under the file name found in the URL's last path segment.
wiki_dump_id = WIKI_DUMP_URL.rsplit("/", 1)[-1]
wiki_dump_path = "/veld/input/" + wiki_dump_id

# Echo the derived values so they show up in the executed notebook.
print(
    f"wiki_dump_id: {wiki_dump_id}",
    f"wiki_dump_path: {wiki_dump_path}",
    sep="\n",
)

wiki_dump_id: dewiki-latest-pages-articles.xml.bz2
wiki_dump_path: /veld/input/dewiki-latest-pages-articles.xml.bz2


## download wikipedia dump if it doesn't exist locally

This can take a while, depending on the dump: the English dump is around 22 GB, the German one around 6.5 GB.

In [3]:
# Download the dump only when no local copy exists at wiki_dump_path.
if not os.path.exists(wiki_dump_path):
    print("wiki dump does not exist yet locally. Downloading.")

    # wget creates its "-O" target immediately, so a failed or interrupted
    # download would leave a truncated file behind that the existence check
    # above would mistake for a complete dump on the next run. Download into
    # a ".part" file instead and only move it into place once wget succeeded.
    wiki_dump_part_path = f"{wiki_dump_path}.part"
    try:
        # piping stdout to DEVNULL, so that this jupyter cell doesn't get cluttered;
        # check=True raises CalledProcessError on a non-zero wget exit status
        # instead of silently continuing.
        subprocess.run(
            ["wget", WIKI_DUMP_URL, "-O", wiki_dump_part_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            check=True,
        )
    except BaseException:
        # Also covers a kernel interrupt (KeyboardInterrupt): drop the partial
        # file, then let the error surface in the notebook.
        if os.path.exists(wiki_dump_part_path):
            os.remove(wiki_dump_part_path)
        raise

    # Rename within the same directory, so the final path only ever holds a
    # completely downloaded file.
    os.replace(wiki_dump_part_path, wiki_dump_path)

    print("Done with downloading. Size:")
    _ = subprocess.run(["du", "-sh", wiki_dump_path])
else:
    print("wiki dump does already exist locally. Not downloading.")

wiki dump does not exist yet locally. Downloading.
Done with downloading. Size:
6.5G	/veld/input/dewiki-latest-pages-articles.xml.bz2


## stream from wikipedia dump and append to txt file

This can also take a while.

In [5]:
# Open the dump as a streaming corpus.
# NOTE(review): dictionary=False is presumably passed so that WikiCorpus does
# not build a vocabulary from the whole dump up front - confirm against the
# installed gensim version.
wiki_corpus_streaming = WikiCorpus(wiki_dump_path, dictionary=False)

# A single open in "w" mode both truncates any previous output and lets us
# write article by article, so the corpus never has to be held in RAM.
# The encoding is set explicitly so that non-ASCII characters (e.g. German
# umlauts) are written the same way regardless of the environment's locale.
with open(OUTPUT_TEXT_PATH, "w", encoding="utf-8") as f:
    for text in wiki_corpus_streaming.get_texts():
        # Each item from get_texts() is joined with spaces and terminated with
        # a newline; without the separator the last token of one article would
        # be fused with the first token of the next.
        f.write(" ".join(text))
        f.write("\n")