In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import requests
# from tqdm import tqdm
import os
from datetime import datetime

# Downloading enwikivoyage-latest-pages-articles.xml.bz2

In [19]:
import requests
import os
from tqdm import tqdm  # for progress bar

def download_wikivoyage_dump(dest_dir):
    # URL of the dump file
    url = "https://dumps.wikimedia.org/enwikivoyage/latest/enwikivoyage-latest-pages-articles.xml.bz2"
    
    # Send a GET request with stream=True to download in chunks
    response = requests.get(url, stream=True)
    
    # Get total file size from headers
    total_size = int(response.headers.get('content-length', 0))
    
    # Open local file to write the content
    filename = "enwikivoyage-latest-pages-articles.xml.bz2"
    
    # Show download progress
    with open(f"{dest_dir}/{filename}", 'wb') as file, tqdm(
        desc=filename,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as progress_bar:
        for data in response.iter_content(chunk_size=1024):
            size = file.write(data)
            progress_bar.update(size)

    return filename

# Usage
if __name__ == "__main__":
    download_wikivoyage_dump(dest_dir="data/raw")  # or

enwikivoyage-latest-pages-articles.xml.bz2: 100%|██████████████| 117M/117M [08:40<00:00, 235kiB/s]


# Parsing wikivoygage dump

In [118]:
import bz2
import json
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm

            
class WikivoyageParser():
    def __init__(self, src_path):
        self.src_path = src_path
        self.namespace = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}   
        

    def _ensure_folder_exists(self, src_path):
        try:
            # Check if folder exists
            if not os.path.exists(src_path):
                # Create folder
                os.makedirs(src_path)
                print(f"Created folder: {src_path}")
            else:
                print(f"Folder already exists: {src_path}")
        except Exception as e:
            print(f"Error creating folder: {e}")
            return None

    
    def bz2_uncompress(self, src_path):
        with bz2.open(src_path, 'rt', encoding='utf-8') as file:
            # Parse XML content
            tree = ET.fromstring(file.read())
            return tree

    
    def parse_wikivoyage_articles(self, tree, dest_path, region, limit=100, show_articles_preview=True):
        self._ensure_folder_exists(dest_path)
        # Define namespace
        namespace = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
        # Iterate through all pages
        counter = 0
        for page in tqdm(tree.findall('.//mw:page', namespace)[:limit]):
            title = page.find('mw:title', namespace).text
            text = page.find('.//mw:text', namespace).text
            if text and region.lower() in text.lower():
                if show_articles_preview:
                    print(f"Title: {title}")
                    print("#" * 50)
                article = {
                    'title': title,
                    'content': text
                }
                with open(f"{dest_path}/article_{counter}.json", "w") as a:
                    json.dump(article, a)
                counter += 1
        articles = os.listdir(dest_path)
        print(f"Total articles found for {region}: {len(articles)}")

In [119]:
# Uncompressing wikivoyage data
src_path = "data/raw/enwikivoyage-latest-pages-articles.xml.bz2"
parser = WikivoyageParser(src_path)
# wikivoyage = parser.bz2_uncompress(src_path)

In [122]:
# Testing the parser
dest_path = "data/wikivoyage"
parser.parse_wikivoyage_articles(wikivoyage, dest_path, region="South America", limit=1000, show_articles_preview=True)

Created folder: data/wikivoyage


100%|██████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 17074.24it/s]

Title: Acapulco
##################################################
Title: Adelaide
##################################################
Title: Adrogué
##################################################
Title: Africa
##################################################
Title: Aggressive dogs
##################################################
Title: Agritourism
##################################################
Title: Alcamo
##################################################
Title: Downhill snowsports
##################################################
Title: Alps
##################################################
Title: Altiplano (Peru)
##################################################
Title: Altitude sickness
##################################################
Title: Amarante
##################################################
Title: Amazon (Ecuador)
##################################################
Title: Amazonas (Venezuela)
##################################################
Title: Amazon




In [123]:
# Downloading all wikivoyage articles
!rm -rf data/wikivoyage/
parser.parse_wikivoyage_articles(wikivoyage, dest_path, region="South America", limit=None, show_articles_preview=False)

Created folder: data/wikivoyage


100%|████████████████████████████████████████████████████████| 76177/76177 [00:02<00:00, 27926.07it/s]

Total articles found for South America: 891





# Checking wikivoyage data

In [124]:
import os

wd = os.listdir(dest_path)
print(len(wd))
wd[:2]

891


['article_597.json', 'article_374.json']

In [125]:
import json

# Looking at the first 50 articles
for i in wd[:10]:
    a = open(f"data/wikivoyage/{i}", "r").read()
    a = json.loads(a)
    print(f"Title: {a['title']}")
    print(f"Article preview: {a['content'][:200]}")
    print("-----------------------------------------------")

Title: Gambling
Article preview: {{pagebanner|Gambling banner Sahara Hotel.jpg|caption=Roulette}}

'''Gambling''' with high (or even moderate) stakes is outlawed in many parts of the world. Some places that allow such gambling draw t
-----------------------------------------------
Title: Sandwich
Article preview: {{pagebanner|Disambiguation banner.png}}
__NOTOC__
There is more than one place called '''Sandwich''':

===[[United Kingdom]]===

* [[Sandwich (England)]] - A town in [[Kent]], [[England]].

===[[Unit
-----------------------------------------------
Title: Wikivoyage:Wikivoyagers by location
Article preview: {{pagebanner|pgname=Wikivoyagers by location|Generic banner country.jpg}}
Wikivoyage is a truly global community; the map below should show that! Please feel free to add your user name to the lists be
-----------------------------------------------
Title: São Paulo/West Side
Article preview: {{pagebanner|Marginal Pinheiros SAO 07 2009 6074 (cropped).jpg}}
[[Image:S%C3%A3o_P