# Data scraping

This notebook was used to scrap all the files of interest on the [SLF archive website](https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59). We first define some utility functions to extract all the directory hierarchy from the HTML pages. Then we filter which files we are interested into and dump their URL into a [file](./files_to_download). We finally download each file using a small multithreaded [python script](../src/download.py).

In [1]:
import requests
from bs4 import BeautifulSoup
import json
from typing import Dict, List
import pandas as pd

%matplotlib inline

In [2]:
base_url = 'https://www.slf.ch/'
archive_parent = 'fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'

In [3]:
def extract_folders(url: str) -> Dict[str, str]:
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    folders = [folder.find('a') for folder in soup.findAll(class_='folder')]
    return {str(folder.contents[2]).split(',')[0].strip(): folder['href'] for folder in folders}

def extract_folders_rec(url: str, max_level: int = 3, curr_level: int = 0):
    full_url = base_url + url
    subfolders = extract_folders(full_url)
    if curr_level < max_level:
        for subfolder in subfolders:
            subfolder_url = subfolders[subfolder]
            subfolders[subfolder] = extract_folders_rec(subfolder_url, max_level, curr_level + 1)
    return subfolders

In [4]:
# show first level of archive
hierarchy = extract_folders_rec(base_url + archive_parent, max_level=1)
hierarchy = [(folder, subfolder, subfolder_url) for folder, content in hierarchy.items() for subfolder, subfolder_url in content.items()]
hierarchy = pd.DataFrame(data, columns=['folder', 'subfolder', 'url'])
hierarchy = hierarchy.set_index(['folder', 'subfolder'])
hierarchy

NameError: name 'data' is not defined

The following functions are filters for which folders to extract. It follows the following rules:

- **language:** files are often duplicated for the 4 languages (de, fr, it, en). When it is the case we download only one set in the following order of preference: en - fr - de. German is the default (always present).
- **too specific:** some files are not interesting for now (too specific or too regional). We don't download the snowprofiles and the regional snow report,
- **color or black and white:** maps are available in color and in black and white. Colors are easier for computer vision algorithm, so we drop the black and white map.

In [None]:
def folders_filter(folders: Dict[str, str]) -> Dict[str, str]:
    # language picking
    if 'en' in folders:
        return {'en': folders['en']}
    if 'fr' in folders:
        return {'fr': folders['fr']}
    if 'de' in folders:
        return {'de': folders['de']}
    
    new_folders = folders.copy()
    for key in folders:
        if 'regional' in key.lower() or 'régional' in key.lower():
            new_folders.pop(key)
        if 'icône' in key.lower() or 'icone' in key.lower():
            new_folders.pop(key)
        if 'Schneedeckenstabilität' in key:
            new_folders.pop(key)
    return new_folders

def files_filter(files_url: List[str]) -> List[str]:
    new_files_url = []
    for fu in files_url:
        f = path.basename(fu)
        if 'bw' not in f or 'bw.txt' in f:
            new_files_url.append(fu)
    return new_files_url

In [None]:
from os import path
import os

def extract_files(url: str):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    files = [folder.find('a')['href'] for folder in soup.findAll(class_='linkedListPoint')]
    return files

# u = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F2018%2Fhstop%2Ffr%2Fgif%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=3a2d286c7428ec5abc465a7412ad6f65'
# extract_files(u)

def fetch_all(url: str, dest: str, curr_path: str = '', count: int = 0):
    fs = extract_files(url)
    fs = files_filter(fs)
    for file_url in fs:
        dest_file = path.join(dest, curr_path, path.basename(file_url))
        if not os.path.exists(dest_file):
            content = requests.get(path.join(base_url, file_url)).content
            with open(dest_file, 'wb') as f:
                f.write(content)
            count += 1
        print('count {}\t{}'.format(count, path.join(curr_path, path.basename(file_url))), end='\r')

    sub_directories = extract_folders(url)
    sub_directories = folders_filter(sub_directories)
    for name, sub_url in sub_directories.items():
        new_path = path.join(curr_path, name)
        create_dir(path.join(dest, new_path))
        count = fetch_all(base_url + sub_url, dest, curr_path=new_path, count=count)
        
    return count

def create_dir(d):
    if not os.path.exists(d):
        os.makedirs(d)


In [None]:
def files_generator(url: str):
    fs = extract_files(url)
    fs = files_filter(fs)
    for file_url in fs:
        yield file_url

    sub_directories = extract_folders(url)
    sub_directories = folders_filter(sub_directories)
    for name, sub_url in sub_directories.items():
        for f in files_generator(base_url + sub_url):
            yield f
    

In [None]:
entry_point = 'https://www.slf.ch/fr/bulletin-davalanches-et-situation-nivologique/archives.html?tx_wslavalanches_archiv%5Bpath%5D=%2Fuser_upload%2Fimport%2Flwdarchiv%2Fpublic%2F&tx_wslavalanches_archiv%5Baction%5D=showArchiv&tx_wslavalanches_archiv%5Bcontroller%5D=Avalanche&cHash=c71751a643ec4629e21b0306033ccd59'
destination = '../data2/'
# no bw, en-fr-de in order, no profile, no regional, no icone

# fetch_all(entry_point, destination)
with open('files_to_download', 'w') as dest:
    dest.writelines(map(lambda x: base_url + x + '\n', files_generator(entry_point)))


Now we can use the python script `../src/dowload.py` to fetch the ~30'000 files in the directory structure.

```
python3 src/download.py notebooks/files_to_download ./data/slf --prefix https://www.slf.ch/fileadmin/user_upload/import/lwdarchiv/public/ --nproc 4
```