# Download Monthly EWS

## From January 2000 to December 2009

In [1]:
from urllib.parse import urljoin
import itertools
import os
import requests

# Make the directory where the downloaded CSV files are saved.
# exist_ok=True keeps a re-run from failing when the directory already exists.
dir_path = '../data/1_EWS_raw'
os.makedirs(dir_path, exist_ok=True)

# Base URL that the 2000-2009 CSV file names are resolved against
base_url_from_200001_to_200912 = (
    'https://www5.cao.go.jp/keizai3/kako_csv/kako2_watcher.html'
)

# Year -> remote file-name prefix: 'h' followed by (year - 1988),
# i.e. '2000' -> 'h12', ..., '2009' -> 'h21'
hyy_dic = {str(year): f'h{year - 1988}' for year in range(2000, 2010)}
# Two-digit month strings '01' .. '12'
mm_list = [f'{month:02d}' for month in range(1, 13)]

def DL_csv(base_url, key, val, mm, target):
    """Download one monthly CSV file and save it under ``dir_path``.

    The remote file name is ``{val}{mm}_{target}.csv``, resolved relative to
    ``base_url``; the local copy is written as ``{key}{mm}_{target}.csv``.

    Args:
        base_url: URL the remote file name is resolved against with urljoin.
        key: Prefix of the local file name (callers pass a year, e.g. '2000').
        val: Prefix of the remote file name (callers pass e.g. 'h12').
        mm: Month string (callers pass two digits, e.g. '01').
        target: Suffix of the file stem (callers pass 'watcher4'/'watcher5').

    Returns:
        None. Nothing is written when the server answers with an HTTP error.
    """
    csv_url = urljoin(base_url, f'{val}{mm}_{target}.csv')
    csv_filename = f'{key}{mm}_{target}.csv'
    file_path = os.path.join(dir_path, csv_filename)

    # Download the CSV file; the timeout keeps a stalled connection from
    # blocking the whole download loop indefinitely.
    csv_res = requests.get(csv_url, timeout=60)

    # Without this check a 404/500 error page would be saved as a ".csv"
    # file and reported as a completed download.
    if not csv_res.ok:
        print(f"Download failed ({csv_res.status_code}): {csv_url}")
        return

    with open(file_path, 'wb') as f:
        f.write(csv_res.content)

    print(f"Download completed: {csv_filename}")
    
# Walk every year, then every month within it, and fetch both files for that
# month: 'watcher4' (current) first, then 'watcher5' (future).
for hyy_key, hyy_val in hyy_dic.items():
    for mm in mm_list:
        for target in ('watcher4', 'watcher5'):
            DL_csv(base_url_from_200001_to_200912, hyy_key, hyy_val, mm, target)

Download completed: 200001_watcher4.csv
Download completed: 200001_watcher5.csv
Download completed: 200002_watcher4.csv
Download completed: 200002_watcher5.csv
Download completed: 200003_watcher4.csv
Download completed: 200003_watcher5.csv
Download completed: 200004_watcher4.csv
Download completed: 200004_watcher5.csv
Download completed: 200005_watcher4.csv
Download completed: 200005_watcher5.csv
Download completed: 200006_watcher4.csv
Download completed: 200006_watcher5.csv
Download completed: 200007_watcher4.csv
Download completed: 200007_watcher5.csv
Download completed: 200008_watcher4.csv
Download completed: 200008_watcher5.csv
Download completed: 200009_watcher4.csv
Download completed: 200009_watcher5.csv
Download completed: 200010_watcher4.csv
Download completed: 200010_watcher5.csv
Download completed: 200011_watcher4.csv
Download completed: 200011_watcher5.csv
Download completed: 200012_watcher4.csv
Download completed: 200012_watcher5.csv
Download completed: 200101_watcher4.csv


## From January 2010 onward

In [2]:
from bs4 import BeautifulSoup as bs
import re

# Base URLs of the index pages that are scanned for monthly page links.
# Going by the variable names, the first covers 2010-01 to 2019-12 and the
# second covers 2020-01 onward.
base_url_from_201001_to_201912 = 'https://www5.cao.go.jp/keizai3/kako_watcher.html'
base_url_from_202001 = 'https://www5.cao.go.jp/keizai3/watcher_index.html'

# function to get monthly links
def get_monthly_links(base_url):
    """Return the sorted monthly page links found on an index page.

    Fetches ``base_url`` and collects the ``href`` of every ``<a>`` element
    whose class matches ``bulletLink``. A link is kept only when it starts
    with '20', its leading four characters are a year later than 2009, and it
    does not start with '2010/01'.

    Args:
        base_url: URL of the index page to scan.

    Returns:
        The kept ``href`` strings, sorted in ascending order.
    """
    # get monthly page bullet links
    res = requests.get(base_url, timeout=60)
    soup = bs(res.content, 'html.parser')
    monthly_bullet_links = soup.find_all('a', class_=re.compile('bulletLink'))

    # extract monthly page links
    monthly_links = []
    for monthly_bullet_link in monthly_bullet_links:
        monthly_link = monthly_bullet_link.get('href')
        # An anchor without an href yields None; there is nothing to collect.
        if not monthly_link or not monthly_link.startswith('20'):
            continue
        year = monthly_link[:4]
        # Ignore links whose prefix merely starts with '20' but is not a year.
        if not year.isdecimal():
            continue
        # Skip links up to and including 2010/01: the 2000-01..2009-12 files
        # are fetched separately by the first download loop of this notebook.
        if int(year) <= 2009 or monthly_link[:7] == '2010/01':
            continue
        monthly_links.append(monthly_link)
    return sorted(monthly_links)

monthly_links_from_201001_to_201912 = get_monthly_links(base_url_from_201001_to_201912)
monthly_links_from_202001 = get_monthly_links(base_url_from_202001)
monthly_links = monthly_links_from_201001_to_201912 + monthly_links_from_202001


def _previous_month_key(monthly_link):
    """Return 'YYYYMM' for the month before the 'YYYY/MM' prefix of a link.

    Uses integer year/month arithmetic so that a January link rolls back to
    December of the previous year with the year kept at four digits
    (for example '2018/01...' -> '201712').
    """
    year, month = int(monthly_link[:4]), int(monthly_link[5:7])
    if month == 1:
        year, month = year - 1, 12
    else:
        month -= 1
    return f'{year}{month:02d}'


# create dictionary (key: month preceding the one in the link, value: link)
key_list = [_previous_month_key(m) for m in monthly_links]
url_dict = dict(zip(key_list, monthly_links))

# function to download CSVs
def download_csv(key, url, csv_link_tag):
    """Download the CSV that ``csv_link_tag`` points to and save it locally.

    The tag's ``href`` is resolved against ``url``; the file is written under
    ``dir_path`` as ``{key}_{basename of the CSV URL}``.

    Args:
        key: Prefix of the local file name (callers pass a 'YYYYMM' string).
        url: URL of the page the link tag was found on.
        csv_link_tag: The ``<a>`` tag of the CSV link, or a falsy value when
            the page has no such link.

    Returns:
        None. Nothing is written when the link is missing or the server
        answers with an HTTP error.
    """
    if not csv_link_tag:
        print(f"{key}: CSV link not found.")
        return

    csv_url = urljoin(url, csv_link_tag['href'])
    csv_filename = f'{key}_{os.path.basename(csv_url)}'
    file_path = os.path.join(dir_path, csv_filename)

    # Download the CSV file; the timeout keeps a stalled connection from
    # blocking the whole download loop indefinitely.
    csv_res = requests.get(csv_url, timeout=60)

    # Without this check a 404/500 error page would be saved as a ".csv"
    # file and reported as a completed download.
    if not csv_res.ok:
        print(f"{key}: download failed ({csv_res.status_code}): {csv_url}")
        return

    with open(file_path, 'wb') as f:
        f.write(csv_res.content)

    print(f"Download completed: {csv_filename}")

# For every (month key, monthly page link) pair: fetch the monthly page, then
# download the CSV files linked from it.
for key, url in url_dict.items():
    # Absolute URL of the monthly page, built once and reused below
    page_url = 'https://www5.cao.go.jp/keizai3/' + url

    res = requests.get(page_url)
    res.encoding = 'utf-8'
    soup = bs(res.text, 'html.parser')

    # Look up each CSV link by its exact href and download it:
    # 'watcher4.csv' (current) first, then 'watcher5.csv' (future).
    for csv_href in ('watcher4.csv', 'watcher5.csv'):
        download_csv(key, page_url, soup.find('a', href=csv_href))

Download completed: 201001_watcher4.csv
Download completed: 201001_watcher5.csv
Download completed: 201002_watcher4.csv
Download completed: 201002_watcher5.csv
Download completed: 201003_watcher4.csv
Download completed: 201003_watcher5.csv
Download completed: 201004_watcher4.csv
Download completed: 201004_watcher5.csv
Download completed: 201005_watcher4.csv
Download completed: 201005_watcher5.csv
Download completed: 201006_watcher4.csv
Download completed: 201006_watcher5.csv
Download completed: 201007_watcher4.csv
Download completed: 201007_watcher5.csv
Download completed: 201008_watcher4.csv
Download completed: 201008_watcher5.csv
Download completed: 201009_watcher4.csv
Download completed: 201009_watcher5.csv
Download completed: 201010_watcher4.csv
Download completed: 201010_watcher5.csv
Download completed: 201011_watcher4.csv
Download completed: 201011_watcher5.csv
Download completed: 201012_watcher4.csv
Download completed: 201012_watcher5.csv
Download completed: 201101_watcher4.csv
