<a href="https://colab.research.google.com/github/takiyama0617/technical_analysis/blob/master/%E6%9C%89%E5%A0%B1%E3%82%AD%E3%83%A3%E3%83%83%E3%83%81%E3%83%A3%E3%83%BC%E3%81%8B%E3%82%89XBRL%E3%81%AEURL%E3%82%92%E5%8F%96%E5%BE%97%E3%81%99%E3%82%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# １．有報キャッチャーからURLを取得
XBRLのURLリストが dat_download{ file_info_str }.csv（例: dat_download_200401_200426.csv）として出力される

In [0]:
import os
import csv
import time
import re
import requests
import xml.etree.ElementTree as ET
from tqdm import tqdm
from datetime import datetime, timedelta


class yuho_catcher():
    """Collect XBRL URLs of securities reports from the ufocatch Atom feed.

    Feed pages are downloaded from ``base_url``, filtered down to entries
    whose title looks like a securities report of a stock company and whose
    ``updated`` timestamp falls in ``[since, until)``, and the matching
    entries are written to ``dat_download{file_info_str}.csv``.
    """

    def __init__(self, since, until, base_dir=None):
        """Configure the collection window and the output directory.

        Args:
            since: Naive ``datetime``; inclusive lower bound for ``updated``.
            until: Naive ``datetime``; exclusive upper bound for ``updated``.
            base_dir: Directory for the CSV file. Defaults to the current
                working directory at construction time.
        """
        self.csv_tag = ['id', 'title', 'cd', 'url', 'update']
        self.encode_type = 'utf-8'
        # Seconds to pause between requests; shorter intervals get throttled.
        self.wait_time = 2
        # Seconds before a single HTTP request is abandoned, so that a
        # stalled connection cannot block the crawl forever.
        self.timeout = 30
        self.base_url = 'http://resource.ufocatch.com/atom/edinetx/'
        self.namespace = '{http://www.w3.org/2005/Atom}'
        # Set by get_link() once an entry older than ``since`` is seen.
        self.out_of_since = False
        self.since = since
        self.until = until
        self.file_info_str = since.strftime(
            '_%y%m%d_') + until.strftime('%y%m%d')
        self.base_path = str(os.getcwd() if base_dir is None else base_dir)

    def get_link_info_str(self, ticker_symbol):
        """Download one feed document and return its body as text.

        Args:
            ticker_symbol: Path suffix appended to ``base_url`` (a page
                number or a ``query/<symbol>`` string).

        Returns:
            The response body decoded by ``requests``.

        Raises:
            requests.RequestException: If the initial attempt and all three
                retries fail (connection error, timeout or HTTP error status).
        """
        url = self.base_url + str(ticker_symbol)
        retry = 3
        for count in range(retry + 1):
            try:
                response = requests.get(url, timeout=self.timeout)
                # Treat 4xx/5xx answers as failures so they are retried
                # instead of being handed to the XML parser.
                response.raise_for_status()
                return response.text
            except requests.RequestException:
                print(f'{ticker_symbol} のアクセスに失敗しました。[ {count} ]')
                if count >= retry:
                    raise
                time.sleep(3)

    def parse_xml(self, string):
        """Parse an Atom document and return its root element."""
        # Registering the namespace only affects later serialization; it
        # has no influence on parsing.
        ET.register_namespace('', self.namespace[1:-1])
        return ET.fromstring(string)

    def get_link(self, tree):
        """Extract the XBRL links of in-window securities reports.

        Entries are visited in document order. Entries at or after ``until``
        are skipped; the first securities-report entry older than ``since``
        sets ``self.out_of_since`` and stops the scan.

        Args:
            tree: Root element returned by :meth:`parse_xml`.

        Returns:
            Dict mapping entry id to a row dict with the keys of
            ``self.csv_tag``.
        """
        ns = self.namespace
        yuho_dict = {}
        for el in tree.findall('.//' + ns + 'entry'):
            title = el.find(ns + 'title').text
            if not self.is_yuho(title):
                continue
            updated = el.find(ns + 'updated').text
            checked = self.time_check(updated)
            if not checked['until']:
                continue
            if not checked['since']:
                self.out_of_since = True
                return yuho_dict
            _id = el.find(ns + 'id').text
            links = el.findall('./' + ns + 'link[@type="text/xml"]')
            # First XML link that points at the public XBRL document.
            url = next(
                (href for href in (link.get('href', '') for link in links)
                 if '.xbrl' in href and 'PublicDoc' in href),
                None)
            if url is None:
                continue
            cd = re.sub(r'^【(\w+)】.*', r"\1", title)
            yuho_dict[_id] = {'id': _id, 'title': title,
                              'cd': cd, 'url': url, 'update': updated}
        return yuho_dict

    def is_yuho(self, title):
        """Return True if *title* names a securities report of a stock company."""
        text = str(title)
        return ('有価証券報告書' in text
                and '株式会社' in text
                and '内国投資信託受益証券' not in text)

    def time_check(self, update):
        """Compare an entry timestamp with the collection window.

        Args:
            update: Timestamp formatted as ``YYYY-MM-DDTHH:MM:SS+09:00``.

        Returns:
            ``{'since': bool, 'until': bool}``; ``since`` is True when the
            timestamp is at or after ``self.since``, ``until`` is True when
            it is strictly before ``self.until``.

        Raises:
            ValueError: If *update* does not match the expected format.
        """
        updated_time = datetime.strptime(update, '%Y-%m-%dT%H:%M:%S+09:00')
        return {'since': updated_time >= self.since, 'until': updated_time < self.until}

    def dump_file(self, file, info_dict, tag, encode_type):
        """Write the rows of *info_dict* to a CSV file under ``base_path``.

        Args:
            file: File name relative to ``self.base_path``.
            info_dict: Mapping whose values are row dicts.
            tag: Column names, written as the header row.
            encode_type: Text encoding of the output file.
        """
        path = os.path.join(self.base_path, file)
        # newline='' lets the csv writer control line endings itself, so the
        # explicit '\n' terminator is not rewritten by the text layer.
        with open(path, 'w', encoding=encode_type, newline='') as of:
            writer = csv.DictWriter(of, tag, lineterminator='\n')
            writer.writeheader()
            writer.writerows(info_dict.values())

    def _store_page(self, tree, result_dict):
        """Merge the matches of one feed page into *result_dict* and rewrite the CSV."""
        info_dict = self.get_link(tree)
        if info_dict:
            result_dict.update(info_dict)
            # Rewritten after every productive page so that partial results
            # survive an interrupted crawl.
            self.dump_file(
                f'dat_download{ self.file_info_str }.csv', result_dict, self.csv_tag, self.encode_type)

    def craete_xbrl_url_json_each_symbols(self, list_symbols):
        """Query the feed once per ticker symbol and write the matches to CSV.

        Args:
            list_symbols: Iterable of ticker symbols; each one is requested
                as ``query/<symbol>``.
        """
        print(
            f'since:{ self.since } ,until:{ self.until } ({ self.file_info_str })')
        result_dict = {}
        for t_symbol in tqdm(list_symbols):
            response_string = self.get_link_info_str(f'query/{t_symbol}')
            self._store_page(self.parse_xml(response_string), result_dict)
            time.sleep(self.wait_time)
        print('complete a download!!')

    def craete_xbrl_url_json_from_latest(self):
        """Walk the feed from page 1 until the window is left, writing matches to CSV.

        The walk stops when an entry older than ``since`` is reached, or when
        a page contains no entries at all.
        """
        print(
            f'since:{ self.since } ,until:{ self.until } ({ self.file_info_str })')
        # A previous crawl may have left the flag set; start clean.
        self.out_of_since = False
        result_dict = {}
        entry_path = './/' + self.namespace + 'entry'
        page = 1
        while True:
            print(f'page{page}, loading...')
            tree = self.parse_xml(self.get_link_info_str(page))
            if tree.find(entry_path) is None:
                # No entries on this page: without this guard the loop would
                # never terminate when ``since`` predates the whole feed.
                break
            self._store_page(tree, result_dict)
            if self.out_of_since:
                break
            time.sleep(self.wait_time)
            page += 1
        print('complete a download!!')


In [0]:
from datetime import datetime

# Collection window: reports updated on or after `since` and strictly
# before `until`.
since, until = (
    datetime.strptime(day, '%Y-%m-%d')
    for day in ('2020-04-01', '2020-04-26')
)
yuho = yuho_catcher(since, until)
yuho.craete_xbrl_url_json_from_latest()
# Alternative: query specific ticker symbols instead of walking the feed.
# yuho.craete_xbrl_url_json_each_symbols([7203])

since:2020-04-01 00:00:00 ,until:2020-04-26 00:00:00 (_200401_200426)
page1, loading...
page2, loading...
page3, loading...
page4, loading...
page5, loading...
page6, loading...
page7, loading...
page8, loading...
page9, loading...
page10, loading...
complete a download!!
