# Crawl StatementDog

In [1]:
import re, sys, time, json, requests, os
import pandas as pd
import multiprocessing as mp

In [3]:
# Load the list of listed companies straight from the TEJ page with pandas.
# read_html yields one DataFrame per HTML table found; the second one is kept.
page_tables = pd.read_html('http://www.tej.com.tw/webtej/doc/uid.htm')
df = page_tables[1]

In [4]:
# Collect the cell that sits directly below every row whose first column
# contains '類股', then glue those cells into one space-separated string
header_rows = [row for row in range(len(df)) if '類股' in df.loc[row, 0]]
tables = ' '.join(df.loc[row + 1, 0] for row in header_rows)
print(tables[:70])

1101 台泥  1102 亞泥  1103 嘉泥  1104 環泥  1107 建台  1109 信大  1110 東泥 1201 味全 


In [5]:
# Keep only the tokens that begin with a digit (the stock codes); the Chinese
# names and the empty strings left by repeated spaces are discarded
starts_with_digit = re.compile('[0-9]').match
stocks = list(filter(starts_with_digit, tables.split(' ')))
print(stocks[:10])

['1101', '1102', '1103', '1104', '1107', '1109', '1110', '1201', '1203', '1204']


In [55]:
# Crawl the financial indicators of every stock from the StatementDog API.
#
# Each stock is requested up to MAX_ATTEMPTS times. A reply that cannot be
# parsed as JSON, a timeout, or any other transport error counts as a failed
# attempt. Stocks that never succeed are listed in `failed` instead of
# blocking the cell forever.
MAX_ATTEMPTS = 20      # tries per stock before giving up
RETRY_DELAY = 5        # seconds to wait between two tries
REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned

now = time.time()

dic = {}
failed = []
for stock in stocks:
    sys.stdout.write(f'\r{stock}.. ')
    sys.stdout.flush()
    url = f'https://statementdog.com/api/v1/fundamentals/{stock}/2013/1/2018/4/cf'

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            with requests.get(url, timeout=REQUEST_TIMEOUT) as response:
                # Parse the JSON body and store it under the stock code
                dic[stock] = response.json()
            break
        # A body that is not JSON raises a JSONDecodeError (a ValueError);
        # RequestException covers timeouts and connection failures
        except (requests.RequestException, ValueError):
            sys.stdout.write('.. ')
            sys.stdout.flush()
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY)
    else:
        # Every attempt failed: remember the stock and move on
        failed.append(stock)

print(f'Finished. {time.time() - now}s')
if failed:
    print(f'Gave up on {len(failed)} stock(s): {failed}')

9945.. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. .. Finished. 497.0620002746582s


In [45]:
FREQ_KEYS = ('data_m', 'data_q', 'data_y')


def _split_by_frequency(raw):
    """Regroup one stock's parsed JSON payload into monthly/quarterly/yearly frames.

    Assumes ``raw`` maps arbitrary keys to entries shaped like
    ``{'label': str, 'data': [[_, value], ...]}``. This shape is inferred
    from how the payload is indexed here -- TODO confirm against the API.

    An entry goes to the frequency whose time axis ('TimeM', 'TimeQ',
    'TimeY') has the same number of points. When two axes have the same
    length the monthly bucket wins, then the quarterly one. Entries matching
    no axis are dropped.

    Returns:
        dict with the keys 'data_m', 'data_q' and 'data_y', each holding a
        DataFrame with one column per label.

    Raises:
        KeyError, TypeError, AttributeError, IndexError: the payload does not
        have the assumed shape (for example a time axis is missing).
    """
    entries = list(raw.values())

    # Look the time axes up by label so the result does not depend on the
    # order in which the payload happens to list them
    axis_len = {entry['label']: len(entry['data'])
                for entry in entries
                if entry['label'] in ('TimeM', 'TimeQ', 'TimeY')}
    buckets = (('data_m', axis_len['TimeM']),
               ('data_q', axis_len['TimeQ']),
               ('data_y', axis_len['TimeY']))

    grouped = {name: {} for name in FREQ_KEYS}
    for entry in entries:
        for name, length in buckets:
            if len(entry['data']) == length:
                # Each point is a pair; only its second element is kept
                grouped[name][entry['label']] = [point[1] for point in entry['data']]
                break

    # Build every frame in one go instead of growing it column by column
    return {name: pd.DataFrame(columns) for name, columns in grouped.items()}


skipped = []
# Iterate over a snapshot because entries of `dic` are replaced along the way
for stock, data in list(dic.items()):
    sys.stdout.write(f'\r{stock}')
    sys.stdout.flush()

    # Already converted by an earlier run of this cell: leave it untouched
    if isinstance(data, dict) and data.keys() == set(FREQ_KEYS):
        continue

    try:
        dic[stock] = _split_by_frequency(data)
    except (KeyError, TypeError, AttributeError, IndexError):
        # Unexpected payload: keep the raw value and report it afterwards
        skipped.append(stock)

if skipped:
    print(f'\nSkipped {len(skipped)} stock(s) with an unexpected payload: {skipped}')

9945

In [50]:
# Quick look at the result: only the first rows of one stock's monthly frame,
# so the cell output stays short
dic['1215']['data_m'].head(10)

Unnamed: 0,TimeM,每月營收,月每股營收,現金股利殖利率,本益比,單月營收年增率,單月每股營收年增率,單月營收月增率,預期盈餘成長率加殖利率,預期盈餘成長率,預期殖利率,3年平均現金股息殖利率,5年平均現金股息殖利率,3年平均現金股息的16倍,3年平均現金股息的20倍,3年平均現金股息的32倍,5年平均現金股息的16倍,5年平均現金股息的20倍,5年平均現金股息的32倍
0,201301,1570413,6.77,1.96,13.95,23.60,23.60,11.32,8.40,1.92,6.33,5.02,5.23,12.27,15.33,24.53,12.8,16.0,25.6
1,201302,1308013,5.64,1.93,14.22,10.69,10.69,-16.71,8.28,1.92,6.21,4.92,5.13,12.27,15.33,24.53,12.8,16.0,25.6
2,201303,1501808,6.47,1.93,14.16,19.63,19.63,14.82,8.31,1.92,6.24,4.94,5.16,12.27,15.33,24.53,12.8,16.0,25.6
3,201304,1428802,6.16,1.98,11.41,26.65,26.65,-4.86,9.18,2.14,6.97,5.05,5.27,12.27,15.33,24.53,12.8,16.0,25.6
4,201305,1333094,5.75,1.92,9.76,16.04,16.04,-6.70,7.94,2.14,6.01,4.90,5.11,12.27,15.33,24.53,12.8,16.0,25.6
5,201306,1441950,6.21,1.91,9.78,20.86,20.86,8.17,7.92,2.14,6.00,4.89,5.10,12.27,15.33,24.53,12.8,16.0,25.6
6,201307,1535031,6.62,5.15,9.68,30.13,30.13,6.46,7.62,2.38,5.70,4.51,4.89,11.2,14.0,22.4,12.16,15.2,24.32
7,201308,1617716,6.97,5.33,10.02,20.36,20.36,5.39,7.05,2.38,4.91,4.66,5.06,11.2,14.0,22.4,12.16,15.2,24.32
8,201309,1667967,7.19,5.39,9.91,24.00,24.00,3.11,7.10,2.38,4.96,4.71,5.12,11.2,14.0,22.4,12.16,15.2,24.32
9,201310,1608977,6.93,5.22,10.22,14.67,14.67,-3.54,6.95,2.46,4.81,4.57,4.96,11.2,14.0,22.4,12.16,15.2,24.32
