# Crawl Stocks

In [3]:
import requests, re, time, os, warnings, utils
import datetime
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

In [90]:
# 先定義方法將下載下的格式轉成我們想要的DataFrame格式
def to_pd(date, text):
    """Convert one day's decoded JSON payload into a DataFrame of stocks.

    Args:
        date: Label for the day; only used in the progress messages.
        text: Mapping decoded from the JSON response. Column names are read
            from ``text['fields5']`` and rows from ``text['data5']``; in both,
            the first element is the security code.

    Returns:
        A DataFrame indexed by security code that keeps only the rows whose
        code is exactly four digits, with the sign column reduced to ``'+'``,
        ``'-'`` or NaN and thousands separators stripped from the volume
        columns. Returns ``None`` (after printing "no data this day") when a
        ``KeyError`` occurs, i.e. when the payload lacks ``fields5``/``data5``
        or one of the expected columns.
    """
    def _sign(cell):
        # The cell may wrap the sign in markup; keep only the first +/-.
        found = re.search(r'[+-]', cell)
        return found.group() if found else np.nan

    try:
        columns = text['fields5'][1:]

        # Collect the rows first and build the frame once: inserting rows one
        # at a time with ``df.loc[code] = ...`` re-allocates the frame on
        # every insert. A dict keyed by code keeps the "last row wins"
        # outcome of the former per-row assignment.
        rows = {}
        for record in text['data5']:
            code = record[0]
            # fullmatch, unlike ``^...$`` with re.match, rejects a code that
            # carries a trailing newline.
            if re.fullmatch(r'[0-9]{4}', code):
                rows[code] = record[1:]
        df = pd.DataFrame(list(rows.values()), index=list(rows), columns=columns)

        # Normalise the cell contents.
        df['漲跌(+/-)'] = df['漲跌(+/-)'].apply(_sign)
        volume_cols = ['成交金額', '成交股數', '成交筆數']
        df[volume_cols] = df[volume_cols].replace(',', '', regex=True)
    except KeyError:
        # Deliberately best-effort: any missing key or column is treated as a
        # day without data so a long crawl keeps going.
        print(date, 'no data this day')
        return None

    print(date, 'success.')
    return df

In [83]:
def crawl_price(date, timeout=30):
    """Download one day's TWSE ``MI_INDEX`` report and convert it with ``to_pd``.

    Args:
        date: Day to fetch as a string such as ``'2018-09-17'``; the dashes
            are stripped before it is sent as the ``date`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would block the whole crawl.

    Returns:
        Whatever ``to_pd`` returns for the decoded JSON body.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # The query string is fixed apart from the date, so the key/value pairs
    # are kept in a flat list and unpacked straight into the URL template.
    data = ['response', 'json', 'date', date.replace('-', ''), 'type', 'ALL']
    url = 'http://www.twse.com.tw/exchangeReport/MI_INDEX?{}={}&{}={}&{}={}'.format(*data)

    # Issue the GET request, decode the JSON body and hand it to to_pd.
    with requests.get(url, timeout=timeout) as r:
        text = r.json()
        return to_pd(date, text)

In [6]:
def create_crawl(n_days, date=None, data=None):
    """Generate crawl results for ``n_days`` consecutive days, walking backwards.

    Args:
        n_days: Number of days to crawl; nothing is yielded when it is not
            positive.
        date: First day to crawl. A falsy value means "now".
        data: Accepted for interface compatibility; it is normalised to a
            dict here but not otherwise used by this generator.

    Yields:
        A tuple ``(date, date_string, df)`` where ``date_string`` is the ISO
        day that was just crawled, ``df`` is the result of ``crawl_price`` for
        that day, and ``date`` has already been moved back by one day (so it
        is the next day that would be crawled).
    """
    # Defaulting to None and resolving here lets callers omit either argument.
    data = data if data else {}
    date = date if date else datetime.datetime.now()

    one_day = datetime.timedelta(days=1)
    remaining = n_days
    # Hand each day's result to the caller as soon as it is available.
    while remaining > 0:
        day_label = date.isoformat()[:10]
        frame = crawl_price(day_label)
        date = date - one_day
        remaining -= 1
        yield date, day_label, frame

In [7]:
def start_craw(n_days, date=None):
    """Crawl ``n_days`` days backwards from ``date`` into the global ``data``.

    ``data`` is deliberately neither a parameter nor a return value: the
    function writes into the module-level ``data`` dict, so everything
    collected so far stays available in the running session even if an
    unexpected error interrupts the crawl.

    Args:
        n_days: Number of days to crawl.
        date: First day to crawl. A falsy value means "now".
    """
    if not date:
        date = datetime.datetime.now()

    # Crawling stops once the cursor reaches this date (per the original
    # note, the data is only provided from this point on).
    earliest = datetime.datetime(2004, 2, 11)
    cursor = date
    crawler = create_crawl(n_days=n_days, date=date, data=data)

    while True:
        # Sleep first so that every iteration, including retries, pauses;
        # this helps keep the remote host from blocking our IP.
        time.sleep(5)

        if cursor <= earliest:
            print('Total Finished.')
            return

        try:
            cursor, day_label, frame = next(crawler)
        except requests.exceptions.ConnectionError as err:
            # The host sometimes refuses the connection; rebuild the generator
            # so the crawl resumes from the day that failed, with the number
            # of remaining days reduced by those already completed.
            print(err)
            completed = (date - cursor).days
            crawler = create_crawl(n_days=n_days - completed, date=cursor, data=data)
        except StopIteration:
            # The generator is exhausted: every requested day has been crawled.
            print('Total Finished.')
            return
        else:
            data[day_label] = frame

In [91]:
# Remember the start time so the total run time can be reported at the end.
now = time.time()
# Module-level results dict that start_craw fills in place, keyed by date string.
data = {}
# Run the crawler for 10 days, counting back from the current moment.
start_craw(10, date=datetime.datetime.now())
# Persist the results in binary form via utils.save_obj (per the original note
# it defaults to the highest protocol and roughly halves the size; see utils,
# which is not shown here).
stock_path = os.path.join('data', 'stock.pkl')
utils.save_obj(data, stock_path)
elapsed = time.time() - now
print(elapsed)

2018-09-17 success.
2018-09-16 no data this day
2018-09-15 no data this day
2018-09-14 success.
2018-09-13 success.
2018-09-12 success.
2018-09-11 success.
2018-09-10 success.
2018-09-09 no data this day
2018-09-08 no data this day
Total Finished.
111.17400002479553


In [92]:
# Display the entry stored for 2018-09-17 (the cell output follows below).
data['2018-09-17']

Unnamed: 0,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
0050,元大台灣50,8988960,2440,773023506,86.50,86.50,85.70,85.85,-,0.65,85.80,98,85.85,220,0.00
0051,元大中型100,31000,11,1015780,32.80,32.81,32.72,32.72,-,0.20,32.72,1,32.88,1,0.00
0052,富邦科技,102000,15,5649350,55.75,55.75,55.35,55.40,-,0.35,55.20,2,55.40,1,0.00
0053,元大電子,11000,11,402000,36.72,36.72,36.40,36.48,-,0.04,36.40,1,36.55,1,0.00
0054,元大台商50,2000,2,47300,23.65,23.65,23.65,23.65,+,0.09,23.61,20,23.69,1,0.00
0055,元大MSCI金融,63000,19,1148330,18.34,18.34,18.20,18.24,+,0.08,18.20,1,18.25,4,0.00
0056,元大高股息,5451108,1166,149743585,27.43,27.55,27.29,27.46,+,0.06,27.45,1,27.46,176,0.00
0057,富邦摩台,20000,3,1039000,51.95,51.95,51.95,51.95,-,0.25,51.85,1,52.05,20,0.00
0058,富邦發達,21000,5,1007800,48.20,48.20,47.98,47.98,-,0.02,47.90,1,48.09,5,0.00
0059,富邦金融,0,0,0,--,--,--,--,,0.00,44.03,10,44.18,2,0.00
