# Crawl Stocks

In [2]:
import requests, re, time, os, warnings
import datetime, utils
import pandas as pd
import numpy as np

# Install a global "ignore" filter: every warning raised after this point is suppressed
# (presumably to keep the crawl output below uncluttered).
warnings.filterwarnings('ignore')

In [3]:
# Convert the raw downloaded CSV text into the DataFrame layout we want.
def to_pd(date, text):
    """Parse one day's downloaded quote CSV text into a DataFrame.

    Args:
        date: Label for the day being parsed; only used in the printed log line.
        text: Raw response body. Rows are expected to be quote-delimited,
            i.e. ``"a","b",...,"p",\\r``.

    Returns:
        A DataFrame indexed by '證券代號' whose cells are strings, with the
        thousands separators removed from '成交金額', '成交股數' and '成交筆數'.
        Returns ``None`` when ``text`` contains no usable row (the
        ``IndexError`` raised by looking up the header row is printed).

    Raises:
        ValueError: if a data row does not have as many cells as the header.
        KeyError: if the expected columns are missing from the header.
    """
    df = None
    try:
        # Split on newlines and keep only rows that break into 17 pieces on '",'
        # and do not start with '='. The length test runs first, so an empty
        # line never reaches the line[0] lookup.
        lines = [line for line in text.split('\n')
                 if len(line.split('",')) == 17 and line[0] != '=']

        # The first kept row is the header; keep only the leading run of
        # Chinese characters of each cell as the column name.
        columns = [re.search('[\u4e00-\u9fff]*', cell.replace('\"', '')).group()
                   for cell in lines[0].split('","')]

        # Clean every remaining row: drop the quotes and the trailing ',\r'.
        rows = [[cell.replace('\"', "").replace(',\r', '') for cell in line.split('","')]
                for line in lines[1:]]
        for row in rows:
            # Fail loudly on a malformed row instead of letting pandas pad it.
            if len(row) != len(columns):
                raise ValueError('cannot set a row with mismatched columns')

        # Build the frame in one step; growing it row by row with .loc
        # re-allocates on every insert. dtype=object keeps the cells as the
        # plain strings they were parsed as.
        df = pd.DataFrame(rows, columns=columns, dtype=object)

        # Final shaping: index by security code and strip thousands separators.
        df = df.set_index('證券代號')
        numeric_cols = ['成交金額', '成交股數', '成交筆數']
        df.loc[:, numeric_cols] = df.loc[:, numeric_cols].replace(',', '', regex=True)
        print(date, 'success.')
    except IndexError as e:
        # No usable rows at all, so lines[0] failed.
        print(date, e)

    return df

In [4]:
def crawl_price(date):
    """Download the daily quote CSV for ``date`` and parse it with ``to_pd``.

    Args:
        date: Day to fetch as a string; any '-' characters are removed before
            it is placed in the URL (e.g. '2018-09-14' becomes '20180914').

    Returns:
        Whatever ``to_pd`` returns for the response text.
    """
    # The query string is flat key/value pairs, so a plain sequence unpacked
    # into the URL template is enough; no dict is needed for a GET.
    query = ('response', 'csv', 'date', date.replace('-', ''), 'type', 'ALL')
    url = 'http://www.twse.com.tw/exchangeReport/MI_INDEX?{}={}&{}={}&{}={}'.format(*query)

    # NOTE(review): no timeout is passed, so a stalled server can block here.
    # Issue the GET and hand the body to the parser while the response is open.
    with requests.get(url) as response:
        return to_pd(date, response.text)

In [5]:
# Generator version of the crawl: each day is yielded as soon as it is fetched,
# so results already handed out are not lost if the run is interrupted.
def create_crawl(n_days, date=None, data=None):
    """Yield one crawled day at a time, walking backwards from ``date``.

    Args:
        n_days: Number of calendar days to fetch; nothing is yielded when it
            is not positive.
        date: Day to start from (a datetime); defaults to now when falsy.
        data: Accepted for interface compatibility; the body does not use it.

    Yields:
        ``(next_date, date_string, df)`` where ``date_string`` is the
        'YYYY-MM-DD' label of the day just fetched, ``df`` is the result of
        ``crawl_price`` for it, and ``next_date`` is already one day earlier,
        i.e. the day the following iteration would fetch.
    """
    # None defaults resolved here let callers omit either argument.
    data = data or {}
    date = date or datetime.datetime.now()

    remaining = n_days
    while remaining > 0:
        day_label = date.isoformat()[:10]
        frame = crawl_price(day_label)
        # Step back before yielding so the consumer learns where to resume.
        date = date - datetime.timedelta(days=1)
        remaining -= 1
        yield date, day_label, frame

In [24]:
# Driver for the crawl: it throttles the requests and recovers from refused connections.
def start_craw(n_days, date=None):
    """Crawl ``n_days`` days backwards from ``date`` into the global ``data`` dict.

    ``data`` is deliberately neither a parameter nor a return value: the
    function looks up the module-level name ``data`` and fills it in place
    (``data[date_string] = df``). Whatever has been collected therefore stays
    available in the interpreter even if an unexpected error aborts the run.
    A module-level ``data`` dict must exist before this is called.

    Args:
        n_days: Number of calendar days to fetch.
        date: Day to start from (a datetime); defaults to now when falsy.
    """
    if not date:
        date = datetime.datetime.now()

    # Next day still to be fetched. Initialised up front so the retry branch
    # works even when the very first request is refused (previously that
    # case hit an unbound ``new_date``).
    new_date = date

    # Build the initial generator, then drive it one step per loop iteration.
    generator = create_crawl(n_days=n_days, date=date, data=data)
    while True:
        # Sleep first so every request, including retries, is throttled;
        # this lowers the chance of the remote host blocking our IP.
        time.sleep(5)
        try:
            # If the fetch raises, the tuple assignment never happens and
            # new_date keeps pointing at the day that failed.
            new_date, date_string, df = next(generator)
            data[date_string] = df
        # The remote host sometimes refuses the connection. The exception
        # ends the old generator, so build a new one that resumes from the
        # day that failed with the number of days still outstanding.
        except requests.exceptions.ConnectionError as e:
            print(e)
            generator = create_crawl(n_days=n_days - (date - new_date).days, date=new_date, data=data)
        # An exhausted generator raises StopIteration: report and stop.
        except StopIteration:
            print('Total Finished.')
            break


In [27]:
# Standard script guard: the body runs only when this file is the program
# being executed, not when it is imported as a module.
if __name__ == '__main__':
    # Module-level dict that start_craw fills in place through the global name `data`.
    data = dict()
    # Crawl the 365 calendar days ending now.
    start_craw(n_days=365, date=datetime.datetime.now())
    # Persist the collected dict as a binary file. utils.save_obj is defined
    # outside this file; per the original author's note it uses the highest
    # pickle protocol by default (TODO confirm in utils).
    utils.save_obj(data, os.path.join('data', 'stock.pkl'))

2018-09-15 list index out of range
2018-09-14 success.
2018-09-13 success.
2018-09-12 success.
2018-09-11 success.
2018-09-10 success.
2018-09-09 list index out of range
2018-09-08 list index out of range
2018-09-07 success.
2018-09-06 success.
2018-09-05 success.
2018-09-04 success.
2018-09-03 success.
2018-09-02 list index out of range
2018-09-01 list index out of range
2018-08-31 success.
2018-08-30 success.
2018-08-29 success.
2018-08-28 success.
2018-08-27 success.
2018-08-26 list index out of range
2018-08-25 list index out of range
2018-08-24 success.
2018-08-23 success.
2018-08-22 success.
2018-08-21 success.
2018-08-20 success.
2018-08-19 list index out of range
2018-08-18 list index out of range
2018-08-17 success.
2018-08-16 success.
2018-08-15 success.
2018-08-14 success.
2018-08-13 success.
2018-08-12 list index out of range
2018-08-11 list index out of range
2018-08-10 success.
2018-08-09 success.
2018-08-08 success.
2018-08-07 success.
2018-08-06 success.
2018-08-05 list

2017-12-24 list index out of range
2017-12-23 list index out of range
2017-12-22 success.
2017-12-21 success.
2017-12-20 success.
2017-12-19 success.
2017-12-18 success.
2017-12-17 list index out of range
2017-12-16 list index out of range
2017-12-15 success.
2017-12-14 success.
2017-12-13 success.
2017-12-12 success.
2017-12-11 success.
2017-12-10 list index out of range
2017-12-09 list index out of range
2017-12-08 success.
2017-12-07 success.
2017-12-06 success.
2017-12-05 success.
2017-12-04 success.
2017-12-03 list index out of range
2017-12-02 list index out of range
2017-12-01 success.
2017-11-30 success.
2017-11-29 success.
2017-11-28 success.
2017-11-27 success.
2017-11-26 list index out of range
2017-11-25 list index out of range
2017-11-24 success.
2017-11-23 success.
2017-11-22 success.
2017-11-21 success.
2017-11-20 success.
2017-11-19 list index out of range
2017-11-18 list index out of range
2017-11-17 success.
2017-11-16 success.
2017-11-15 success.
2017-11-14 success.


In [29]:
# 偷看一下我們抓下來的資料, 裡面是每一天存一個dataframe, 透過這個結構我們可以拿出任何我們想要的訊息
data['2018-09-14']

Unnamed: 0_level_0,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
證券代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1101,台泥,13834200,6222,545879305,39.10,39.80,38.55,39.65,+,0.55,39.65,1,39.70,39,11.49
1102,亞泥,23391527,10885,884179826,37.25,38.40,37.05,37.80,+,0.80,37.80,19,37.85,54,12.90
1103,嘉泥,406983,149,5457518,13.35,13.50,13.30,13.35,-,0.05,13.35,49,13.40,20,6.15
1104,環泥,114070,79,2324025,20.30,20.45,20.30,20.40,+,0.10,20.40,1,20.45,12,11.66
1108,幸福,174202,29,1359995,7.84,7.84,7.78,7.78,-,0.04,7.78,32,7.84,5,0.00
1109,信大,179200,68,2678339,15.00,15.00,14.90,14.95,-,0.05,14.90,14,14.95,1,9.29
1110,東泥,70000,47,968350,14.00,14.00,13.75,13.95,-,0.05,13.85,1,13.95,9,63.41
1201,味全,633722,350,14170657,22.40,22.50,22.30,22.40,+,0.05,22.40,8,22.45,14,16.72
1203,味王,13445,18,343701,25.60,25.65,25.40,25.40,-,0.20,25.35,1,25.50,1,13.58
1210,大成,1585570,1120,58574967,36.85,37.10,36.70,37.00,+,0.30,37.00,3,37.05,43,10.22
