In [1]:
import datetime
import pandas as pd

target_contracts = ['TX', 'MTX']

In [2]:
dealt_url_template = 'https://www.taifex.com.tw/file/taifex/Dailydownload/DailydownloadCSV/{}'
dealt_filename_template = 'Daily_{}.zip'

date = datetime.date(2021, 12, 15)
s3_bucket = 'indextracker'

dealt_filename = dealt_filename_template.format(date.strftime('%Y_%m_%d'))

dealt_s3_key = f'tw/futures/raw/{dealt_filename}'
dealt_url = dealt_url_template.format(dealt_filename)

dealt_url

'https://www.taifex.com.tw/file/taifex/Dailydownload/DailydownloadCSV/Daily_2021_12_15.zip'

In [3]:
def get_zipped_objects_from_s3(bucket, key):
    import boto3, zipfile, io
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    with io.BytesIO(obj['Body'].read()) as f:
        with zipfile.ZipFile(f) as zf:
            for file in zf.namelist():
                yield zf.read(file)

In [4]:
dealt_content = list(get_zipped_objects_from_s3(s3_bucket, dealt_s3_key))[0]
dealt_arr = dealt_content.decode('big5').split('\r\n')

dealt_arr[0]

'成交日期,商品代號,到期月份(週別),成交時間,成交價格,成交數量(B+S),近月價格,遠月價格,開盤集合競價 '

In [5]:
dealt_arr_2d = [x.split(',') for x in dealt_arr[1:]]

dealt_df = pd.DataFrame(dealt_arr_2d)
dealt_df.columns = ['date', 'contract', 'expire', 'time', 'price', 'volume', 'near_price', 'far_price', 'is_open_auction']
dealt_df

Unnamed: 0,date,contract,expire,time,price,volume,near_price,far_price,is_open_auction
0,20211214,BRF,202202,204043,2072,30,-,-,
1,20211214,BRF,202202,211303,2066,30,-,-,
2,20211214,BRF,202202,212753,2049.5,2,-,-,
3,20211214,BRF,202202,215828,2036.5,2,-,-,
4,20211214,BRF,202202,235258,2036.5,30,-,-,
...,...,...,...,...,...,...,...,...,...
398552,20211215,ZFF,202201,133248,1666,2,-,-,
398553,20211215,ZFF,202201,134141,1667,2,-,-,
398554,20211215,ZFF,202201,134439,1667.4,2,-,-,
398555,20211215,ZFF,202201,134458,1667.4,2,-,-,


In [6]:
dealt_df['contract'] = dealt_df.contract.str.strip()
dealt_df = dealt_df[dealt_df['contract'].isin(target_contracts)]

In [7]:
dealt_df = dealt_df.drop('is_open_auction', axis=1)
dealt_df

Unnamed: 0,date,contract,expire,time,price,volume,near_price,far_price
98146,20211214,MTX,202112,150000,17590,110,-,-
98147,20211214,MTX,202112,150000,17590,2,-,-
98148,20211214,MTX,202112,150000,17590,2,-,-
98149,20211214,MTX,202112,150000,17589,2,-,-
98150,20211214,MTX,202112,150000,17588,2,-,-
...,...,...,...,...,...,...,...,...
389871,20211215,TX,202206,133050,17386,2,-,-
389872,20211214,TX,202209,174340,16910,2,-,-
389873,20211214,TX,202209,174340,16910,2,-,-
389874,20211214,TX,202209,212115,16900,2,-,-


In [8]:
spread_url_template = 'https://www.taifex.com.tw/file/taifex/Dailydownload/DailydownloadCSV_C/{}'
spread_filename_template = 'Daily_{}_C.zip'

spread_filename = spread_filename_template.format(date.strftime('%Y_%m_%d'))

spread_s3_key = f'tw/futures/raw/{spread_filename}'
spread_url = spread_url_template.format(spread_filename)

spread_url

'https://www.taifex.com.tw/file/taifex/Dailydownload/DailydownloadCSV_C/Daily_2021_12_15_C.zip'

In [9]:
spread_content = list(get_zipped_objects_from_s3(s3_bucket, spread_s3_key))[0]
spread_arr = spread_content.decode('big5').split('\r\n')

spread_arr[0]

'成交日期,商品代號,到期月份(週別),成交時間,成交價格,成交數量(B+S),近月價格,遠月價格,屬價差對價差成交者 '

In [10]:
spread_arr_2d = [x.split(',') for x in spread_arr[1:]]

spread_df = pd.DataFrame(spread_arr_2d)
spread_df.columns = ['date', 'contract', 'expire', 'time', 'price', 'volume', 'near_price', 'far_price', 'is_with_spread']
spread_df

Unnamed: 0,date,contract,expire,time,price,volume,near_price,far_price,is_with_spread
0,20211215,BTF,202112/202201,110022,0,112,4545,4545,*
1,20211215,BTF,202112/202201,110038,0,112,4545,4545,*
2,20211215,CAF,202112/202201,091031,-.2,4,83.1,82.9,
3,20211215,CAF,202112/202201,091640,-.17,8,83.1,82.93,*
4,20211215,CAF,202112/202201,094459,-.1,4,82.9,82.8,
...,...,...,...,...,...,...,...,...,...
47143,20211215,ZFF,202112/202201,121507,-2,8,1671.8,1669.8,
47144,20211215,ZFF,202112/202201,130017,-1.8,4,1671,1669.2,
47145,20211215,ZFF,202112/202201,130017,-1.8,4,1671,1669.2,
47146,20211215,ZFF,202201/202202,131329,-1,4,1666,1665,*


In [11]:
spread_df.contract = spread_df.contract.str.strip()
spread_df = spread_df.loc[spread_df.contract.isin(target_contracts)]

In [12]:
spread_df = spread_df.drop('is_with_spread', axis=1)
spread_df

Unnamed: 0,date,contract,expire,time,price,volume,near_price,far_price
25464,20211215,MTX,202112/202112W4,093016,-22,8,17551,17529
25465,20211215,MTX,202112/202112W4,101207,-23,4,17626,17603
25466,20211215,MTX,202112/202112W4,101306,-23,4,17625,17602
25467,20211215,MTX,202112/202112W4,101306,-23,4,17625,17602
25468,20211215,MTX,202112/202112W4,101306,-23,4,17625,17602
...,...,...,...,...,...,...,...,...
46753,20211215,TX,202201/202209,094707,-597,4,17557,16960
46754,20211214,TX,202202/202203,220542,-56,4,17439,17383
46755,20211215,TX,202202/202203,100127,-60,4,17555,17495
46756,20211214,TX,202206/202209,212115,-396,4,17296,16900


In [13]:
total_df = pd.concat([dealt_df, spread_df])

In [14]:
total_df.expire.unique()

array(['202112     ', '202112/202201', '202112/202206', '202112/202209',
       '202112/202203', '202112W4     ', '202201     ', '202202     ',
       '202203     ', '202206     ', '202209     ', '202201/202202',
       '202201/202206', '202206/202209', '202112/202112W4',
       '202112/202202', '202112W4/202201', '202201/202203',
       '202201/202209', '202202/202203'], dtype=object)

In [15]:
total_df['expire'] = total_df['expire'].str.strip()
total_df['volume'] = pd.to_numeric(total_df['volume'])
total_df['datetime'] = pd.to_datetime(total_df.date.str.strip() + total_df.time.str.strip(), format='%Y%m%d%H%M%S')

In [16]:
switch_df = total_df[total_df['expire'].str.contains('/')]

switch_df_near = switch_df.copy()
switch_df_near.expire = switch_df_near.expire.str.extract(r'(\d+)/\d+')
switch_df_near.price = switch_df_near.near_price
switch_df_near.volume = switch_df_near.volume // 2

switch_df_far = switch_df.copy()
switch_df_far.expire = switch_df_far.expire.str.extract(r'\d+/(\d+)')
switch_df_far.price = switch_df_far.far_price
switch_df_far.volume = switch_df_far.volume // 2

total_df = total_df.drop(switch_df.index)
total_df = pd.concat([total_df, switch_df_near, switch_df_far])

In [17]:
total_df = total_df[['datetime', 'contract', 'expire', 'price', 'volume']]

total_df

Unnamed: 0,datetime,contract,expire,price,volume
98146,2021-12-14 15:00:00,MTX,202112,17590,110
98147,2021-12-14 15:00:00,MTX,202112,17590,2
98148,2021-12-14 15:00:00,MTX,202112,17590,2
98149,2021-12-14 15:00:00,MTX,202112,17589,2
98150,2021-12-14 15:00:00,MTX,202112,17588,2
...,...,...,...,...,...
46753,2021-12-15 09:47:07,TX,202209,16960,2
46754,2021-12-14 22:05:42,TX,202203,17383,2
46755,2021-12-15 10:01:27,TX,202203,17495,2
46756,2021-12-14 21:21:15,TX,202209,16900,2
