In [1]:
import datetime
import pandas as pd

In [2]:
url_template = 'https://www.taifex.com.tw/file/taifex/Dailydownload/OptionsDailydownloadCSV/{}'
filename_template = 'OptionsDaily_{}.zip'

date = datetime.date(2021, 12, 15)
s3_bucket = 'indextracker'
filename = filename_template.format(date.strftime('%Y_%m_%d'))

s3_key = f'tw/options/raw/{filename}'
url = url_template.format(filename)

url

'https://www.taifex.com.tw/file/taifex/Dailydownload/OptionsDailydownloadCSV/OptionsDaily_2021_12_15.zip'

In [3]:
def get_zipped_objects_from_s3(bucket, key):
    import boto3, zipfile, io
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=key)
    with io.BytesIO(obj['Body'].read()) as f:
        with zipfile.ZipFile(f) as zf:
            for file in zf.namelist():
                yield zf.read(file)

In [4]:
content = list(get_zipped_objects_from_s3(s3_bucket, s3_key))[0]
arr = content.decode('big5').split('\r\n')

arr[0]

' 成交日期,          商品代號,        履約價格,                                                      到期月份(週別),        買賣權別,      成交時間,          成交價格,         成交數量(B or S),     開盤集合競價 '

In [5]:
arr_2d = [x.split(',') for x in arr[1:]]

df = pd.DataFrame(arr_2d)
df.columns = ['date', 'contract', 'strike_price', 'expiration', 'type', 'time', 'price', 'volume', 'is_open_auction']
df

Unnamed: 0,date,contract,strike_price,expiration,type,time,price,volume,is_open_auction
0,---------- ---- ------- ---- -----------------...,,,,,,,,
1,20211215,CCO,55 ...,202112,P,095303,.01,1,
2,20211215,CCO,55 ...,202112,P,095303,.01,1,
3,20211215,CCO,60 ...,202201,C,125829,3.69,1,
4,20211215,CCO,60 ...,202201,C,125829,3.69,1,
...,...,...,...,...,...,...,...,...,...
626684,20211215,TXO,21800 ...,202203,C,134125,4,1,
626685,20211215,TXO,21800 ...,202203,C,134125,4,1,
626686,20211215,TXO,21800 ...,202203,C,134201,3.4,3,
626687,20211215,TXO,21800 ...,202203,C,134201,3.4,3,


In [6]:
df = df.dropna()
df.contract = df.contract.str.strip()
df.contract.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.contract = df.contract.str.strip()


array(['CCO', 'CDO', 'CEO', 'CHO', 'CKO', 'CLO', 'CMO', 'DCO', 'DEO',
       'DHO', 'DLO', 'DQO', 'DVO', 'NYO', 'NZO', 'OAO', 'OBO', 'OCO',
       'OJO', 'OKO', 'OOO', 'TEO', 'TFO', 'TGO', 'TXO'], dtype=object)

In [7]:
df = df[df.contract == 'TXO']

df

Unnamed: 0,date,contract,strike_price,expiration,type,time,price,volume,is_open_auction
344,20211214,TXO,13000 ...,202203,P,213500,40,1,
345,20211214,TXO,13000 ...,202203,P,213500,40,1,
346,20211214,TXO,13000 ...,202203,P,225319,37,1,
347,20211214,TXO,13000 ...,202203,P,225319,37,1,
348,20211215,TXO,13000 ...,202203,P,090439,37.5,1,
...,...,...,...,...,...,...,...,...,...
626683,20211215,TXO,21800 ...,202203,C,134118,4,2,
626684,20211215,TXO,21800 ...,202203,C,134125,4,1,
626685,20211215,TXO,21800 ...,202203,C,134125,4,1,
626686,20211215,TXO,21800 ...,202203,C,134201,3.4,3,


In [8]:
df.dtypes

date               object
contract           object
strike_price       object
expiration         object
type               object
time               object
price              object
volume             object
is_open_auction    object
dtype: object

In [9]:
df['datetime'] = pd.to_datetime(df.date.str.strip() + df.time.str.strip(), format='%Y%m%d%H%M%S')

In [10]:
df = df[['datetime', 'contract', 'expiration', 'strike_price', 'type', 'price', 'volume']]
df

Unnamed: 0,datetime,contract,strike_price,expiration,type,price,volume
344,2021-12-14 21:35:00,TXO,13000 ...,202203,P,40,1
345,2021-12-14 21:35:00,TXO,13000 ...,202203,P,40,1
346,2021-12-14 22:53:19,TXO,13000 ...,202203,P,37,1
347,2021-12-14 22:53:19,TXO,13000 ...,202203,P,37,1
348,2021-12-15 09:04:39,TXO,13000 ...,202203,P,37.5,1
...,...,...,...,...,...,...,...
626683,2021-12-15 13:41:18,TXO,21800 ...,202203,C,4,2
626684,2021-12-15 13:41:25,TXO,21800 ...,202203,C,4,1
626685,2021-12-15 13:41:25,TXO,21800 ...,202203,C,4,1
626686,2021-12-15 13:42:01,TXO,21800 ...,202203,C,3.4,3
