In [202]:
from basic_utils import *
from datetime import date
import pandas as pd
import numpy as np

In [203]:
FIG_SIZE = (20,20)

### Transformation functions

In [220]:
# format / clean up functions
def cols_to_date(df, cols):
    for c in cols: 
        if c in df.columns: df[c] = df[c].apply(date_lambda)
def cols_to_bool(df, cols):
    for c in cols: df[c] = df[c].apply(lambda x: pd.to_numeric(x))
def divide_by(df, cols, tgt):
    cols.extend([tgt])
    res = (df[cols].T / df[cols][tgt]).T
    cols.remove(tgt)
    return res[cols]

# lambdas
# date_lambda = lambda x: date.fromtimestamp(x) if x > 0 else 0
date_lambda = lambda x: date.fromtimestamp(x)
pd_datetime = lambda x: pd.to_datetime(x)
datetime_lambda = lambda x: datetime.fromtimestamp(x) if x > 0 else 0
time_delta_to_years = lambda x: x / 365
divide_by_mean = lambda x: x / x.median() #consider z-score
cap_at_1q = lambda x: [max(y, 0) for y in x]
z_score = lambda x: (x - x.mean()) / x.std()

In [205]:
col_dates = ['earningsTimestamp', 'earningsTimestampEnd',
             'earningsTimestampStart', 'postMarketTime', 'regularMarketTime',
             'dividendDate','expiration','lastTradeDate','postMarketTime',
             'storeDate', 'lastTradeDate']

# transforms and utilities
def merge_ds(q_df, o_df):    
    # merge options and quotes
    merged_df = q_df.merge(o_df, how='inner', 
                           left_on=['symbol', 'regularMarketTime'], 
                           right_on=['underlyingSymbol','lastTradeDate'])
    return merged_df

divide_cols = ['lastPrice', 'strike', 'epsForward', 'epsTrailingTwelveMonths',
    'bookValue', 'regularMarketDayHigh',
    'regularMarketDayLow', 'regularMarketOpen', 'regularMarketPreviousClose']
divided_by = 'regularMarketPrice'

scale_cols = ['regularMarketVolume','averageDailyVolume10Day',
              'averageDailyVolume3Month','marketCap']

def transform_ds(df):
    # consider passing common fields as a map (divide by, divide col, etc)
    mod_df = df.copy()
    
    # transformation: divide by closing price (unit values to underlying price)
    mod_df[divide_cols] = divide_by(mod_df, divide_cols, divided_by).round(4)
    mod_df.regularMarketVolume = divide_by(mod_df, 
        ['regularMarketVolume'], 'averageDailyVolume10Day').round(4)

    # transformation: volume (liquidity) and size relative to the universe mean
    scaler_pipe = make_pipeline(StandardScaler())
    mod_df[scale_cols] = scaler_pipe.fit_transform(mod_df[scale_cols])
    
    # transformation: turn dates > days > years    
    mod_df.storeDate = pd.to_datetime(mod_df.storeDate)
    mod_df.expiration = pd.to_datetime(mod_df.expiration)
    delta = (mod_df.expiration - mod_df.storeDate).dt.days
    mod_df.expiration = delta.apply(time_delta_to_years).round(4)
    return mod_df

def filter_ds(df):
    # filter data by expiration, strike, liquidity and active contracts
    oi_mask = df.openInterest > 20
    vol_mask = df.volume > 10
    itm_mask = (df.inTheMoney == False) & (
        ((df.strike > 0.6) & (df.strike < 0.9)) |
        ((df.strike > 1.1) & (df.strike < 1.4))
    )
    exp_mask = (df.expiration > 90/365)
    return df[ oi_mask & vol_mask & itm_mask & exp_mask]

def prepare_for_training(df, cols_to_keep, y_col):    
    num_cols = df[cols_to_keep].select_dtypes(include=['float64'])
    order_cols = [x for x in num_cols if x != y_col]
    order_cols.extend([y_col])
    return df[order_cols]

def visualize_ds(df, bins=20, figsize=FIG_SIZE):
    df.hist(figsize=figsize, bins=bins);    
    
def shuffle_train_test(df):
    df_idx = df.index.values
    df_idx = np.random.randint(0, len(df_idx), size=len(df_idx))
    shuffled_df = df.iloc[df_idx]
    train_df, test_df = train_test_split(shuffled_df, random_state=1)
    return train_df, test_df

### Data load

In [206]:
dates = read_dates('option')
predict_days = -1
predict_dates, train_dates = dates[predict_days:], dates[:predict_days]

In [207]:
%time train_quotes = load_consol_quotes(train_dates)

Loading quotes for 2018-09-07
Loading quotes for 2018-09-10
Loading quotes for 2018-09-11
Loading quotes for 2018-09-12
Loading quotes for 2018-09-13
Loading quotes for 2018-09-14
Loading quotes for 2018-09-17
Loading quotes for 2018-09-18
Loading quotes for 2018-09-19
Loading quotes for 2018-09-20
CPU times: user 400 ms, sys: 321 ms, total: 721 ms
Wall time: 2.62 s


In [208]:
%time train_options = load_consol_options(train_dates)

Loading options for 2018-09-07
Loading options for 2018-09-10
Loading options for 2018-09-11
Loading options for 2018-09-12
Loading options for 2018-09-13
Loading options for 2018-09-14
Loading options for 2018-09-17
Loading options for 2018-09-18
Loading options for 2018-09-19
Loading options for 2018-09-20
CPU times: user 4.75 s, sys: 1.97 s, total: 6.71 s
Wall time: 29.5 s


In [252]:
all_options = train_options.copy()
all_quotes = train_quotes.copy()

### Visualize

In [129]:
all_options.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 620614 entries, 0 to 39535
Data columns (total 18 columns):
ask                  620614 non-null float64
bid                  620614 non-null float64
change               620614 non-null float64
contractSize         620614 non-null object
contractSymbol       620614 non-null object
currency             620540 non-null object
expiration           620614 non-null float64
impliedVolatility    620614 non-null float64
inTheMoney           620614 non-null bool
lastPrice            620614 non-null float64
lastTradeDate        620614 non-null float64
openInterest         620614 non-null float64
percentChange        620614 non-null float64
strike               620614 non-null float64
volume               620614 non-null float64
underlyingSymbol     620614 non-null object
storeDate            620614 non-null float64
type                 620614 non-null object
dtypes: bool(1), float64(12), object(5)
memory usage: 85.8+ MB


In [130]:
idx = random.randint(0, all_options.shape[1]-1)
all_options.iloc[idx][:]

ask                              5.75
bid                              5.35
change                              0
contractSize                  REGULAR
contractSymbol       A190215C00065000
currency                          USD
expiration                1.55019e+09
impliedVolatility            0.255379
inTheMoney                       True
lastPrice                        5.36
lastTradeDate             1.53617e+09
openInterest                      106
percentChange                       0
strike                             65
volume                              4
underlyingSymbol                    A
storeDate                 1.53628e+09
type                             call
Name: 6, dtype: object

In [131]:
date_cols = ['earningsTimestamp', 'earningsTimestampEnd',
             'earningsTimestampStart', 'postMarketTime', 'regularMarketTime',
             'dividendDate','expiration','lastTradeDate','postMarketTime','storeDate']

In [132]:
all_quotes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2235 entries, 0 to 324
Data columns (total 69 columns):
ask                                  2228 non-null float64
askSize                              2228 non-null float64
averageDailyVolume10Day              2232 non-null float64
averageDailyVolume3Month             2232 non-null float64
bid                                  2228 non-null float64
bidSize                              2228 non-null float64
bookValue                            2226 non-null float64
currency                             2230 non-null object
earningsTimestamp                    2136 non-null float64
earningsTimestampEnd                 2052 non-null float64
earningsTimestampStart               2052 non-null float64
epsForward                           2198 non-null float64
epsTrailingTwelveMonths              2226 non-null float64
exchange                             2235 non-null object
exchangeDataDelayedBy                2235 non-null int64
exchangeTimez

In [133]:
idx = random.randint(0, all_quotes.shape[1]-1)
# all_quotes.iloc[idx][:20]
# all_quotes.iloc[idx][20:40]
all_quotes.iloc[idx][40:]

postMarketTime                              1.53635e+09
priceHint                                             2
priceToBook                                     7.50741
quoteSourceName                           Delayed Quote
quoteType                                        EQUITY
region                                               US
regularMarketChange                             0.76001
regularMarketChangePercent                     0.396396
regularMarketDayHigh                             193.82
regularMarketDayLow                             191.175
regularMarketDayRange                  191.175 - 193.82
regularMarketOpen                                191.36
regularMarketPreviousClose                       191.73
regularMarketPrice                               192.49
regularMarketTime                           1.53635e+09
regularMarketVolume                              430680
sharesOutstanding                            7.7067e+07
shortName                            Waters Corp

### Transformations

In [223]:
def zerona_turn_2dates(df, cols):
    df.fillna(0, inplace=True)
    cols_to_date(df, cols)

In [224]:
zerona_turn_2dates(all_options, ['expiration', 'lastTradeDate', 'storeDate'])
zerona_turn_2dates(all_quotes, col_dates)
    
# all_quotes.fillna(0, inplace=True)
# all_options.fillna(0, inplace=True)
# cols_to_date(all_options, col_dates)
# cols_to_date(all_quotes, col_dates)

TypeError: an integer is required (got type datetime.date)

In [182]:
merged_df = merge_ds(all_quotes, all_options)

ValueError: You are trying to merge on float64 and object columns. If you wish to proceed you should use pd.concat

In [96]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343031 entries, 0 to 343030
Data columns (total 87 columns):
ask_x                                343031 non-null float64
askSize                              343031 non-null float64
averageDailyVolume10Day              343031 non-null float64
averageDailyVolume3Month             343031 non-null float64
bid_x                                343031 non-null float64
bidSize                              343031 non-null float64
bookValue                            343031 non-null float64
currency_x                           343031 non-null object
dividendDate                         343031 non-null object
earningsTimestamp                    343031 non-null object
earningsTimestampEnd                 343031 non-null object
earningsTimestampStart               343031 non-null object
epsForward                           343031 non-null float64
epsTrailingTwelveMonths              343031 non-null float64
exchange                             343