In [42]:
# definitions for pre-processing the data
# global constants --------------------------------------------------------------------------------
# These values are the default arguments of load_data_main().

# path to the local raw data file
g_data_path = "IVE_tickbidask.txt"

# calendar years whose rows are kept
g_interesting_years = [2019, 2020]

# intraday window kept from each day; start is inclusive, end is exclusive
g_interesting_window_start = datetime.time(hour=14, minute=30, second=0)
g_interesting_window_end = datetime.time(hour=15, minute=0, second=0)

# when True, volatility is expressed as a percentage of the period's final price
g_normalize = True
# -------------------------------------------------------------------------------------------------

def convert_time(string_time):
    """Parse a time string in "%H:%M:%S" form and return it as a datetime.time.

    Raises ValueError (from strptime) when the string does not match the pattern.
    """
    parsed = datetime.datetime.strptime(string_time, "%H:%M:%S")
    return parsed.time()

def convert_date(string_date):
    """Parse a date string in "%m/%d/%Y" form and return it as a datetime.date.

    Raises ValueError (from strptime) when the string does not match the pattern.
    """
    parsed = datetime.datetime.strptime(string_date, "%m/%d/%Y")
    return parsed.date()

def calculate_volatility(df, normalize):
    """Return the distance between a period's final price and its VWAP.

    The volume-weighted average price (VWAP) of the period is
    sum(Volume * Price) / sum(Volume). The "volatility" returned here is the
    absolute difference between the last non-missing price in `df` and that VWAP.

    Parameters:
        df: DataFrame for one time period; must have "Volume" and "Price" columns.
        normalize: when truthy, the result is expressed as a percentage of the
            final price (abs(final - vwap) / final * 100).

    Returns:
        The (optionally normalized) volatility, or 0.0 when the period has no
        traded volume or no valid price.
    """
    total_volume = df["Volume"].sum()
    valid_prices = df["Price"].dropna()

    # Degenerate period: dividing by a zero total volume would silently produce
    # NaN, and a period without any valid price has no "final price" at all.
    # Report 0.0, the same value the feature builder uses for minutes with no ticks.
    if total_volume == 0 or valid_prices.empty:
        return 0.0

    vwap = (df["Volume"] * df["Price"]).sum() / total_volume
    # Positional lookup of the last non-missing price; does not depend on the
    # index labels of the slice that was passed in.
    final_price = valid_prices.iloc[-1]

    volatility = abs(final_price - vwap)
    if normalize:
        volatility = volatility / final_price * 100
    return volatility

def gen_features_and_targets(df, normalize):
    """Translate a tick DataFrame into per-day features and targets for the neural net.

    Rows are grouped by "Date" and, within each date, by "Minute". For every
    date one feature row is built from the volatility of minutes 30..58
    (29 values), and the volatility of minute 59 becomes that date's target.
    A minute with no rows contributes 0.

    Parameters:
        df: DataFrame with "Date" and "Minute" columns, plus whatever columns
            calculate_volatility reads.
        normalize: forwarded unchanged to calculate_volatility.

    Returns:
        (features, targets) as numpy arrays; features has one row per date,
        targets has one entry per date.
    """
    features = []
    targets = []
    # Iterating the groupby yields each date's sub-frame directly.
    for _, day_df in df.groupby(df['Date']):
        by_minute = day_df.groupby(day_df["Minute"])
        day_features = []
        for minute in range(30, 60):
            try:
                minute_df = by_minute.get_group(minute)
            except KeyError:
                # get_group raises KeyError when the minute has no rows; only that
                # case is treated as "no data". Any other error propagates.
                volatility = 0
            else:
                volatility = calculate_volatility(minute_df, normalize)
            # Minute 59 is the prediction target, every earlier minute is a feature.
            if minute == 59:
                targets.append(volatility)
            else:
                day_features.append(volatility)
        features.append(day_features)
    return np.array(features), np.array(targets)

def load_data_main(data_path=g_data_path, 
                   interesting_years=g_interesting_years, 
                   interesting_window_start=g_interesting_window_start,
                   interesting_window_end=g_interesting_window_end, 
                   normalize=g_normalize):
    """Load the raw data file and turn it into features and targets.

    Parameters:
        data_path: path to the local raw data (e.g. "IVE_tickbidask.txt"); read
            with pd.read_csv and expected to provide "Date" and "Time" columns
            in the formats understood by convert_date / convert_time.
        interesting_years: a list that contains all the years that we are interested in.
        interesting_window_start: start of the intraday window to keep, as a
            datetime.time (inclusive).
        interesting_window_end: end of the intraday window to keep, as a
            datetime.time (exclusive).
        normalize: determines if we should normalize the volatility.

    Returns:
        The (features, targets) pair produced by gen_features_and_targets.

    Note: gen_features_and_targets only looks at minutes 30..59 of the rows
    that survive the window filter.
    """
    # print the status
    print("generating features and targets...")
    print("data_path = " + data_path)
    print("interesting_years = " + str(interesting_years))
    print("interesting_window_start = " + str(interesting_window_start))
    print("interesting_window_end = " + str(interesting_window_end))
    print("normalize = " + str(normalize))

    # load the raw dataset
    df = pd.read_csv(data_path)

    # find interesting years
    df['Date'] = df['Date'].apply(convert_date)  # reformat date
    df['Year'] = df['Date'].apply(lambda d: d.year)  # isolate the year attribute
    # .copy() makes the filtered frame independent, so adding columns to it below
    # does not trigger pandas' SettingWithCopyWarning.
    df = df[df['Year'].isin(interesting_years)].copy()

    # find interesting times (done after the year filter so fewer rows are parsed)
    df['Time'] = df['Time'].apply(convert_time)  # reformat time
    in_window = (df['Time'] >= interesting_window_start) & (df["Time"] < interesting_window_end)
    df = df.loc[in_window].copy()  # keep data from interesting time windows only
    df['Minute'] = df['Time'].apply(lambda t: t.minute)

    # generate features and targets
    features, targets = gen_features_and_targets(df, normalize)
    return features, targets

In [46]:
import torch
from torch.utils.data import TensorDataset
# Build the feature/target arrays and wrap them in a TensorDataset.
g_features, g_targets = load_data_main()
# Convert the arrays returned above. The bare names `features` / `targets` are
# never defined by this cell, so using them fails on a fresh kernel.
g_tensor_features = torch.Tensor(g_features)
g_tensor_targets = torch.Tensor(g_targets)
g_tensor_dataset = TensorDataset(g_tensor_features, g_tensor_targets)

generating features and targets...
data_path = IVE_tickbidask.txt
interesting_years = [2019, 2020]
interesting_window_start = 14:30:00
interesting_window_end = 15:00:00
normalize = True
