In [122]:
# definitions
import pandas as pd
import datetime 
import numpy as np

def convert_time(string_time):
    # given a time in string format, output a time in datetime.time format
    format = "%H:%M:%S"
    output = datetime.datetime.strptime(string_time, format)
    return output.time()

def convert_date(string_date):
    # given a date in string format, output a time in datetime.date format
    format = "%m/%d/%Y"
    output = datetime.datetime.strptime(string_date, format)
    return output.date()

def calculate_volatility(df):
    # given a df for a time period, calculate the VWAP for this time period
    volume_total = df["Volume"].sum()
    volume_times_price_total = (df["Volume"] * df["Price"]).sum()
    vwap = volume_times_price_total / volume_total
    last_valid_index = df["Price"].last_valid_index()
    final_price = df["Price"][last_valid_index]
    volatility = abs(final_price - vwap)
    return volatility

def gen_features_and_targets(df):
    grouped_by_date = df.groupby(df['Date'])
    features = []
    targets = []
    for date in grouped_by_date.groups.keys():
        cur_df = grouped_by_date.get_group(date)
        cur_grouped = cur_df.groupby(cur_df["Minute"])
        cur_features = []
        for minute in cur_grouped.groups.keys():
            cur_minute = cur_grouped.get_group(minute)
            volatility = calculate_volatility(cur_minute)
            if minute == 59:
                targets.append(volatility)
            else:
                cur_features.append(volatility)
        features.append(cur_features)
    return np.array(features), np.array(targets)

def main():
    df = pd.read_csv("2019.txt")
    df['Time'] = df['Time'].apply(lambda x: convert_time(x))
    df = df.loc[(df['Time'] >= datetime.time(14, 30, 0)) & (df["Time"] < datetime.time(15, 0, 0))]
    df['Date'] = df['Date'].apply(lambda x: convert_date(x))
    df['Minute'] = df['Time'].apply(lambda x: x.minute)
    features, targets = gen_features_and_targets(df)
    return features, targets

In [123]:
features, targets = main()

  return np.array(features), np.array(targets)


In [124]:
len(features)

249

In [125]:
len(targets)

221