# Data Preparation

In [None]:
import os
import pandas as pd
from datetime import datetime, time, timedelta

In [None]:
# Location of the concatenated AUDUSD tick export.
path = '/Users/vivekkumar/Documents/MSC_dissertation_project/Drive1/Concatinated/AUDUSD_concatenated.csv.download/AUDUSD_concatenated.csv'

# Load the ticks and derive the working frame in a single chain:
#   1. parse the Timestamp column into datetimes,
#   2. add the mid price (mean of Bid and Ask),
#   3. index the frame by Timestamp,
#   4. remove the columns that are no longer needed.
tick_df = (
    pd.read_csv(path)
    .assign(
        Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
        Midprice=lambda frame: (frame['Bid'] + frame['Ask']) / 2,
    )
    .set_index('Timestamp')
    .drop(columns=['Pair', 'Bid', 'Ask'])
)

## Utils

In [None]:
def pct_change(old_value, new_value):
    """Return the relative change from ``old_value`` to ``new_value``.

    The result is a fraction, not a percentage: going from 2 to 3 gives 0.5.
    A zero ``old_value`` is not special-cased, so the division fails (or
    yields inf/nan for NumPy floats) exactly as plain division would.
    """
    return (new_value - old_value) / old_value

class Trend:
    """One completed trend: its direction, key prices, their positions and the DCC timestamp.

    The raw constructor arguments are kept as attributes. ``data_dict`` holds
    the same information under display-friendly keys, with every numeric
    field rounded to 6 decimal places.
    """

    def __init__(self, direction, DC_start, DCC, OS_end, DC_start_index, DCC_index, OS_end_index, timestamp):
        # Unrounded values, exactly as supplied by the caller.
        self.direction = direction
        self.DC_start = DC_start
        self.DCC = DCC
        self.OS_end = OS_end
        self.DC_start_index = DC_start_index
        self.DCC_index = DCC_index
        self.OS_end_index = OS_end_index
        self.timestamp = timestamp

        # Fields that get rounded, in the order they appear in the record.
        numeric_fields = (
            ('Start', DC_start),
            ('DCC', DCC),
            ('End', OS_end),
            ('Start Index', DC_start_index),
            ('DCC Index', DCC_index),
            ('End Index', OS_end_index),
        )

        record = {'Direction': direction}
        for label, value in numeric_fields:
            record[label] = round(value, 6)
        record['DCC Timestamp'] = timestamp
        self.data_dict = record

    def __str__(self):
        """Render the trend as the string form of ``data_dict``."""
        return str(self.data_dict)

## Sampling

In [None]:
def sample_windows(tick_df, shift=1, window_size=4):
    """Cut ``tick_df`` into sliding windows, each split into train/validation/test.

    Week boundaries start at midnight of the first tick's date and advance
    ``shift`` weeks at a time until they pass the end of the last tick's date.
    Each window spans ``window_size`` consecutive boundaries and is split as:

    * ``train``:      boundaries 0 -> 2
    * ``validation``: boundaries 2 -> 3
    * ``test``:       boundaries 3 -> 4

    Every range is half-open (start inclusive, end exclusive), so a tick that
    falls exactly on a boundary belongs to one split only and nothing leaks
    from train into validation or from validation into test.

    Parameters
    ----------
    tick_df : pandas.DataFrame
        Tick data. Assumed to carry a timezone-naive ``DatetimeIndex`` in
        ascending order, since the first and last index entries are taken as
        the time range -- TODO confirm against the loader.
    shift : int or float, default 1
        Distance between consecutive boundaries, in weeks. Must be positive.
    window_size : int, default 4
        Number of boundary steps per window. Must be at least 4 because the
        split reads boundaries 0 through 4. Boundaries past the fifth are not
        used by any split; a larger value only reduces the window count.

    Returns
    -------
    list of dict
        One ``{'train': ..., 'validation': ..., 'test': ...}`` dict of
        DataFrames per window, in chronological order. Empty when ``tick_df``
        is empty or spans too few weeks for a single window.

    Raises
    ------
    ValueError
        If ``shift`` is not positive or ``window_size`` is below 4.
    """
    if shift <= 0:
        # A non-positive step would never reach the end date.
        raise ValueError(f'shift must be positive, got {shift!r}')
    if window_size < 4:
        raise ValueError(f'window_size must be at least 4, got {window_size!r}')
    if len(tick_df) == 0:
        return []

    index = tick_df.index

    # get first and last timestamp of all ticks, widened to whole days
    first_date = datetime.combine(index[0].date(), time.min)
    last_date = datetime.combine(index[-1].date(), time.max)

    # build boundaries until one lands at or beyond the end of the data
    timestamps = [first_date]
    while timestamps[-1] < last_date:
        timestamps.append(timestamps[-1] + timedelta(weeks=shift))

    def between(start, end):
        # half-open [start, end): the boundary tick goes to the later split only
        return tick_df.loc[(index >= start) & (index < end)]

    # one window per run of window_size + 1 consecutive boundaries
    sampled_dfs = []
    for first in range(len(timestamps) - window_size):
        week_dates = timestamps[first: first + window_size + 1]
        sampled_dfs.append({
            'train': between(week_dates[0], week_dates[2]),
            'validation': between(week_dates[2], week_dates[3]),
            'test': between(week_dates[3], week_dates[4]),
        })

    return sampled_dfs

In [None]:
def profile_data(df, theta):
    """Walk the mid prices of ``df`` and label each tick with its trend state.

    A trend reverses when the price moves against it by more than ``theta``
    (as a fraction, see ``pct_change``) measured from the trend's most extreme
    price so far. The first trend is assumed to be an upturn starting at the
    first price.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Midprice'`` column; its index supplies the timestamp
        recorded for each confirmation point.
    theta : float
        Reversal threshold as a fraction of price (e.g. 0.00015).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * Trend frame: one row per completed trend, built from
          ``Trend.data_dict``. The trend still in progress when the data ends
          is not included. It has no columns when no trend completed.
        * Live frame: one row per tick with columns ``'Direction'`` (1 or -1),
          ``'DCC'`` (price at the latest confirmation point) and ``'Price'``
          (the tick's mid price). Its index is positional, not the timestamps.

        Both frames are empty when ``df`` is empty.
    """
    live_columns = ['Direction', 'DCC', 'Price']

    # An empty frame has no starting price; return empty results instead of failing on the lookup.
    if len(df) == 0:
        return pd.DataFrame([]), pd.DataFrame([], columns=live_columns)

    # direction: -1 is downturn, 1 is upturn
    # .iloc keeps this positional; a bare [0] on a datetime-indexed Series relies on a
    # deprecated integer-position fallback.
    starting_price = df['Midprice'].iloc[0]
    starting_timestamp = df.index[0]
    # direction, DC_start, DCC, OS_end, DC_start_index, DCC_index, OS_end_index, DCC timestamp
    trend_buffer = [1, starting_price, starting_price, starting_price, 0, 0, 0, starting_timestamp]
    trends = []

    # direction, recent DCC, current price
    price_buffer = [1, starting_price, starting_price]
    live_states = []

    # iterate over midprices
    for index, midprice in enumerate(df['Midprice'].values):

        # for upturn
        if trend_buffer[0] == 1:
            # threshold broken: price fell more than theta below the running high
            if pct_change(trend_buffer[3], midprice) < -theta:
                # log old event
                trends.append(Trend(*trend_buffer))
                # setup new event: it starts at the old extreme and is confirmed at this tick
                trend_buffer = [-1, trend_buffer[3], midprice, midprice, trend_buffer[6], index, index, df.index[index]]
                price_buffer = [-1, midprice, midprice]
            # new extreme
            elif midprice > trend_buffer[3]:
                trend_buffer[3], trend_buffer[6] = midprice, index

        # for downturn
        elif trend_buffer[0] == -1:
            # threshold broken: price rose more than theta above the running low
            if pct_change(trend_buffer[3], midprice) > theta:
                # log old event
                trends.append(Trend(*trend_buffer))
                # setup new event: it starts at the old extreme and is confirmed at this tick
                trend_buffer = [1, trend_buffer[3], midprice, midprice, trend_buffer[6], index, index, df.index[index]]
                price_buffer = [1, midprice, midprice]
            # new extreme
            elif midprice < trend_buffer[3]:
                trend_buffer[3], trend_buffer[6] = midprice, index

        # record the state as of this tick; copy because the buffer is mutated on later ticks
        price_buffer[2] = midprice
        live_states.append(price_buffer.copy())

    return pd.DataFrame([trend.data_dict for trend in trends]), pd.DataFrame(live_states, columns=live_columns)

In [None]:
def format_data(data_dict):
    """Join the frames in ``data_dict`` side by side with flat, key-suffixed column names.

    ``data_dict`` maps a key (here a threshold) to a DataFrame. The result
    holds every column of every frame, renamed to ``<column>_<key>``,
    e.g. ``'Price_0.00015'``.
    """
    combined = pd.concat(data_dict, axis=1)

    # concat on a dict labels each column (key, column); pair every key with its flattened name
    labelled = [
        (str(label[0]), label[1] + '_' + str(label[0]))
        for label in combined.columns
    ]

    # keep only the flattened names by discarding the key level
    combined.columns = pd.MultiIndex.from_tuples(labelled).droplevel(0)
    return combined

In [None]:
def generate_data(pair, df, thresholds, max_windows=15,
                  output_root='/Users/vivekkumar/Documents/MSC_dissertation_project/DataTarget'):
    """Profile each train/validation/test split of each sampled window and save it to parquet.

    For every window returned by ``sample_windows`` (up to ``max_windows``)
    and every split inside it, the split's own ticks are profiled once per
    threshold with ``profile_data``. The per-tick frames are merged with
    ``format_data`` and written to
    ``<output_root>/<pair>/Window_<n>/<split>.parquet.gzip``.

    Parameters
    ----------
    pair : str
        Currency pair label; used as a folder name under ``output_root``.
    df : pandas.DataFrame
        Full tick frame, passed to ``sample_windows``.
    thresholds : iterable of float
        Theta values; each yields one group of columns in the output.
    max_windows : int or None, default 15
        Process only the first ``max_windows`` windows; ``None`` processes all.
    output_root : str
        Directory under which the per-pair folders are created.

    Returns
    -------
    dict
        ``{'Window_<n>': {'train': df, 'validation': df, 'test': df}}`` holding
        the same frames that were written to disk.
    """
    # get windows
    sampled_dfs = sample_windows(df)[:max_windows]

    windows = {}
    for idx, window in enumerate(sampled_dfs):
        print()
        print(idx)

        folder_path = os.path.join(output_root, pair, f'Window_{idx}')
        os.makedirs(folder_path, exist_ok=True)

        set_dict = {}
        for set_name, data_set in window.items():
            print(set_name)

            # Profile this split's own ticks (data_set), not the whole tick frame.
            # Otherwise every split of every window would hold identical full-history data.
            data_dict = {}
            for theta in thresholds:
                _, live_df = profile_data(data_set, theta)
                data_dict[theta] = live_df

            formatted_df = format_data(data_dict)
            set_dict[set_name] = formatted_df

            # NOTE(review): the suffix says gzip, but no compression argument is passed,
            # so to_parquet uses its default codec. Confirm which one is intended.
            file_path = os.path.join(folder_path, f'{set_name}.parquet.gzip')
            formatted_df.to_parquet(file_path)

        windows[f'Window_{idx}'] = set_dict

    return windows

In [None]:
# Currency pair label; generate_data uses it as a folder name in the output path.
pair = 'AUDUSD'
# Thresholds (theta) as fractional price changes; one set of output columns is produced per value.
thresholds = [0.00015, 0.00020, 0.00025]

# Build the windowed data sets, write them to parquet and keep the frames in memory.
sampled_data = generate_data(pair, tick_df, thresholds)


0
train
validation
test

1
train
validation
test

2
train
validation
test

3
train
validation
test

4
train
validation
test

5
train
validation
test

6
train
validation
test

7
train
validation
test

8
train
validation
test


: 

: 

## Check DataFrame

In [None]:
# Read back the test split written for window 0 of AUDUSD to inspect what was saved.
x_df = pd.read_parquet('/Users/vivekkumar/Documents/MSC_dissertation_project/DataTarget/AUDUSD/Window_0/test.parquet.gzip')

In [None]:
# Show the loaded frame as the cell output: one Direction/DCC/Price column group per threshold.
x_df

Unnamed: 0,Direction_0.00015,DCC_0.00015,Price_0.00015,Direction_0.0002,DCC_0.0002,Price_0.0002,Direction_0.00025,DCC_0.00025,Price_0.00025
0,1,0.762090,0.762090,1,0.76209,0.762090,1,0.762090,0.762090
1,1,0.762090,0.762080,1,0.76209,0.762080,1,0.762090,0.762080
2,1,0.762090,0.762075,1,0.76209,0.762075,1,0.762090,0.762075
3,1,0.762090,0.762065,1,0.76209,0.762065,1,0.762090,0.762065
4,1,0.762090,0.762070,1,0.76209,0.762070,1,0.762090,0.762070
...,...,...,...,...,...,...,...,...,...
13039361,-1,0.749755,0.749705,-1,0.74978,0.749705,-1,0.749745,0.749705
13039362,-1,0.749755,0.749715,-1,0.74978,0.749715,-1,0.749745,0.749715
13039363,-1,0.749755,0.749720,-1,0.74978,0.749720,-1,0.749745,0.749720
13039364,-1,0.749755,0.749730,-1,0.74978,0.749730,-1,0.749745,0.749730
