# Part 1 - Data Analysis
## Chapter 4 - Sample Weights

In [8]:
import gc

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from utils.plotting import plot

from utils import general
from utils.bars_generator import create_dollar_bars



### 4.1
In Chapter 3, we denoted as t1 a pandas series of timestamps where the first
barrier was touched, and the index was the timestamp of the observation. This
was the output of the getEvents function.

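#### 4.1 (a) Compute a t1 series on dollar bars derived from E-mini S&P 500 futures tick data.

*Note: this notebook builds the dollar bars from BTCUSDT trade data instead of E-mini S&P 500 futures ticks.*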

In [9]:
# Load the raw BTCUSDT trades matching the parquet path pattern and aggregate
# them into dollar bars.
btc_transactions = general.load_df("data/BTCUSDT-trades-*.parquet")
# 1e8 is presumably the dollar value traded per bar -- TODO confirm against create_dollar_bars.
btc_dollar = create_dollar_bars(btc_transactions, 1e8)

# The raw trades frame is not referenced again in this cell, so drop the name
# and collect right away instead of keeping it bound through the remaining
# computations.
del btc_transactions
gc.collect()

daily_vol = general.get_daily_vol(btc_dollar)

# The bar index is passed from its second element on (index[1:]) alongside the
# returns -- presumably because the first bar has no return; verify against
# general.returns.
btc_dollar_bar_returns = general.returns(btc_dollar['close'])
t0 = general.get_t_events_dynamic_h(
    btc_dollar.index[1:].values,
    btc_dollar_bar_returns.values,
    daily_vol.index.values,
    daily_vol.values,
    0.1  # NOTE(review): meaning of this constant is defined by get_t_events_dynamic_h -- document it there
)

print()
print(f'Number of events {len(t0)}.')

Memory usage of dataframe is 2714.63 MB
Memory usage after optimization is: 2714.63 MB
Decreased by 0.0%
Memory usage of dataframe is 0.19 MB
Memory usage after optimization is: 0.19 MB
Decreased by 0.0%

Number of events 72.


#### 4.1 (b) Apply the function mpNumCoEvents to compute the number of overlapping outcomes at each point in time.

In [10]:
def mp_num_co_events(closeIdx, t1, molecule):
    """
    Compute the number of concurrent events per bar.

    Parameters
    ----------
    closeIdx : pandas.Index
        Index of the bars. It is queried with ``searchsorted``, so it must be
        sorted in ascending order.
    t1 : pandas.Series
        Event end labels, indexed by event start labels. Missing end values
        are treated as lasting until the final bar (``closeIdx[-1]``).
    molecule : sequence of event start labels
        molecule[0] is the date of the first event on which the weight will be
        computed; molecule[-1] is the date of the last one. Every label must
        be present in the index of ``t1``.

    Returns
    -------
    pandas.Series
        Number of events covering each bar, restricted to the bars between
        ``molecule[0]`` and ``t1[molecule].max()`` (both inclusive).

    Notes
    -----
    Any event that starts before ``t1[molecule].max()`` impacts the count.
    """
    # 1) find events that span the period [molecule[0], molecule[-1]]
    t1 = t1.fillna(closeIdx[-1])  # unclosed events still must impact other weights
    t1 = t1[t1 >= molecule[0]]  # events that end at or after molecule[0]
    last_end = t1[molecule].max()  # latest end among the events in this molecule
    t1 = t1.loc[:last_end]  # events that start at or before last_end
    # 2) count events spanning a bar
    iloc = closeIdx.searchsorted(np.array([t1.index[0], t1.max()]))
    count = pd.Series(0, index=closeIdx[iloc[0]:iloc[1] + 1])
    # Series.iteritems() was removed in pandas 2.0; items() is the supported spelling.
    for tIn, tOut in t1.items():
        count.loc[tIn:tOut] += 1  # label slicing with .loc includes both end points
    return count.loc[molecule[0]:last_end]

# NOTE(review): this call raises the IndexError recorded below. The next cell's
# output shows `t1` to be a plain NumPy array, so `t1['t1']` is not a valid index
# and `t1.index` does not exist either. mp_num_co_events needs a pandas Series of
# event end times indexed by event start times. `t1` is also not assigned in any
# cell visible above (the events cell produces `t0`), so it comes from kernel
# state -- TODO build the t1 Series explicitly before this call.
mp_num_co_events(closeIdx=btc_dollar.index, t1=t1['t1'], molecule=t1.index)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [11]:
# Debug inspection of `t1` after the failed call above: the output shows a NumPy
# array of integer timestamps rather than a pandas object with a 't1' column.
t1

array([1714609990028, 1714610710719, 1714612143147, 1714612966069,
       1714613687032, 1714614430768, 1714625439626, 1714629471160,
       1714802822477, 1715052713653, 1715076854630, 1715092893200,
       1715128310280, 1715128545243, 1715149236129, 1715158139905,
       1715158861511, 1715196264472, 1715199584480, 1715199727605,
       1715201145145, 1715234998906, 1715237708834, 1715241732693,
       1715242387905, 1715243293674, 1715246364098, 1715248083059,
       1715250026093, 1715252255238, 1715252440972, 1715257885679,
       1715258204803, 1715262543840, 1715264185112, 1715268393405,
       1715646711497, 1715647622497, 1715649470842, 1715657628292,
       1715666993371, 1715668657025, 1715862641200, 1716017000686,
       1716135815797, 1716145453102, 1716146139029, 1716188234362,
       1716199850662, 1716200910419, 1716212356559, 1716213179574,
       1716296621352, 1716468977815, 1716468992733, 1716500134624,
       1716532002216, 1716534875411, 1716560664029, 1716821622