In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [2]:
import os  # NOTE(review): `os` is not referenced in any visible cell -- confirm it is still needed

# Run the setup script in the notebook namespace.
# Presumably this is what defines setup_project_path() used below -- TODO confirm.
%run ../config/setup_project.py

# Call the function to set up the project path.
# Per the printed output, it appends the project root to sys.path, which is
# what makes the project-local imports below resolvable.
setup_project_path()

# Project-local modules; these must come after setup_project_path().
from src.utils import helper as h_
from ch_02 import code_ch_02 as ch2


Project root added to sys.path: /Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/docs/references/De_Prado
Current sys.path: ['/Users/paulkelendji/miniconda3/envs/financial_math/lib/python38.zip', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/lib-dynload', '', '/Users/paulkelendji/miniconda3/envs/financial_math/lib/python3.8/site-packages', '/Users/paulkelendji/Desktop/GitHub_paul/ML-Asset_Management/docs/references/De_Prado']


In [3]:
# Load the IVE price records from the parquet file (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_parquet("../Data/IVE_kibot.parq")
df.head()

Unnamed: 0_level_0,price,bid,ask,size,v,dv
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-09-28 09:30:00,50.79,50.7,50.79,100,100,5079.0
2009-09-28 09:30:00,50.71,50.7,50.79,638,638,32352.98
2009-09-28 09:31:32,50.75,50.75,50.76,100,100,5075.0
2009-09-28 09:31:33,50.75,50.72,50.75,100,100,5075.0
2009-09-28 09:31:50,50.75,50.73,50.76,300,300,15225.0


In [4]:
# Load the chapter-2 objects stored in ../Data/variables_ch2.pkl.
# NOTE(review): the script is executed first, presumably so that the classes of
# the pickled objects exist in the notebook namespace when unpickling -- confirm.
%run ../ch_02/code_ch_02.py

path = '../Data/variables_ch2.pkl'
import pickle
# SECURITY: pickle.load executes arbitrary code from the file; only use it on
# files produced by this project.
with open(path, 'rb') as f:
    # Two objects are read back-to-back from the same file, so the order here
    # must match the order in which they were dumped.
    bars = pickle.load(f)
    bar_time = pickle.load(f)


### SNIPPET 3.1 DAILY VOLATILITY ESTIMATES

In [5]:
def getDailyVol(close, span0=100):
    """Exponentially weighted volatility of roughly one-day returns.

    Parameters
    ----------
    close : pd.Series
        Prices indexed by a sorted DatetimeIndex.
    span0 : int, default 100
        Span of the exponentially weighted standard deviation.

    Returns
    -------
    pd.Series
        Volatility estimate indexed by the timestamps of `close` that have a
        bar at least one day earlier; the first value is NaN.
    """
    # Position where (timestamp - 1 day) would be inserted into the index.
    lookback_pos = close.index.searchsorted(close.index - pd.Timedelta(days=1))
    # Observations with nothing before the one-day mark cannot form a return.
    lookback_pos = lookback_pos[lookback_pos > 0]

    n_valid = lookback_pos.shape[0]
    current_stamps = close.index[close.shape[0] - n_valid:]
    # Bar immediately preceding the insertion point.
    previous_stamps = close.index[lookback_pos - 1].values

    # Return from the previous bar to the current one.
    daily_ret = close.loc[current_stamps] / close.loc[previous_stamps].values - 1
    return daily_ret.ewm(span=span0).std()

In [6]:
# Volatility estimate on the time bars' close prices, keyed by each bar's
# `time_close` timestamp; uses the default span0=100.
getDailyVol(bar_time.df_OLHC.set_index('time_close')['close'])

time_close
2009-10-01 09:29:55         NaN
2009-10-02 09:30:00    0.016992
2009-10-02 16:00:00    0.012555
2009-10-05 09:30:00    0.019462
2009-10-06 09:29:57    0.026989
                         ...   
2024-07-23 09:30:00    0.007871
2024-07-24 09:30:00    0.007815
2024-07-25 09:30:00    0.007811
2024-07-26 09:30:00    0.007754
2024-07-26 16:00:00    0.007903
Name: close, Length: 4521, dtype: float64

### SNIPPET 3.2 TRIPLE-BARRIER LABELING METHOD

In [7]:
def applyPtSlOnT1(close, events, ptSl, molecule):
    """Find the earliest stop-loss and profit-taking touch for each event.

    Parameters
    ----------
    close : pd.Series
        Prices indexed by timestamp.
    events : pd.DataFrame
        Indexed by event start time, with columns:
        't1' (end of the event; NaT means "until the last price"),
        'trgt' (unit width of the horizontal barriers, as a return) and
        'side' (multiplier applied to the path returns).
    ptSl : sequence of two numbers
        ptSl[0] scales 'trgt' into the profit-taking level and ptSl[1] into
        the stop-loss level. A non-positive factor disables that barrier.
    molecule : list-like
        Labels of `events.index` to process in this call.

    Returns
    -------
    pd.DataFrame
        Indexed like ``events.loc[molecule]`` with columns 't1' (copied from
        the input), 'sl' and 'pt' (timestamp of the first touch, NaT if the
        barrier was not touched before 't1').
    """
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events = events.loc[molecule]
    out = events[['t1']].copy(deep=True)

    # A disabled barrier is an all-NaN float series: comparisons against NaN
    # are always False, so no touch is ever recorded. The dtype is explicit
    # because the dtype-less constructor relies on a deprecated default.
    if ptSl[0] > 0:
        pt = ptSl[0] * events['trgt']
    else:
        pt = pd.Series(float('nan'), index=events.index, dtype='float64')

    if ptSl[1] > 0:
        sl = -ptSl[1] * events['trgt']
    else:
        sl = pd.Series(float('nan'), index=events.index, dtype='float64')

    # Series.iteritems() was removed in pandas 2.0; items() works on all versions.
    for loc, t1 in events['t1'].fillna(close.index[-1]).items():
        df0 = close.loc[loc:t1]  # path prices
        df0 = (df0 / close.loc[loc] - 1) * events.at[loc, 'side']  # path returns

        # index.min() of an empty selection is NaT, i.e. "barrier not touched"
        out.loc[loc, 'sl'] = df0[df0 < sl.loc[loc]].index.min()  # earliest stop loss
        out.loc[loc, 'pt'] = df0[df0 > pt.loc[loc]].index.min()  # earliest profit taking

    return out

### SNIPPET 3.3 GETTING THE TIME OF FIRST TOUCH

In [None]:
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False):
    """Find the time of the first barrier touch for each event.

    Parameters
    ----------
    close : pd.Series
        Prices indexed by timestamp; forwarded to applyPtSlOnT1.
    tEvents : index-like of timestamps
        Start time of every candidate event.
    ptSl : number
        Factor applied to `trgt` for both horizontal barriers (passed on as
        ``[ptSl, ptSl]``); a non-positive value disables them.
    trgt : pd.Series
        Unit width of the horizontal barriers, as a return, per timestamp.
    minRet : number
        Events whose target is not strictly greater than this are discarded.
    numThreads : int
        Forwarded to mpPandasObj.
    t1 : pd.Series or False, default False
        Vertical-barrier timestamps indexed by event start. False means no
        vertical barrier (all NaT).

    Returns
    -------
    pd.DataFrame
        Indexed by event start, with columns 't1' (earliest of the vertical
        barrier and the horizontal touches) and 'trgt'.

    Notes
    -----
    Relies on `mpPandasObj` and `applyPtSlOnT1` being available in the
    calling namespace.
    """
    # 1) get target
    # reindex instead of .loc: pandas >= 1.0 raises KeyError when any label in
    # tEvents is missing from trgt (e.g. events before the first target
    # estimate). Missing events become NaN and are removed by the filter below.
    trgt = trgt.reindex(tEvents)
    trgt = trgt[trgt > minRet]  # minRet

    # 2) get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) form events object, apply stop loss on t1
    side_ = pd.Series(1., index=trgt.index)
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])

    df0 = mpPandasObj(
        func=applyPtSlOnT1,
        pdObj=('molecule', events.index),
        numThreads=numThreads,
        close=close,
        events=events,
        ptSl=[ptSl, ptSl],
    )

    # Earliest of t1 / sl / pt per event; rows where all three are missing are dropped first.
    events['t1'] = df0.dropna(how='all').min(axis=1)  # pd.min ignores nan
    events = events.drop('side', axis=1)
    return events

# For the dollar bars, tEvents are the index labels of bars['Dollar'].df_OLHC where the 'cusum' column equals 1.


#### SNIPPET 3.4 ADDING A VERTICAL BARRIER

In [None]:
# Vertical barrier: for each event, locate the first bar at or after the event
# time plus numDays days.
# NOTE(review): close, tEvents and numDays are not defined in any visible cell;
# they must exist in the notebook namespace before this cell is run.
t1=close.index.searchsorted(tEvents+pd.Timedelta(days=numDays)) 
# Discard positions that fall past the last available bar.
t1=t1[t1<close.shape[0]] 
# Convert positions back to timestamps, keyed by the leading events that still
# have a barrier; trailing events are left out of the Series.
t1=pd.Series(close.index[t1],index=tEvents[:t1.shape[0]]) # NaNs at end

#### SNIPPET 3.5 LABELING FOR SIDE AND SIZE

In [None]:
def getBins(events,close):
    """Label each event with its realized return and the sign of that return.

    Parameters
    ----------
    events : pd.DataFrame
        Indexed by event start time, with a 't1' column holding the event end
        time. Rows with a missing 't1' are ignored.
    close : pd.Series
        Prices indexed by timestamp.

    Returns
    -------
    pd.DataFrame
        Indexed by event start time, with columns 'ret' (return between the
        start and 't1') and 'bin' (sign of 'ret').
    """
    # Only events with a known end time can be labeled.
    labeled = events.dropna(subset=['t1'])
    end_times = labeled['t1'].values

    # Prices at every start/end timestamp; gaps take the next available price.
    stamps = labeled.index.union(end_times).drop_duplicates()
    prices = close.reindex(stamps, method='bfill')

    start_px = prices.loc[labeled.index]
    end_px = prices.loc[end_times].values

    out = pd.DataFrame(index=labeled.index)
    out['ret'] = end_px / start_px - 1
    out['bin'] = np.sign(out['ret'])
    return out

### Understanding Learning Side and Size of a Bet

1. **Learning Side and Size of a Bet**:
    - **Side**: This refers to the direction of the bet - whether it is a long position (betting the price will go up) or a short position (betting the price will go down).
    - **Size**: This refers to the magnitude or amount of the bet. It determines how much you should bet given the level of confidence in the prediction.

In the context of this section, you aim to label the data in a way that helps a machine learning algorithm learn both the direction (side) and the size (magnitude) of a bet. This is crucial when you don't have a model to directly dictate the sign (direction) of your position. Instead, you will use the data to infer these aspects.

2. **Feature Engineering and Preprocessing**:
    - **Dataset**: You have a series of closing prices for some financial instrument.
    - **Objective**: To process this data into a form that a machine learning model can learn from to make predictions about future bets (side and size).
    - **Triple-Barrier Method**: This method involves setting profit-taking, stop-loss, and a time limit barrier around each event (a significant change or trigger point in the price series). The aim is to observe which barrier the price touches first.
    - **Labeling**: By applying these barriers, you can label the data with:
        - The outcome of the bet (profit or loss, which side was touched).
        - The magnitude of the price movement (how far did it go before touching a barrier).
    - **Sampling**: The data points are sampled around these events, and their respective labels (side and size) are used to train the machine learning model.

### Breakdown of Snippet 3.3 `getEvents` Function

```python
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False):
    # 1) get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # minRet

    # 2) get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) form events object, apply stop loss on t1
    side_ = pd.Series(1., index=trgt.index)
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])
    df0 = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index), 
                      numThreads=numThreads, close=close, events=events, ptSl=[ptSl, ptSl])
    events['t1'] = df0.dropna(how='all').min(axis=1)  # pd.min ignores nan
    events = events.drop('side', axis=1)
    return events
```

- **Step-by-Step Explanation**:
    1. **Get Target**:
        - `trgt = trgt.loc[tEvents]`: Targets are extracted at event timestamps.
        - `trgt = trgt[trgt > minRet]`: Filter targets that are greater than a minimum return threshold.
    2. **Get t1 (Max Holding Period)**:
        - `if t1 is False: t1 = pd.Series(pd.NaT, index=tEvents)`: If `t1` (maximum holding period) is not provided, create a series of NaT (Not a Time) indexed by event timestamps.
    3. **Form Events Object**:
        - `side_ = pd.Series(1., index=trgt.index)`: Create a series of `1`s to denote the side (direction) of each event (default is long).
        - `events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])`: Combine `t1`, `trgt`, and `side` into a dataframe, dropping any rows where `trgt` is NaN.
    4. **Apply Stop Loss and Profit Taking**:
        - `df0 = mpPandasObj(...)`: This function applies stop-loss and profit-taking levels in parallel using multiple threads. It calls the function `applyPtSlOnT1` on each subset (`molecule`) of the event indices.
        - `events['t1'] = df0.dropna(how='all').min(axis=1)`: Update `t1` with the earliest time a barrier (stop-loss or profit-taking) is touched.
        - `events = events.drop('side', axis=1)`: Drop the `side` column as it's not needed further.

### Interpretation and Use of `tEvents`

- **tEvents**: This is a series of timestamps indicating where significant events occur in the price series. These events are points where you believe there might be a potential change in the trend or some action should be taken (like evaluating whether to buy or sell).
- **Usage**: These timestamps serve as the starting points for applying the triple-barrier method, which then evaluates how the price moves after each event and whether it hits the predefined barriers (profit-taking, stop-loss, or the maximum holding period).

### Summarizing `trgt` (Targets)

- **trgt**: This represents the target returns, expressed in absolute returns. It determines the price movement threshold that must be met or exceeded for a particular event. This value is used to set the width of the profit-taking and stop-loss barriers.
- **From Bar to Bar**: In the context of bars (like dollar bars), the targets might be the returns from one bar to the next. When applying the triple-barrier method, these targets help in defining the horizontal barriers to see if they are breached within a specified holding period.

# =============================================================================

Is my understanding correct? We have a dataset of closing prices, and this section is about feature engineering and preprocessing our dataset by sampling relevant data points and assigning an appropriate side and size to each, so that the ML algorithm can learn from them and apply what it learned to unseen data.

# =============================================================================

Yes, your understanding is correct. Let's break it down further to ensure clarity:

1. **Dataset**:
   - You have a dataset consisting of closing prices for some financial instrument, likely in the form of time series data.

2. **Objective**:
   - The main objective is to preprocess this dataset to extract relevant features and labels that will help a machine learning algorithm learn how to predict the side (direction) and size (magnitude) of a bet.

3. **Feature Engineering and Preprocessing**:
   - **Triple-Barrier Method**: This method involves setting up three barriers around each event:
     - **Profit-Taking Barrier**: A threshold that, when crossed, indicates a profitable exit point.
     - **Stop-Loss Barrier**: A threshold that, when crossed, indicates a loss and signals to exit the trade to prevent further losses.
     - **Time Limit Barrier**: A maximum holding period, after which the position is closed regardless of whether the profit or stop-loss barriers have been touched.
   - **Events (tEvents)**: These are specific timestamps in your data where significant events occur (e.g., large price movements). These events are used as starting points for applying the triple-barrier method.
   - **Targets (trgt)**: These represent the unit width of the horizontal barriers, expressed as a return (for example, a volatility estimate such as the output of `getDailyVol`). The profit-taking and stop-loss thresholds are multiples of `trgt`, so the barriers adapt to recent price movements.

4. **Process**:
   - **Identify Significant Events**: Use `tEvents` to mark timestamps of potential trading signals or significant market movements.
   - **Apply Barriers**: For each event, apply the profit-taking, stop-loss, and time limit barriers.
   - **Label Data**: Determine which barrier is touched first and label the data accordingly. The label includes:
     - The side of the bet (whether the price went up or down).
     - The size of the bet (the magnitude of the price movement).
   - **Sampling**: Sample the data around these events and the corresponding labels to create a dataset that the machine learning algorithm can learn from.

5. **Purpose**:
   - The processed data with features (like the closing prices and barrier crossings) and labels (side and size of bets) will be used to train a machine learning model.
   - The model learns patterns in the data that help it predict the side and size of future bets based on similar historical patterns.

By preprocessing and feature engineering your dataset in this manner, you ensure that the machine learning model is provided with meaningful and structured data, enabling it to learn effectively and make accurate predictions on unseen data.

### Detailed Steps in Code:

1. **Define Targets**:
   - Calculate the target returns and filter out those below a minimum threshold (`minRet`).

2. **Determine Holding Period**:
   - Set a maximum holding period (`t1`), if not provided, initialize it.

3. **Form Events Object**:
   - Combine the targets, holding period, and a default side (direction) into an `events` DataFrame.

4. **Apply Barriers**:
   - Use the `mpPandasObj` function to apply the stop-loss and profit-taking barriers across the dataset in parallel.
   - Update the holding period (`t1`) with the earliest time a barrier is touched.

5. **Return Processed Events**:
   - Return the processed `events` DataFrame, which includes the times when barriers are touched and the corresponding target returns.

By following these steps, you transform your raw dataset of closing prices into a structured dataset that is suitable for training a machine learning model to predict trading signals and bet sizes.

# =============================================================================

## How the different functions work together:

# =============================================================================

Let's break down how each of these functions works and how they fit together in the overall process of labeling data for a machine learning model in the context of the triple-barrier method.

### SNIPPET 3.2: TRIPLE-BARRIER LABELING METHOD

#### Function: `applyPtSlOnT1`

```python
def applyPtSlOnT1(close, events, ptSl, molecule):
    # apply stop loss/profit taking, if it takes place before t1 (end of event)
    events = events.loc[molecule]
    out = events[['t1']].copy(deep=True)

    if ptSl[0] > 0:
        pt = ptSl[0] * events['trgt']
    else:
        pt = pd.Series(index=events.index)  # NaNs

    if ptSl[1] > 0:
        sl = -ptSl[1] * events['trgt']
    else:
        sl = pd.Series(index=events.index)  # NaNs

    for loc, t1 in events['t1'].fillna(close.index[-1]).iteritems():
        df0 = close[loc:t1]  # path prices
        df0 = (df0 / close[loc] - 1) * events.at[loc, 'side']  # path returns

        out.loc[loc, 'sl'] = df0[df0 < sl[loc]].index.min()  # earliest stop loss
        out.loc[loc, 'pt'] = df0[df0 > pt[loc]].index.min()  # earliest profit taking

    return out
```

**Explanation**:
1. **Initialize Output**: The function starts by selecting a subset (`molecule`) of the `events` DataFrame and initializing the output DataFrame `out` with a deep copy of the `t1` column.
2. **Profit-Taking and Stop-Loss Targets**: It then sets up the profit-taking (`pt`) and stop-loss (`sl`) targets based on the `ptSl` values and the targets (`trgt`) for each event.
3. **Iterate Through Events**: For each event (identified by `loc`), it computes the returns from the `close` prices, adjusts them by the side of the bet (`side`), and determines the earliest time the stop-loss (`sl`) or profit-taking (`pt`) barriers are hit.
4. **Update Output**: It updates the `out` DataFrame with the earliest times for stop-loss and profit-taking.

### SNIPPET 3.3: GETTING THE TIME OF FIRST TOUCH

#### Function: `getEvents`

```python
def getEvents(close, tEvents, ptSl, trgt, minRet, numThreads, t1=False):
    # 1) get target
    trgt = trgt.loc[tEvents]
    trgt = trgt[trgt > minRet]  # minRet

    # 2) get t1 (max holding period)
    if t1 is False:
        t1 = pd.Series(pd.NaT, index=tEvents)

    # 3) form events object, apply stop loss on t1
    side_ = pd.Series(1., index=trgt.index)
    events = pd.concat({'t1': t1, 'trgt': trgt, 'side': side_}, axis=1).dropna(subset=['trgt'])

    df0 = mpPandasObj(func=applyPtSlOnT1, pdObj=('molecule', events.index), numThreads=numThreads, close=close, events=events, ptSl=[ptSl, ptSl])

    events['t1'] = df0.dropna(how='all').min(axis=1)  # pd.min ignores nan
    events = events.drop('side', axis=1)
    return events
```

**Explanation**:
1. **Filter Targets**: The function first filters the `trgt` series to include only those events where the target exceeds `minRet`.
2. **Max Holding Period**: It sets up the maximum holding period (`t1`), defaulting to NaT if not provided.
3. **Form Events Object**: It creates an `events` DataFrame that includes the `t1` (holding period), `trgt` (targets), and `side` (default set to 1).
4. **Apply Barriers**: It uses the `mpPandasObj` function to apply the `applyPtSlOnT1` function in parallel, computing the times when the stop-loss or profit-taking barriers are first hit.
5. **Update Events**: It updates the `t1` column in the `events` DataFrame with the earliest times barriers were hit and removes the `side` column.

### SNIPPET 3.4: ADDING A VERTICAL BARRIER

#### Code:

```python
t1 = close.index.searchsorted(tEvents + pd.Timedelta(days=numDays))
t1 = t1[t1 < close.shape[0]]
t1 = pd.Series(close.index[t1], index=tEvents[:t1.shape[0]])  # NaNs at end
```

**Explanation**:
1. **Calculate Vertical Barrier**: This code calculates the vertical barrier (maximum holding period) by finding, for each event in `tEvents`, the position in `close.index` of the first bar at or after the event time plus `numDays` days.
2. **Filter Valid Indexes**: It ensures the index positions are within the bounds of the `close` DataFrame.
3. **Create Series**: It creates a `Series` with the timestamps of the vertical barriers, indexed by `tEvents`.

### SNIPPET 3.5: LABELING FOR SIDE AND SIZE

#### Function: `getBins`

```python
def getBins(events, close):
    # 1) prices aligned with events
    events_ = events.dropna(subset=['t1'])
    px = events_.index.union(events_['t1'].values).drop_duplicates()
    px = close.reindex(px, method='bfill')
    
    # 2) create out object
    out = pd.DataFrame(index=events_.index)
    out['ret'] = px.loc[events_['t1'].values].values / px.loc[events_.index] - 1
    out['bin'] = np.sign(out['ret'])
    
    return out
```

**Explanation**:
1. **Align Prices with Events**: The function first aligns the prices in the `close` DataFrame with the events by reindexing `close` to include both the event times and the barrier hit times.
2. **Calculate Returns**: It calculates the returns (`ret`) for each event by comparing the closing price at the barrier hit time (`t1`) with the closing price at the event time.
3. **Determine Bin**: It determines the bin (side) of each return, which is simply the sign of the return (`1` for positive, `-1` for negative, and `0` when the return is exactly zero).
4. **Create Output**: It creates an output DataFrame that includes the returns and the bins (sides) for each event.

### How They Work Together:

1. **Adding Vertical Barriers (SNIPPET 3.4)**:
   - This snippet is used to set up the maximum holding period (`t1`) for each event in `tEvents`.

2. **Getting the Time of First Touch (SNIPPET 3.3)**:
   - The `getEvents` function utilizes the `tEvents` and the vertical barriers (`t1`) to apply the triple-barrier method. It calls `applyPtSlOnT1` to determine when the stop-loss or profit-taking barriers are hit and updates the `events` DataFrame with these times.

3. **Applying Profit-Taking and Stop-Loss (SNIPPET 3.2)**:
   - The `applyPtSlOnT1` function is called within `getEvents` to apply the stop-loss and profit-taking barriers and return the times when these barriers are hit.

4. **Labeling for Side and Size (SNIPPET 3.5)**:
   - Finally, the `getBins` function takes the `events` DataFrame (now containing the times when barriers were hit) and calculates the returns and sides for each event, resulting in a labeled dataset ready for machine learning.

In summary, these snippets work together to preprocess the dataset by identifying significant events, applying the triple-barrier method, and labeling the data with the side and size of the bet. This labeled dataset is then ready for training a machine learning model to predict trading signals.