### Import packages

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read in Raw file

In [22]:
# Location of the raw price file, relative to this notebook
data_route = "../../data/raw_data.csv"

# Load the CSV and convert the Date column to pandas datetimes in one pass
data = pd.read_csv(data_route).assign(
    Date=lambda raw: pd.to_datetime(raw['Date'])
)

In [23]:
# Preview the first five rows to confirm the file loaded as expected
data.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
0,2023-04-03 13:30:00+00:00,408.850006,411.309998,408.820007,411.190002,14278892.0,SPY
1,2023-04-03 14:30:00+00:00,411.200012,411.369995,408.440002,409.040009,10506955.0,SPY
2,2023-04-03 15:30:00+00:00,409.059998,409.890015,408.970001,409.640106,7643987.0,SPY
3,2023-04-03 16:30:00+00:00,409.644989,409.779999,408.950012,409.209991,5003256.0,SPY
4,2023-04-03 17:30:00+00:00,409.220001,409.679993,408.899994,409.609985,6087517.0,SPY


In [24]:

# How each column collapses into a single value per ticker-day
daily_aggregations = {
    'Open': 'first',    # first price of the day
    'High': 'max',      # highest price of the day
    'Low': 'min',       # lowest price of the day
    'Close': 'last',    # last price of the day
    'Volume': 'sum',    # total volume of the day
    'Ticker': 'size',   # number of rows (hourly data points) in the day
}

# One group per ticker per calendar day
ticker_day_groups = data.groupby(['Ticker', pd.Grouper(key='Date', freq='D')])

daily_data = ticker_day_groups.agg(daily_aggregations)
daily_data = daily_data.rename(columns={'Ticker': 'Daily_Transactions'})
daily_data = daily_data.reset_index()

# Per-ticker summaries: distinct days covered and mean number of rows per day
days_per_ticker = daily_data.groupby('Ticker')['Date'].nunique()
avg_daily_transactions = daily_data.groupby('Ticker')['Daily_Transactions'].mean()

# Report the results
print("Number of days of data for each ticker:")
print(days_per_ticker)

print("\nDaily data with number of transactions:")
print(daily_data.head(10))

print("\nAverage daily number of transactions for each ticker:")
print(avg_daily_transactions)


Number of days of data for each ticker:
Ticker
BTC-USD     697
DOGE-USD    697
ETH-USD     697
SOL-USD     697
SPY         477
XRP-USD     697
Name: Date, dtype: int64

Daily data with number of transactions:
    Ticker                      Date          Open          High  \
0  BTC-USD 2023-04-01 00:00:00+00:00  28473.332031  28802.457031   
1  BTC-USD 2023-04-02 00:00:00+00:00  28462.845703  28518.958984   
2  BTC-USD 2023-04-03 00:00:00+00:00  28183.080078  28475.623047   
3  BTC-USD 2023-04-04 00:00:00+00:00  27795.273438  28433.742188   
4  BTC-USD 2023-04-05 00:00:00+00:00  28169.726562  28739.238281   
5  BTC-USD 2023-04-06 00:00:00+00:00  28175.226562  28178.384766   
6  BTC-USD 2023-04-07 00:00:00+00:00  28038.966797  28111.593750   
7  BTC-USD 2023-04-08 00:00:00+00:00  27923.943359  28159.863281   
8  BTC-USD 2023-04-09 00:00:00+00:00  27952.367188  28532.830078   
9  BTC-USD 2023-04-10 00:00:00+00:00  28336.027344  29771.464844   

            Low         Close        Volum

This finding makes sense: SPY does not trade over the weekends, so it has fewer days of data than the cryptocurrencies.

## Feature engineering
Now we have to calculate the key metrics, similar to how we did it in the group component.

### Create the hourly, 3-hourly, daily, weekly, and monthly realised variance (RV)
Following Corsi (2009), weekly RV is the rolling average of daily RV over 7 days, and monthly RV is the rolling average of daily RV over 30 days.

In [25]:
# Rolling-window lengths for weekly and monthly RV, counted in days that have
# data for the ticker (the windows run over the per-day table built below).
WEEKLY_WINDOW_DAYS = 7
MONTHLY_WINDOW_DAYS = 30


def _rolling_mean_by_ticker(frame, column, window):
    """Return the rolling mean of `column` over `window` rows, computed within each Ticker."""
    return frame.groupby('Ticker')[column].transform(
        lambda values: values.rolling(window=window).mean()
    )


# Ensure Date is datetime
data['Date'] = pd.to_datetime(data['Date'])

# Sort chronologically within each ticker so diff() and rolling() follow time
# order; the fresh RangeIndex lets the daily lookup below align row-for-row.
data = data.sort_values(['Ticker', 'Date']).reset_index(drop=True)

# Midnight of each timestamp's day: the key linking hourly rows to daily values
day_key = data['Date'].dt.normalize()

# Hourly and 3-hourly log returns of the close, within each ticker
log_close_by_ticker = np.log(data['Close']).groupby(data['Ticker'])
data['ln_hourly_return'] = log_close_by_ticker.diff()
data['ln_3_hourly_return'] = log_close_by_ticker.diff(3)

# Squared hourly log returns: the building block of every RV measure
hourly_rv = data['ln_hourly_return'] ** 2

# One row per ticker per day: last close and the sum of squared hourly returns
daily = (
    pd.DataFrame({
        'Ticker': data['Ticker'],
        'Day': day_key,
        'Close': data['Close'],
        'hourly_rv': hourly_rv,
    })
    .groupby(['Ticker', 'Day'])
    .agg(Close=('Close', 'last'), daily_rv=('hourly_rv', 'sum'))
    .reset_index()
)

# Daily log returns, plus weekly/monthly RV as rolling means of DAILY RV.
# Rolling over the per-day table keeps both windows in days for every ticker,
# regardless of how many hourly rows a ticker has per day.
daily['ln_daily_return'] = np.log(daily['Close']).groupby(daily['Ticker']).diff()
daily['weekly_rv'] = _rolling_mean_by_ticker(daily, 'daily_rv', WEEKLY_WINDOW_DAYS)
daily['monthly_rv'] = _rolling_mean_by_ticker(daily, 'daily_rv', MONTHLY_WINDOW_DAYS)

# Broadcast the daily values back onto the hourly rows. The join uses the day
# key, not the raw hourly timestamp: joining on the timestamp would only match
# rows stamped exactly at midnight and leave every other row NaN.
daily_lookup = pd.DataFrame({'Ticker': data['Ticker'], 'Day': day_key}).merge(
    daily.drop(columns='Close'),
    on=['Ticker', 'Day'],
    how='left',
    validate='many_to_one',
)

data['ln_daily_return'] = daily_lookup['ln_daily_return']

# Hourly RV
data['hourly_rv'] = hourly_rv

# 3-hourly RV: sum of squared hourly log returns within each 3-hour bucket
three_hour_key = data['Date'].dt.floor('3h')
data['3_hourly_rv'] = hourly_rv.groupby([data['Ticker'], three_hour_key]).transform('sum')

# Daily RV: sum of squared hourly log returns within each day.
# NOTE(review): every hourly row carries its day's full total, so the value
# includes returns from later hours of the same day.
data['daily_rv'] = daily_lookup['daily_rv']

# Logarithmic transformation of realized variance. A zero RV maps to -inf;
# the divide-by-zero warning is silenced because those values are handled below.
with np.errstate(divide='ignore'):
    data['ln_hourly_rv'] = np.log(data['hourly_rv'])
    data['ln_3_hourly_rv'] = np.log(data['3_hourly_rv'])
    data['ln_daily_rv'] = np.log(data['daily_rv'])

# Handle -inf for ln_hourly_rv, make it 0
# NOTE(review): ln(RV) = 0 corresponds to RV = 1, far above the typical values;
# confirm that 0 (rather than NaN or a floor) is the intended placeholder.
data['ln_hourly_rv'] = data['ln_hourly_rv'].replace(-np.inf, 0)

# Forward fill missing daily RV values within each ticker. A zero daily RV
# yields -inf (not NaN), so it is converted to NaN first; otherwise the
# forward fill has nothing to fill.
data['ln_daily_rv'] = data['ln_daily_rv'].replace(-np.inf, np.nan)
data['ln_daily_rv'] = data.groupby('Ticker')['ln_daily_rv'].ffill()

# Weekly RV (rolling mean of daily RV) and its log
data['weekly_rv'] = daily_lookup['weekly_rv']
with np.errstate(divide='ignore'):
    data['ln_weekly_rv'] = np.log(data['weekly_rv'])

# Monthly RV (rolling mean of daily RV) and its log
data['monthly_rv'] = daily_lookup['monthly_rv']
with np.errstate(divide='ignore'):
    data['ln_monthly_rv'] = np.log(data['monthly_rv'])

  data.groupby(['Ticker', pd.Grouper(key='Date', freq='3H')])['ln_hourly_return']
  result = getattr(ufunc, method)(*inputs, **kwargs)


### Lag the RV by 1



In [26]:
# Lag every log realized-variance measure by one row within each ticker.
# NOTE(review): shift(1) moves values by one row of `data`, not by one
# day/week/month; confirm this is the intended lag for the daily, weekly and
# monthly measures.
rv_columns = [
    'ln_hourly_rv',
    'ln_3_hourly_rv',
    'ln_daily_rv',
    'ln_weekly_rv',
    'ln_monthly_rv',
]
for rv_column in rv_columns:
    data[f'{rv_column}_lag1'] = data.groupby('Ticker')[rv_column].shift(1)

### Drop all non-log data

In [27]:
# Raw (non-log) RV columns were only needed to build the log features; remove them
intermediate_columns = ['hourly_rv', '3_hourly_rv', 'daily_rv', 'weekly_rv', 'monthly_rv']
data = data.drop(columns=intermediate_columns)

## Classification
In my model, I wish to classify each coin as high risk, medium risk, or low risk, based on its realised variance (a weighted composite of the hourly, 3-hourly, and daily measures). A high RV constitutes high risk, and likewise for medium and low risk.

In [None]:

# Define weights for each metric --> Give more weights to daily RV, since it's more stable
weights = {
    'ln_hourly_rv': 0.2,  # 20% weight
    'ln_3_hourly_rv': 0.3,  # 30% weight
    'ln_daily_rv': 0.5  # 50% weight
}

# Mean hourly, 3-hourly, and daily log realized variance for each ticker
ticker_rv_aggregated = data.groupby('Ticker')[list(weights)].mean().reset_index()

# Composite score: weighted sum of the per-ticker means
ticker_rv_aggregated['Composite_Score'] = sum(
    ticker_rv_aggregated[column] * weight for column, weight in weights.items()
)

# Define dynamic bins based on the composite score
min_score = ticker_rv_aggregated['Composite_Score'].min()
max_score = ticker_rv_aggregated['Composite_Score'].max()

# Bin edges: minimum, 33rd percentile, 66th percentile, maximum.
# NOTE(review): the percentiles are taken over every ticker, including SPY,
# which is relabelled "Baseline" below; confirm SPY should influence the edges.
bins = [
    min_score,
    ticker_rv_aggregated['Composite_Score'].quantile(0.33),  # 33rd percentile
    ticker_rv_aggregated['Composite_Score'].quantile(0.66),  # 66th percentile
    max_score
]

bins = sorted(set(bins))  # Remove duplicates and sort

risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']  # Initial categories

# pd.cut needs exactly one more edge than labels. If de-duplication removed an
# edge, fail here with a clear message rather than a generic error from pd.cut.
if len(bins) != len(risk_labels) + 1:
    raise ValueError(
        f"Expected {len(risk_labels) + 1} distinct bin edges for {len(risk_labels)} "
        f"risk labels, but the composite scores only produced {len(bins)}: {bins}"
    )

# Classify tickers
ticker_rv_aggregated['Risk'] = pd.cut(
    ticker_rv_aggregated['Composite_Score'],
    bins=bins,
    labels=risk_labels,  # Use the initial categories
    include_lowest=True  # Include the minimum value in the first bin
)

# Add "Baseline" to the categories of the 'Risk' column
ticker_rv_aggregated['Risk'] = ticker_rv_aggregated['Risk'].cat.add_categories('Baseline')

# For the ticker "SPY": Set the risk as "Baseline"
ticker_rv_aggregated.loc[ticker_rv_aggregated['Ticker'] == 'SPY', 'Risk'] = 'Baseline'

# Sort by Risk
ticker_rv_aggregated = ticker_rv_aggregated.sort_values('Risk')

# Display the resulting DataFrame
print(ticker_rv_aggregated)

# Merge back to original data. Any 'Risk' column left by an earlier run of this
# cell is dropped first, so re-running does not create Risk_x / Risk_y columns.
data = data.drop(columns='Risk', errors='ignore').merge(
    ticker_rv_aggregated[['Ticker', 'Risk']], on='Ticker', how='left'
)

# Just to be sure, make sure date is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

     Ticker  ln_hourly_rv  ln_3_hourly_rv  ln_daily_rv  Composite_Score  \
0   BTC-USD    -12.737050      -10.683420    -7.966684        -9.735778   
2   ETH-USD    -12.376387      -10.285618    -7.575197        -9.348561   
5   XRP-USD    -11.950287       -9.879607    -7.202063        -8.954971   
1  DOGE-USD    -11.601146       -9.521515    -6.835879        -8.594623   
3   SOL-USD    -11.184339       -9.125065    -6.520952        -8.234863   
4       SPY    -13.621181      -11.958249   -10.287492       -11.455457   

          Risk  
0     Low Risk  
2  Medium Risk  
5  Medium Risk  
1    High Risk  
3    High Risk  
4     Baseline  


In [None]:
# Write the processed frame to CSV, leaving out the row index
processed_path = "../../data/processed_data.csv"
data.to_csv(processed_path, index=False)