### Import packages

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read in Raw file

In [5]:
# Location of the raw price file, relative to this notebook
data_route = "../../data/raw_data.csv"

# Load the file, then convert the Date column from text to pandas datetimes
data = pd.read_csv(data_route)
data = data.assign(Date=lambda frame: pd.to_datetime(frame['Date']))

In [6]:

# Reduction applied to each column inside every (Ticker, calendar day) bucket.
daily_rules = {
    'Open': 'first',    # first price of the day
    'High': 'max',      # highest price of the day
    'Low': 'min',       # lowest price of the day
    'Close': 'last',    # last price of the day
    'Volume': 'sum',    # total volume of the day
    'Ticker': 'size',   # number of raw rows (data points) that fall in the day
}

# Bucket the raw rows by ticker and by day, then collapse each bucket to one row.
ticker_by_day = data.groupby(['Ticker', pd.Grouper(key='Date', freq='D')])
daily_data = ticker_by_day.agg(daily_rules)
daily_data = daily_data.rename(columns={'Ticker': 'Daily_Transactions'})
daily_data = daily_data.reset_index()

# Per-ticker summaries: how many distinct days are covered, and how many
# raw rows an average day contains.
days_per_ticker = daily_data.groupby('Ticker')['Date'].nunique()
avg_daily_transactions = daily_data.groupby('Ticker')['Daily_Transactions'].mean()

# Report the results
print("Number of days of data for each ticker:")
print(days_per_ticker)

print("\nDaily data with number of transactions:")
print(daily_data.head(10))

print("\nAverage daily number of transactions for each ticker:")
print(avg_daily_transactions)


Number of days of data for each ticker:
Ticker
BTC-USD     709
DOGE-USD    709
ETH-USD     709
SOL-USD     709
SPY         485
XRP-USD     709
Name: Date, dtype: int64

Daily data with number of transactions:
    Ticker                      Date          Open          High  \
0  BTC-USD 2023-04-01 00:00:00+00:00  28473.332031  28802.457031   
1  BTC-USD 2023-04-02 00:00:00+00:00  28462.845703  28518.958984   
2  BTC-USD 2023-04-03 00:00:00+00:00  28183.080078  28475.623047   
3  BTC-USD 2023-04-04 00:00:00+00:00  27795.273438  28433.742188   
4  BTC-USD 2023-04-05 00:00:00+00:00  28169.726562  28739.238281   
5  BTC-USD 2023-04-06 00:00:00+00:00  28175.226562  28178.384766   
6  BTC-USD 2023-04-07 00:00:00+00:00  28038.966797  28111.593750   
7  BTC-USD 2023-04-08 00:00:00+00:00  27923.943359  28159.863281   
8  BTC-USD 2023-04-09 00:00:00+00:00  27952.367188  28532.830078   
9  BTC-USD 2023-04-10 00:00:00+00:00  28336.027344  29771.464844   

            Low         Close        Volum

This finding makes sense: SPY only trades on exchange business days (no weekends or market holidays), whereas the crypto pairs trade every day.

## Features engineering
Now we have to calculate the key metrics, similar to how we did it in the group component.

### Create the hourly, 3-hourly, daily, weekly, and monthly realised variance (RV)
Following Corsi (2009), weekly RV is defined here as the rolling average of the daily RV over 7 days, and monthly RV as the rolling average of the daily RV over 30 days.

In [7]:

# Ensure Date is datetime, which accounts for hourly data
data['Date'] = pd.to_datetime(data['Date'])

# Sort by Ticker and Date so that the row-based diffs, shifts and rolling
# windows below run in chronological order within each ticker
data = data.sort_values(['Ticker', 'Date'])


def _log_rv(rv):
    """Return ln(rv), with ln(0) = -inf replaced by 0; NaN inputs stay NaN.

    The divide-by-zero RuntimeWarning that log(0) emits is silenced because
    those values are handled explicitly by the replacement.

    NOTE(review): 0 on the log scale corresponds to RV = 1, which is far
    above the typical log-RV levels in this data (ticker means are roughly
    -14 to -6) -- confirm that 0 rather than NaN is the intended sentinel.
    """
    with np.errstate(divide='ignore'):
        return np.log(rv).replace(-np.inf, 0)


# Hourly and 3-hourly log returns: difference of log Close over 1 and 3 rows
# within each ticker (row-based, so gaps in the data are not accounted for)
log_close = np.log(data['Close'])
data['ln_hourly_return'] = log_close.groupby(data['Ticker']).diff()
data['ln_3_hourly_return'] = log_close.groupby(data['Ticker']).diff(3)

# Hourly realized variance: squared hourly log return
data['hourly_rv'] = data['ln_hourly_return']**2

# 3-hourly RV: sum of squared hourly log returns inside fixed 3-hour bins.
# Every row of a bin receives the bin total. The lowercase '3h' alias is used
# because the uppercase '3H' spelling is deprecated in recent pandas.
data['3_hourly_rv'] = (
    data.groupby(['Ticker', pd.Grouper(key='Date', freq='3h')])['hourly_rv']
    .transform('sum')
)

# Daily RV: sum of squared hourly returns per ticker and calendar day
data['Date_floor'] = data['Date'].dt.floor('D')  # temporary key for daily grouping
daily_rv = (
    data.groupby(['Ticker', 'Date_floor'])['hourly_rv']
    .sum()
    .reset_index()
    .rename(columns={'hourly_rv': 'daily_rv'})
)

# Merge daily RV back onto every hourly row
data = data.merge(daily_rv, on=['Ticker', 'Date_floor'], how='left')

# Logarithmic transformations for the intraday RV measures
data['ln_hourly_rv'] = _log_rv(data['hourly_rv'])
data['ln_3_hourly_rv'] = _log_rv(data['3_hourly_rv'])

# One row per (Ticker, day) for the weekly and monthly calculations.
# `daily_rv` already has exactly that shape, so an independent copy is used
# instead of de-duplicating a slice of the hourly frame and mutating it.
daily_df = daily_rv.copy()
daily_df['ln_daily_rv'] = _log_rv(daily_df['daily_rv'])

# Weekly / monthly RV: rolling average of daily RV over the last 7 / 30 daily
# rows of each ticker (NaN until a full window is available), plus its log
for horizon, window in (('weekly', 7), ('monthly', 30)):
    daily_df[f'{horizon}_rv'] = (
        daily_df.groupby('Ticker')['daily_rv']
        .transform(lambda x: x.rolling(window=window, min_periods=window).sum() / window)
    )
    daily_df[f'ln_{horizon}_rv'] = _log_rv(daily_df[f'{horizon}_rv'])

# Lag the log daily, weekly and monthly RV by one and two daily rows per ticker
for rv_col in ('ln_daily_rv', 'ln_weekly_rv', 'ln_monthly_rv'):
    for lag in (1, 2):
        daily_df[f'{rv_col}_lag{lag}'] = daily_df.groupby('Ticker')[rv_col].shift(lag)

# Merge the daily-level features back onto the hourly rows and drop the
# temporary Date_floor key.
# NOTE: `daily_rv` exists on both sides of this merge, so pandas suffixes it
# into `daily_rv_x` / `daily_rv_y`; a later cell consolidates the pair back
# into a single `daily_rv` column, so the suffixed names are kept on purpose.
data = (
    data.merge(daily_df, on=['Ticker', 'Date_floor'], how='left')
    .drop(columns=['Date_floor'])
)


  data.groupby(['Ticker', pd.Grouper(key='Date', freq='3H')])['ln_hourly_return']
  result = getattr(ufunc, method)(*inputs, **kwargs)


### Lag the RV and the returns by 1 and 2 periods



In [8]:
# (new column, source column, rows to shift) -- listed in the order in which
# the columns are appended to `data`.
# Hourly series are lagged by 1 and 2 rows; the 3-hourly series are lagged by
# 3 and 6 rows (presumably because they are stored on every hourly row).
lag_specs = [
    # log realized variance, lag 1
    ('ln_hourly_rv_lag1', 'ln_hourly_rv', 1),
    ('ln_3_hourly_rv_lag1', 'ln_3_hourly_rv', 3),
    # log realized variance, lag 2
    ('ln_hourly_rv_lag2', 'ln_hourly_rv', 2),
    ('ln_3_hourly_rv_lag2', 'ln_3_hourly_rv', 6),
    # log returns, lag 1
    ('ln_hourly_return_lag1', 'ln_hourly_return', 1),
    ('ln_3_hourly_return_lag1', 'ln_3_hourly_return', 3),
    # log returns, lag 2
    ('ln_hourly_return_lag2', 'ln_hourly_return', 2),
    ('ln_3_hourly_return_lag2', 'ln_3_hourly_return', 6),
    # realized variance in levels
    ('hourly_rv_lag1', 'hourly_rv', 1),
    ('hourly_rv_lag2', 'hourly_rv', 2),
    ('three_hourly_rv_lag1', '3_hourly_rv', 3),
    ('three_hourly_rv_lag2', '3_hourly_rv', 6),
]

# Shift inside each ticker so a lag never borrows a value from another ticker
for lag_col, source_col, periods in lag_specs:
    data[lag_col] = data.groupby('Ticker')[source_col].shift(periods)

# Write the lagged frame out as CSV
data.to_csv('../../data/processed_data.csv', index=False)


### Consolidate the duplicated daily RV columns

In [9]:
# The feature-engineering merges leave the daily RV on the frame twice, as
# `daily_rv_x` and `daily_rv_y` (pandas' suffixes for a column present on both
# sides of a merge). Keep one copy under the plain name `daily_rv` and drop
# both suffixed columns.
# The guard and `errors='ignore'` make the cell safe to re-run: once the
# suffixed columns are gone it is a no-op instead of raising a KeyError.
if 'daily_rv_x' in data.columns:
    data = data.assign(daily_rv=data['daily_rv_x'])
data = data.drop(columns=['daily_rv_x', 'daily_rv_y'], errors='ignore')


## Classification
In my model, I wish to classify each coin as high risk, medium risk, or low risk, based on its realised variance (RV). High RV constitutes high risk, and likewise for medium and low risk. To determine the risk level, I use a weighted composite of each ticker's mean log hourly, 3-hourly, and daily RV.

In [10]:

# Weights for each metric --> more weight on daily RV, since it's more stable.
# The keys double as the list of columns to aggregate, so the aggregation and
# the composite score cannot drift apart.
weights = {
    'ln_hourly_rv': 0.2,  # 20% weight
    'ln_3_hourly_rv': 0.3,  # 30% weight
    'ln_daily_rv': 0.5  # 50% weight
}

# Mean log realized variance per ticker for each weighted metric
ticker_rv_aggregated = (
    data.groupby('Ticker')
    .agg({rv_col: 'mean' for rv_col in weights})
    .reset_index()
)

# Composite score: weighted sum of the per-ticker means
ticker_rv_aggregated['Composite_Score'] = sum(
    ticker_rv_aggregated[rv_col] * weight for rv_col, weight in weights.items()
)

# Bin edges: minimum, 33rd percentile, 66th percentile and maximum of the
# composite score.
# NOTE(review): the edges are computed over every ticker, including SPY, which
# is relabelled "Baseline" further down -- confirm that the baseline asset is
# meant to influence the thresholds applied to the coins.
min_score = ticker_rv_aggregated['Composite_Score'].min()
max_score = ticker_rv_aggregated['Composite_Score'].max()
bins = sorted({
    min_score,
    ticker_rv_aggregated['Composite_Score'].quantile(0.33),  # 33rd percentile
    ticker_rv_aggregated['Composite_Score'].quantile(0.66),  # 66th percentile
    max_score,
})  # the set removes duplicate edges, which pd.cut would reject

# Three labels need exactly four distinct edges. Tied scores can collapse the
# edges, so fail with a clear message instead of pd.cut's generic
# label-count error.
risk_labels = ['Low Risk', 'Medium Risk', 'High Risk']
if len(bins) != len(risk_labels) + 1:
    raise ValueError(
        f"Expected {len(risk_labels) + 1} distinct bin edges for {len(risk_labels)} "
        f"risk labels, got {len(bins)}: composite scores are too tied to split."
    )

# Classify tickers
ticker_rv_aggregated['Risk'] = pd.cut(
    ticker_rv_aggregated['Composite_Score'],
    bins=bins,
    labels=risk_labels,
    include_lowest=True  # Include the minimum value in the first bin
)

# SPY is not ranked with the coins: add a "Baseline" category and assign it
ticker_rv_aggregated['Risk'] = ticker_rv_aggregated['Risk'].cat.add_categories('Baseline')
ticker_rv_aggregated.loc[ticker_rv_aggregated['Ticker'] == 'SPY', 'Risk'] = 'Baseline'

# Sort by Risk (categorical order: Low, Medium, High, Baseline)
ticker_rv_aggregated = ticker_rv_aggregated.sort_values('Risk')

# Display the resulting DataFrame
print(ticker_rv_aggregated)

# Merge the label back onto the hourly data. Any `Risk` column left over from
# a previous run of this cell is dropped first, so re-running does not create
# `Risk_x` / `Risk_y` duplicates.
data = (
    data.drop(columns=['Risk'], errors='ignore')
    .merge(ticker_rv_aggregated[['Ticker', 'Risk']], on='Ticker', how='left')
)

# Just to be sure, make sure date is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

     Ticker  ln_hourly_rv  ln_3_hourly_rv  ln_daily_rv  Composite_Score  \
0   BTC-USD    -12.712268      -10.660145    -7.940902        -9.710948   
2   ETH-USD    -12.349248      -10.259421    -7.547093        -9.321223   
5   XRP-USD    -11.921673       -9.850787    -7.171542        -8.925341   
1  DOGE-USD    -11.577992       -9.501986    -6.815516        -8.573952   
3   SOL-USD    -11.167331       -9.110837    -6.502131        -8.217783   
4       SPY    -13.597308      -11.926745   -10.257267       -11.426119   

          Risk  
0     Low Risk  
2  Medium Risk  
5  Medium Risk  
1    High Risk  
3    High Risk  
4     Baseline  


In [11]:
# Write the engineered, risk-labelled frame to disk (row index omitted)
processed_route = "../../data/processed_data.csv"
data.to_csv(processed_route, index=False)