In [None]:
import pandas as pd
import re
import numpy as np
import logging

In [None]:
# NOTE(review): absolute, machine-specific path -- adjust it to wherever the
# workbook lives before running this notebook on another machine.
file_path = '/Users/vittoriomanfriani/Desktop/bond_data.xlsx'

# Open the workbook through a context manager so the underlying file handle is
# released as soon as the sheet has been read (previously it was never closed).
with pd.ExcelFile(file_path) as excel_data:
    # header=None keeps every spreadsheet row, including the label rows, as
    # data; the label rows are picked apart explicitly in the following cells.
    df_raw = excel_data.parse('Sheet1', header=None)

In [None]:
# Row 0 of the raw sheet carries the column labels. Any text label that
# mentions "ISIN" is cut down to its first whitespace-separated token; every
# other cell (including non-string cells) is kept untouched.
first_row = df_raw.iloc[0]
header = [
    cell.split()[0] if (isinstance(cell, str) and "ISIN" in cell) else cell
    for cell in first_row
]

In [None]:
# Collect (column position, label) pairs for every header entry that is a
# string of exactly 12 upper-case letters/digits, i.e. looks like an ISIN.
isin_pattern = re.compile(r'^[A-Z0-9]{12}$')
isin_positions = [
    (col_idx, label)
    for col_idx, label in enumerate(header)
    if isinstance(label, str) and isin_pattern.match(label)
]

In [None]:
def convert_to_datetime(val):
    """Turn a raw date cell into a timestamp where possible.

    * ``str`` values are parsed with ``pd.to_datetime``; text that cannot be
      parsed becomes ``NaT`` instead of raising.
    * ``int`` / ``float`` values are read as a number of days counted from
      1899-12-30 (presumably Excel serial dates, given the data source).
    * Any other value is handed back unchanged.
    """
    if isinstance(val, str):
        return pd.to_datetime(val, errors='coerce')
    if isinstance(val, (int, float)):
        return pd.to_datetime(val, origin='1899-12-30', unit='D')
    return val

In [None]:
# Build one tidy frame per ISIN section and collect them in `data_frames`.
# Each section is assumed to span 4 adjacent columns starting at the ISIN's
# header position: date, mid price, yield, DV01.
data_frames = []
for start_col, isin in isin_positions:
    end_col = start_col + 4

    # Rows 0-1 are label rows, so the observations start at row 2.
    # .copy() detaches the slice from df_raw: the assignments below then act on
    # an independent frame instead of a view, which avoids pandas'
    # SettingWithCopyWarning and any ambiguity about df_raw being modified.
    temp_df = df_raw.iloc[2:, start_col:end_col].copy()
    temp_df.columns = ["date", "mid_price", "yield", "mid_dv01"]
    temp_df["isin"] = isin

    temp_df["date"] = temp_df["date"].apply(convert_to_datetime)

    # Rows whose date is missing or could not be parsed (NaT/None) are dropped.
    temp_df = temp_df.dropna(subset=["date"])

    data_frames.append(temp_df)

In [None]:
# Stack the per-ISIN frames into one table, key it by (date, isin) and order
# the rows by that key.
final_df = (
    pd.concat(data_frames, ignore_index=True)
    .set_index(["date", "isin"])
    .sort_index()
)

In [None]:
# Write the sorted panel to CSV; the path is relative, so the file lands in
# the current working directory.
output_csv = 'USBond_Dataset_Ordered.csv'
final_df.to_csv(output_csv)

In [None]:
# Logger used by the validation cells below, plus a basic root configuration
# at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
# Currency code and symbol label; both are interpolated into the log message
# of the date-frequency check.
ccy, symbol_name = "USD", 'Bond'

In [None]:
# Check that the observations are (roughly) daily.
dates = final_df.index.get_level_values("date")
assert isinstance(dates, pd.DatetimeIndex), (
    f'Expected the "date" index level to be a DatetimeIndex, got {type(dates).__name__}'
)

# Gap between consecutive distinct dates, expressed in hours. The first diff
# has no predecessor (NaT), so it is skipped.
dates_diff = dates.unique().to_series().diff().iloc[1:] / np.timedelta64(1, 'h')

# Anything closer than 20 hours to the previous observation is treated as an
# intraday point (or an ordering problem) and rejected.
if np.any(dates_diff < 20):
    raise Exception('There are observations for which the timedelta is less than 1 day')

avg_diff = np.mean(dates_diff)
logger.info(f'Average timedelta for {ccy}_{symbol_name}: {avg_diff}')

# The average gap must lie in [24h, 48h). The lower bound is inclusive so that
# a strictly daily series (average gap of exactly 24 hours) is accepted.
if not (24 <= avg_diff < 48):
    # avg_diff is measured in hours; the message pieces carry explicit
    # trailing spaces because adjacent string literals are joined verbatim.
    raise Exception(
        'For (more or less) daily observations, the timedelta between each observation '
        'should be between 1 and 2 days, while the average timedelta in the data '
        f'is {avg_diff} hours.'
    )