# Ultimate prices dataset

In [1]:
import os
import pandas as pd
import warnings

from data import DATA_PATH

In [2]:
# Silence every warning for the rest of the session (no category, message or
# module filter is given, so the 'ignore' action applies to all of them).
warnings.filterwarnings('ignore')

### Read stock data

In [16]:
# Directory with the stock Excel exports that are read in this cell.
stocks_path = DATA_PATH / 'stocks'

def _process_stock_dataset(filename: str) -> pd.DataFrame:
    """Load one stock Excel export as a ``date`` / ``ticker`` / ``price`` frame.

    The workbook is read from ``stocks_path`` with its first row skipped.
    The ``'Дата'`` column is renamed to ``date`` and cast to ``str``;
    ``'Закрытие'`` is renamed to ``price``. The ticker is the part of
    ``filename`` before the first underscore, upper-cased.

    Args:
        filename: Name of the file inside ``stocks_path``.

    Returns:
        A frame with exactly the columns ``date``, ``ticker``, ``price``.
    """
    raw = pd.read_excel(stocks_path / filename, skiprows=1)
    symbol = filename.split('_')[0].upper()

    frame = raw.rename(columns={'Дата': 'date', 'Закрытие': 'price'})
    frame = frame.astype({'date': str})
    frame = frame.assign(ticker=symbol)
    return frame[['date', 'ticker', 'price']]

# Load every stock export found in ``stocks_path``.
# * ``endswith`` checks the actual extension; the previous substring test also
#   accepted any name that merely contained '.xlsx' (e.g. 'x.xlsx.bak').
# * ``sorted`` makes the load order reproducible, because ``os.listdir``
#   returns entries in arbitrary order.
stock_datasets = [
    _process_stock_dataset(filename)
    for filename in sorted(os.listdir(stocks_path))
    if filename.endswith('.xlsx')
]

### Read bond data

In [17]:
# Directory with the bond CSV exports that are read in this cell.
bonds_path = DATA_PATH / 'bonds'

def _process_bond_dataset(filename: str) -> pd.DataFrame:
    data = pd.read_csv(bonds_path / filename)
    data = (
        data.rename(columns={'<DATE>': 'date', '<TICKER>': 'ticker', '<CLOSE>': 'price'})
        .assign(date=lambda df: pd.to_datetime(df['date'], dayfirst=True).dt.date.astype('str'))
        .assign(price=lambda df: df['price'].multiply(100))  # convert to abs value
        [['date', 'ticker', 'price']]
    )
    return data

# Load every bond export found in ``bonds_path``.
# * ``endswith`` checks the actual extension; the previous substring test also
#   accepted any name that merely contained '.csv' (e.g. 'x.csv.bak').
# * ``sorted`` makes the load order reproducible, because ``os.listdir``
#   returns entries in arbitrary order.
bond_datasets = [
    _process_bond_dataset(filename)
    for filename in sorted(os.listdir(bonds_path))
    if filename.endswith('.csv')
]

### Read FX data

In [18]:
# Directory with the FX rate Excel files that are read in this cell.
fx_path = DATA_PATH / 'risk_factors'

def _process_fx_dataset(filename: str) -> pd.DataFrame:
    """Load one FX rate Excel file as a ``date`` / ``ticker`` / ``price`` frame.

    The workbook is read from ``fx_path``. The ``'Дата'`` column is renamed
    to ``date`` and cast to ``str``; ``'Курс'`` is renamed to ``price``. The
    ticker is the part of ``filename`` before the first dot, upper-cased
    (``'usd_rub.xlsx'`` -> ``'USD_RUB'``).

    Args:
        filename: Name of the file inside ``fx_path``.

    Returns:
        A frame with exactly the columns ``date``, ``ticker``, ``price``.
    """
    data = pd.read_excel(fx_path / filename)
    ticker = filename.split('.')[0].upper()
    data = (
        data.rename(columns={'Дата': 'date', 'Курс': 'price'})
        .astype({'date': str})
        .assign(ticker=ticker)
        # Restrict to the same three columns the stock and bond loaders return.
        # Any extra workbook column would be NaN for every other instrument
        # once the frames are concatenated, and a subsequent dropna() would
        # then discard those instruments' rows.
        [['date', 'ticker', 'price']]
    )
    return data

# Load the two FX rate workbooks, one frame per currency pair.
fx_datasets = list(map(_process_fx_dataset, ('eur_rub.xlsx', 'usd_rub.xlsx')))

### Check data quality

In [26]:
# Stack all per-instrument frames into one long table keyed by (date, ticker),
# discard rows containing missing values and order the rows by the index.
all_datasets = [*stock_datasets, *bond_datasets, *fx_datasets]
prices_df = pd.concat(all_datasets).set_index(['date', 'ticker'])
prices_df = prices_df.dropna().sort_index()

In [27]:
# Show the last rows of the sorted (date, ticker) price table.
prices_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
date,ticker,Unnamed: 2_level_1
2023-12-30,USD_RUB,89.6883
2023-12-31,EUR_RUB,99.1919
2023-12-31,USD_RUB,89.6883
2024-01-01,EUR_RUB,99.1919
2024-01-01,USD_RUB,89.6883


In [28]:
# (rows, columns) of the price table before incomplete dates are removed.
prices_df.shape

(17278, 1)

In [29]:
# Expected number of instruments of each kind on a single date; their sum is
# the number of prices a date must have to pass the completeness check.
# NOTE(review): presumably these equal the number of stock, bond and FX files
# loaded earlier (two FX files are listed explicitly) — confirm for stocks/bonds.
N_STOCKS = 10
N_BONDS = 5
N_CCY_PAIRS = 2

In [30]:
# Keep only the dates on which every expected instrument has a price.
expected_prices_per_date = N_STOCKS + N_BONDS + N_CCY_PAIRS
is_all_prices_on_date = (prices_df.groupby('date').count() == expected_prices_per_date)

# Collapse the per-column flags to one boolean per date and select the dates
# with an explicit 1-D mask. The previous form indexed the frame with a 2-D
# boolean array (``df[~df.values]``), which only works by accident while the
# table has a single value column.
is_incomplete_date = ~is_all_prices_on_date.all(axis=1)
dates_to_drop = is_all_prices_on_date.index[is_incomplete_date.to_numpy()].to_list()

# Dropping by first-level label removes every (date, ticker) row of those dates.
prices_df = prices_df.drop(dates_to_drop)

In [31]:
# (rows, columns) of the price table after incomplete dates were removed.
prices_df.shape

(15980, 1)

### Save dataset

In [32]:
# Write the filtered table to prices.csv under DATA_PATH; with the default
# settings the (date, ticker) index is written as the leading CSV columns.
prices_df.to_csv(DATA_PATH / 'prices.csv')