# Ultimate prices dataset

In [77]:
import os
import pandas as pd
import warnings

from data import DATA_PATH

In [78]:
# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session.
# NOTE(review): this also hides pandas/openpyxl diagnostics; consider narrowing
# it to the specific category that is noisy.
warnings.simplefilter('ignore')

### Read stocks data

In [79]:
# Directory that holds the stock workbooks read below.
stocks_path = DATA_PATH / 'stocks'

def _process_stock_dataset(filename: str) -> pd.DataFrame:
    """Load one stock workbook and reshape it to ``date, ticker, price``.

    The workbook is read from ``stocks_path / filename`` with its first row
    skipped.  The 'Дата' and 'Закрытие' columns are renamed to ``date`` and
    ``price``, ``date`` is cast to ``str``, and the ticker is taken from the
    part of ``filename`` that precedes its first dot.

    Args:
        filename: Name of the workbook inside ``stocks_path``.

    Returns:
        A frame with exactly the columns ``date``, ``ticker`` and ``price``.
    """
    raw = pd.read_excel(stocks_path / filename, skiprows=1)
    symbol, _, _ = filename.partition('.')

    quotes = raw.rename(columns={'Дата': 'date', 'Закрытие': 'price'})
    quotes = quotes.astype({'date': str})
    quotes['ticker'] = symbol
    return quotes[['date', 'ticker', 'price']]

# Build one normalized frame per Excel workbook found in ``stocks_path``.
# - ``os.listdir`` returns names in arbitrary order, so they are sorted to keep
#   the row order of the combined dataset reproducible between runs.
# - The extension is matched as a true suffix, so names that merely contain
#   '.xlsx' (e.g. 'x.xlsx.bak') are not picked up.
# - '~$...' files are the lock files Excel creates next to an open workbook;
#   they are not readable workbooks and are skipped.
stock_datasets = [
    _process_stock_dataset(filename)
    for filename in sorted(os.listdir(stocks_path))
    if filename.endswith('.xlsx') and not filename.startswith('~$')
]

### Read bonds data

In [80]:
# Directory that holds the bond CSV exports read below.
bonds_path = DATA_PATH / 'bonds'

def _process_bond_dataset(filename: str) -> pd.DataFrame:
    """Load one bond CSV and reshape it to ``date, ticker, price``.

    The file is read from ``bonds_path / filename``.  The '<DATE>', '<TICKER>'
    and '<CLOSE>' columns are renamed to ``date``, ``ticker`` and ``price``;
    dates are parsed day-first and stored as the string form of the calendar
    date; prices are multiplied by 100.

    Args:
        filename: Name of the CSV file inside ``bonds_path``.

    Returns:
        A frame with exactly the columns ``date``, ``ticker`` and ``price``.
    """
    column_map = {'<DATE>': 'date', '<TICKER>': 'ticker', '<CLOSE>': 'price'}
    quotes = pd.read_csv(bonds_path / filename).rename(columns=column_map)

    parsed_dates = pd.to_datetime(quotes['date'], dayfirst=True)
    quotes['date'] = parsed_dates.dt.date.astype('str')

    # Convert the quoted close to an absolute value.
    # NOTE(review): the factor of 100 is kept from the original code; confirm
    # it matches the quote convention and face value of the source files.
    quotes['price'] = quotes['price'] * 100

    return quotes[['date', 'ticker', 'price']]

# Build one normalized frame per CSV export found in ``bonds_path``.
# - ``os.listdir`` returns names in arbitrary order, so they are sorted to keep
#   the row order of the combined dataset reproducible between runs.
# - The extension is matched as a true suffix, so names that merely contain
#   '.csv' (e.g. 'x.csv.bak') are not picked up.
bond_datasets = [
    _process_bond_dataset(filename)
    for filename in sorted(os.listdir(bonds_path))
    if filename.endswith('.csv')
]

### Check data quality

In [109]:
# Stack the per-instrument frames (stocks first, then bonds), key the result by
# (date, ticker) and discard rows that have no price.
all_datasets = [*stock_datasets, *bond_datasets]
prices_df = pd.concat(all_datasets).set_index(['date', 'ticker'])
prices_df = prices_df.dropna()

In [110]:
# Display the last rows of the combined frame as a quick sanity check.
prices_df.tail()

date,ticker,price
2023-12-25,SU26221RMFS0,7882.9
2023-12-26,SU26221RMFS0,7859.9
2023-12-27,SU26221RMFS0,7845.3
2023-12-28,SU26221RMFS0,7845.0
2023-12-29,SU26221RMFS0,7882.1


In [111]:
# (rows, columns) of the combined frame before incomplete dates are removed.
prices_df.shape

(14906, 1)

In [141]:
# Number of instruments of each kind that must be quoted on a date for that
# date to be kept.
N_STOCKS = 10
N_BONDS = 5
# Per-date flag: True when the count of (non-null) prices on that date equals
# the expected total number of instruments.
is_all_prices_on_date = prices_df.groupby('date').count().eq(N_STOCKS + N_BONDS)
# Collect the dates whose flag is False and remove them for every ticker.
dates_to_drop = is_all_prices_on_date.loc[~is_all_prices_on_date['price']].index.to_list()
prices_df = prices_df.drop(dates_to_drop)

In [142]:
# (rows, columns) of the frame after incomplete dates have been removed.
prices_df.shape

(14835, 1)

### Save dataset

In [144]:
# Persist the cleaned, (date, ticker)-indexed prices as a CSV file under
# DATA_PATH.
prices_csv_path = DATA_PATH / 'dwh/prices.csv'
prices_df.to_csv(prices_csv_path)