In [18]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Folder with the raw competition files. The trailing slash is required
# because file names are appended to it directly.
data_dir = './data/raw/participants/'

# News articles and daily candles of the training period.
train_news = pd.read_csv(f"{data_dir}train_news.csv")
train_candles = pd.read_csv(f"{data_dir}train_candles.csv")

In [37]:
import pandas as pd

def make_ml_dataset(df: pd.DataFrame,
                    ts_cols: list,
                    single_cols: list,
                    window: int = 5
                   ) -> pd.DataFrame:
    """
    Flatten a time series into one row per sliding window.

    Rows are consumed in the order given; the function does not sort, so the
    caller is responsible for passing chronologically ordered data.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame. Must contain a ``begin`` column convertible by
        ``pd.to_datetime`` plus every column named in ``ts_cols`` and
        ``single_cols``. The caller's frame is not modified.
    ts_cols : list
        Columns to unroll. Each column ``c`` produces ``c1`` ... ``c{window}``,
        where ``c1`` is the oldest value in the window and ``c{window}`` the
        newest one.
    single_cols : list
        Columns copied once per window, taken from the window's last row.
        ``"begin"`` is skipped here because the ``begin`` output column
        already holds the window start.
    window : int, default 5
        Number of consecutive rows per window. Must be >= 1.

    Returns
    -------
    pd.DataFrame
        ``len(df) - window + 1`` rows (none if ``df`` is shorter than
        ``window``) with the unrolled columns, then ``begin`` / ``end``
        (earliest / latest timestamp inside the window), then the single
        columns. The index is reset to 0..n-1.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window!r}")

    df = df.reset_index(drop=True)
    stamps = pd.to_datetime(df["begin"])

    # Collect every output column first and build the frame in one shot,
    # instead of inserting columns one by one into an empty DataFrame.
    columns = {}

    # 1. Lagged copies of each time-series column (oldest lag first).
    for col in ts_cols:
        series = df[col]
        for j in range(window):
            columns[f"{col}{j+1}"] = series.shift(window - j - 1)

    # 2. Window start / end as a running min / max over the lagged timestamps.
    #    Staying in datetime dtype avoids a float64 round trip, which cannot
    #    represent every nanosecond value exactly.
    #    Rows without a full window compare against NaT, the comparison is
    #    False, and they become NaT; they are dropped in step 4.
    earliest = latest = stamps
    for lag in range(1, window):
        lagged = stamps.shift(lag)
        earliest = earliest.where(earliest <= lagged, lagged)
        latest = latest.where(latest >= lagged, lagged)
    columns["begin"] = earliest
    columns["end"] = latest

    # 3. One-off columns, taken from the right edge of the window.
    for col in single_cols:
        if col != "begin":
            columns[col] = df[col]

    out = pd.DataFrame(columns, index=df.index)

    # 4. Drop the first window-1 rows: they do not have a complete window.
    return out.iloc[window - 1:].reset_index(drop=True)

# Eyeball check of make_ml_dataset on a single ticker.
ticker = 'T'
mask = train_candles['ticker'].eq(ticker)
data = train_candles.loc[mask].copy()
print(data.head(3))

# With window=3 the first output row should combine the three rows printed above.
make_ml_dataset(
    data,
    window=3,
    ts_cols=['close', 'volume'],
    single_cols=['target_return_1d'],
).head(1)

         open   close    high     low  volume       begin ticker  \
20706  1344.0  1335.4  1355.2  1329.6  106126  2020-06-19      T   
20707  1336.0  1371.8  1378.4  1250.0  263697  2020-06-22      T   
20708  1380.0  1436.2  1456.8  1374.0  394282  2020-06-23      T   

       target_return_1d  target_direction_1d  target_return_20d  \
20706          0.027258                    1           0.289951   
20707          0.046946                    1           0.290276   
20708          0.014761                    1           0.222671   

       target_direction_20d  
20706                     1  
20707                     1  
20708                     1  


Unnamed: 0,close1,close2,close3,volume1,volume2,volume3,begin,end,target_return_1d
0,1335.4,1371.8,1436.2,106126.0,263697.0,394282,2020-06-19,2020-06-23,0.014761


In [38]:
def add_timeseries_features(data, window_size_features):
    """
    Add baseline price features to the candle history of a single ticker.

    ``data`` is modified in place and also returned. It must contain a
    ``close`` column. Four columns are written:

    * ``momentum`` - relative change of ``close`` over
      ``window_size_features`` rows; missing values become 0.
    * ``volatility`` - rolling standard deviation of the one-row relative
      change of ``close``; missing values become 0.01.
    * ``ma`` - rolling mean of ``close``; missing values fall back to
      ``close`` itself.
    * ``distance_from_ma`` - ``(close - ma) / ma`` computed before the ``ma``
      fallback is applied; missing values become 0.
    """
    close = data['close']
    span = window_size_features

    # Raw features; NaNs are filled below.
    momentum = close.pct_change(span)
    volatility = close.pct_change().rolling(span).std()
    moving_avg = close.rolling(span).mean()
    # Normalised distance uses the unfilled moving average on purpose.
    distance = (close - moving_avg) / moving_avg

    # Write the filled columns in the established order.
    data['momentum'] = momentum.fillna(0)
    data['volatility'] = volatility.fillna(0.01)
    data['ma'] = moving_avg.fillna(close)
    data['distance_from_ma'] = distance.fillna(0)
    return data

In [125]:
import os
from tqdm import tqdm

def _index_news_by_day(df_news):
    """Group news titles and publications by the calendar day of ``publish_date``."""
    publish_days = pd.to_datetime(df_news['publish_date']).dt.date
    titles_by_day = {}
    publications_by_day = {}
    for day, title, publication in zip(publish_days, df_news['title'], df_news['publication']):
        if pd.isna(day):
            # Rows without a publish date can never match a lookup day.
            continue
        titles_by_day.setdefault(day, []).append(title)
        publications_by_day.setdefault(day, []).append(publication)
    return titles_by_day, publications_by_day


def process_data(
        df,
        df_news,
        save_dir,
        prefix='',
        window_size_features=5,
        window_size_ravel=5
    ):
    """
    Build the flattened per-ticker datasets and write them to ``save_dir``.

    For every ticker in ``df`` the candles are sorted by ``begin``, enriched
    with ``add_timeseries_features``, unrolled into sliding windows with
    ``make_ml_dataset`` and joined with the news published on the calendar
    day before each window's ``end``. One ``{prefix}{ticker}.csv`` is written
    per ticker, plus ``{prefix}total.csv`` with all tickers concatenated.

    Parameters
    ----------
    df : pd.DataFrame
        Candles with at least ``ticker``, ``begin``, ``open``, ``close``,
        ``high``, ``low``, ``volume`` and the four ``target_*`` columns.
        Not modified.
    df_news : pd.DataFrame
        News with ``publish_date``, ``title`` and ``publication`` columns.
        Not modified.
    save_dir : str
        Output directory; created if it does not exist.
    prefix : str, default ''
        Prefix of every written file name.
    window_size_features : int, default 5
        Window passed to ``add_timeseries_features``.
    window_size_ravel : int, default 5
        Window passed to ``make_ml_dataset``.

    Raises
    ------
    ValueError
        If ``df`` contains no tickers, so there is nothing to concatenate.
    """
    base_ts_cols = ["open", "close", "high", "low", "volume"]

    single_cols = ["begin", "ticker", "target_return_1d", "target_direction_1d",
                   "target_return_20d", "target_direction_20d"]

    os.makedirs(save_dir, exist_ok=True)

    # The news lookup depends only on the date, so index the news once
    # instead of rescanning the whole news frame for every output row.
    titles_by_day, publications_by_day = _index_news_by_day(df_news)
    one_day = pd.Timedelta(days=1)

    results = []
    for ticker, group in tqdm(df.groupby(by='ticker')):
        # ================================================================
        # 1. Preprocessing before adding features
        # ================================================================
        # Work on a copy so the caller's frame is not modified.
        data = group.copy()
        data['begin'] = pd.to_datetime(data['begin'])
        # sort_values returns a new frame; the result has to be kept.
        data = data.sort_values(by='begin', kind='mergesort')

        # ================================================================
        # 2. Add features before flattening
        # ================================================================
        cols_before = set(data.columns)
        data = add_timeseries_features(data, window_size_features)
        new_cols = [col for col in data.columns if col not in cols_before]
        # Rebuilt per ticker so the list does not grow across iterations.
        ts_cols = base_ts_cols + new_cols

        # ================================================================
        # 3. Flatten into sliding windows
        # ================================================================
        data = make_ml_dataset(
            data,
            ts_cols=ts_cols,
            single_cols=single_cols,
            window=window_size_ravel,
        )

        # ================================================================
        # 4. Attach news
        # ================================================================
        # News of day t-1 relative to the window end, as required.
        # TODO: add filtering by ticker; currently all news of the day are attached.
        news_days = [day - one_day for day in data['end'].dt.date]
        data['titles'] = pd.Series(
            [list(titles_by_day.get(day, ())) for day in news_days],
            index=data.index, dtype=object,
        )
        data['publications'] = pd.Series(
            [list(publications_by_day.get(day, ())) for day in news_days],
            index=data.index, dtype=object,
        )

        # ================================================================
        # 5. Save the ticker to its own file
        # ================================================================
        # os.path.join keeps the file inside save_dir even without a trailing slash.
        data.to_csv(os.path.join(save_dir, f"{prefix}{ticker}.csv"))

        # ================================================================
        # 6. Collect for the combined file
        # ================================================================
        # The 'ticker' column is already present: it is one of single_cols.
        results.append(data)

    if not results:
        raise ValueError("process_data: the candles frame contains no tickers")

    total = pd.concat(results, axis=0)
    total.to_csv(os.path.join(save_dir, f'{prefix}total.csv'))



In [126]:
# Build the processed training set: per-ticker CSVs plus total.csv
# under ./data/processed/train/.
process_data(
    train_candles,
    train_news,
    './data/processed/train/',
    window_size_ravel=5,
    window_size_features=5,
)

100%|██████████| 19/19 [01:25<00:00,  4.49s/it]
