In [1]:
import pandas as pd
import numpy as np
import re
import math
import warnings
# Suppresses every warning for the rest of the kernel session, including
# diagnostics emitted by pandas and CatBoost in later cells.
warnings.filterwarnings("ignore")

import sys
# NOTE(review): absolute, machine-specific paths. The `vtb_scorekit` and
# `main` imports below presumably depend on them, so the notebook will not
# import on another machine without editing these two lines — confirm and
# consider a path relative to the repository root.
sys.path.append("C:/Users/vladi/Documents/stocks_entry_prediction")
sys.path.append("C:/Users/vladi/Documents/stocks_entry_prediction/vtb_scorekit")

from vtb_scorekit.data import DataSamples
from vtb_scorekit.model import LogisticRegressionModel

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

from main.calc_features import calc_features, prepare_columns, calc_targets
from main.constants import (
    FUTURES_NAME,
    SL_RUB,
    TP_RUB,
    TEST_SIZE,
    TRAIN_START,
    TRAIN_END,
    VAL_START,
    VAL_END,
    OOT_START,
    OOT_END
)

In [2]:
# Read the worksheet named by FUTURES_NAME from the source Excel workbook.
df = pd.read_excel(io="../src/stock_data.xlsx", sheet_name=FUTURES_NAME)

In [3]:
# Run the project's feature pipeline (main.calc_features). Judging by the
# previews further down, the result carries the engineered *_MA_mean / *_MA_std
# columns plus the TARGET_LONG / TARGET_SHORT labels.
df = calc_features(df, FUTURES_NAME, SL_RUB, TP_RUB)

In [4]:
# Remove the final row, then every row that still contains a NaN.
# reset_index() is called without drop=True, so the previous row labels are
# kept as a regular column named `index`.
# NOTE(review): the last row is presumably dropped because its target cannot
# be computed yet — confirm against calc_features.
df = df.drop(index=df.index[-1:]).dropna().reset_index()

In [5]:
# Only the long-side label is modelled in this notebook; discard the short one.
df = df.drop(columns="TARGET_SHORT")

In [6]:
# Preview the frame after removing TARGET_SHORT. The `index` column produced
# by reset_index() is still present at this point.
df.head()

Unnamed: 0,index,OPEN,HIGH,LOW,CLOSE,VOL,DATETIME,WEEK_DAY,HOUR,CANDLE_COLOR,...,END_CORRECTION_MA_std,END_CORRECTION_PERC_MA_mean,END_CORRECTION_PERC_MA_std,UPGOING_ACTIVE_IMPULSE_MA_mean,UPGOING_ACTIVE_IMPULSE_MA_std,DOWNGOING_ACTIVE_IMPULSE_MA_mean,DOWNGOING_ACTIVE_IMPULSE_MA_std,IS_SUPERACTIVE_IMPULSE_MA_mean,IS_SUPERACTIVE_IMPULSE_MA_std,TARGET_LONG
0,49,71063.0,71235.0,70978.0,71133.0,15591.0,2023-01-03 13:05:00,1,13,green,...,33.276,1.261,2.03,0.04,0.198,0.0,0.0,0.02,0.141,0
1,50,71135.0,71170.0,70993.0,71118.0,13342.0,2023-01-03 13:10:00,1,13,red,...,33.415,1.399,2.201,0.04,0.198,0.0,0.0,0.02,0.141,1
2,51,71114.0,71257.0,71087.0,71161.0,11886.0,2023-01-03 13:15:00,1,13,green,...,33.788,1.424,2.202,0.04,0.198,0.0,0.0,0.02,0.141,0
3,52,71161.0,71179.0,71106.0,71152.0,7354.0,2023-01-03 13:20:00,1,13,red,...,33.707,1.434,2.219,0.04,0.198,0.0,0.0,0.02,0.141,1
4,53,71158.0,71250.0,71130.0,71186.0,6147.0,2023-01-03 13:25:00,1,13,green,...,33.615,1.468,2.218,0.04,0.198,0.0,0.0,0.02,0.141,1


In [7]:
# Discard the leftover `index` column (old row labels kept by reset_index()).
df = df.drop(columns="index")

In [8]:
# Preview again to confirm the helper `index` column has been removed.
df.head()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VOL,DATETIME,WEEK_DAY,HOUR,CANDLE_COLOR,PIVOT,...,END_CORRECTION_MA_std,END_CORRECTION_PERC_MA_mean,END_CORRECTION_PERC_MA_std,UPGOING_ACTIVE_IMPULSE_MA_mean,UPGOING_ACTIVE_IMPULSE_MA_std,DOWNGOING_ACTIVE_IMPULSE_MA_mean,DOWNGOING_ACTIVE_IMPULSE_MA_std,IS_SUPERACTIVE_IMPULSE_MA_mean,IS_SUPERACTIVE_IMPULSE_MA_std,TARGET_LONG
0,71063.0,71235.0,70978.0,71133.0,15591.0,2023-01-03 13:05:00,1,13,green,71115.336,...,33.276,1.261,2.03,0.04,0.198,0.0,0.0,0.02,0.141,0
1,71135.0,71170.0,70993.0,71118.0,13342.0,2023-01-03 13:10:00,1,13,red,71093.664,...,33.415,1.399,2.201,0.04,0.198,0.0,0.0,0.02,0.141,1
2,71114.0,71257.0,71087.0,71161.0,11886.0,2023-01-03 13:15:00,1,13,green,71168.336,...,33.788,1.424,2.202,0.04,0.198,0.0,0.0,0.02,0.141,0
3,71161.0,71179.0,71106.0,71152.0,7354.0,2023-01-03 13:20:00,1,13,red,71145.664,...,33.707,1.434,2.219,0.04,0.198,0.0,0.0,0.02,0.141,1
4,71158.0,71250.0,71130.0,71186.0,6147.0,2023-01-03 13:25:00,1,13,green,71188.664,...,33.615,1.468,2.218,0.04,0.198,0.0,0.0,0.02,0.141,1


In [9]:
# Print the earliest and the latest timestamp available, one per line.
print(df["DATETIME"].min(), df["DATETIME"].max(), sep="\n")

2023-01-03 13:05:00
2024-08-30 23:40:00


In [10]:
# Encode the candle colour as a numeric flag: green -> 1, red -> 0.
# Any other value is left untouched.
for colour, flag in (("green", 1), ("red", 0)):
    df.loc[df["CANDLE_COLOR"] == colour, "CANDLE_COLOR"] = flag

In [11]:
# Time-based split into train / validation / out-of-time samples.
# Both bounds of every window are inclusive.
#
# Each sample is an explicit .copy(): later cells add columns to val_df, and
# writing into a bare slice of `df` is assignment-on-a-slice (pandas'
# SettingWithCopyWarning), which the global warnings filter would hide.
#
# NOTE(review): the windows are assumed not to overlap (TRAIN_END < VAL_START
# and VAL_END < OOT_START) — verify in main.constants.
train_df = df.loc[
    (df["DATETIME"] >= TRAIN_START) & (df["DATETIME"] <= TRAIN_END)
].copy()

val_df = df.loc[
    (df["DATETIME"] >= VAL_START) & (df["DATETIME"] <= VAL_END)
].copy()

df_oot = df.loc[
    (df["DATETIME"] >= OOT_START) & (df["DATETIME"] <= OOT_END)
].copy()

# Disabled alternative: train on everything strictly before OOT_START.
# train_df = df.loc[df["DATETIME"] < OOT_START]

In [12]:
# Separate the feature matrix from the TARGET_LONG label for each sample.
# The shuffled train_test_split variant (test_size=TEST_SIZE) is disabled;
# the date-based samples built above are used instead.
#
# NOTE(review): only TARGET_LONG is removed, so DATETIME and the raw
# OPEN/HIGH/LOW/CLOSE prices stay in the feature matrix — confirm that is
# intended.
x_train, y_train = train_df.drop(columns="TARGET_LONG"), train_df["TARGET_LONG"]
x_val, y_val = val_df.drop(columns="TARGET_LONG"), val_df["TARGET_LONG"]
x_oot, y_oot = df_oot.drop(columns="TARGET_LONG"), df_oot["TARGET_LONG"]

In [13]:
# Fit a CatBoost classifier with default hyper-parameters on the train sample.
# random_state fixes the seed so that re-runs give the same model.
# verbose=100 sets the logging period to 100 iterations: by default one line
# is printed per iteration, which floods the notebook output. The fitted
# model is not affected by this setting.
model = CatBoostClassifier(random_state=42, verbose=100)

# fit() is the last expression, so the cell displays the fitted estimator.
model.fit(x_train, y_train)

Learning rate set to 0.056882
0:	learn: 0.6843669	total: 164ms	remaining: 2m 43s
1:	learn: 0.6761331	total: 179ms	remaining: 1m 29s
2:	learn: 0.6694898	total: 195ms	remaining: 1m 4s
3:	learn: 0.6633151	total: 212ms	remaining: 52.8s
4:	learn: 0.6580389	total: 228ms	remaining: 45.4s
5:	learn: 0.6527209	total: 244ms	remaining: 40.3s
6:	learn: 0.6478687	total: 259ms	remaining: 36.8s
7:	learn: 0.6443348	total: 274ms	remaining: 33.9s
8:	learn: 0.6409559	total: 289ms	remaining: 31.8s
9:	learn: 0.6383923	total: 304ms	remaining: 30.1s
10:	learn: 0.6355737	total: 319ms	remaining: 28.7s
11:	learn: 0.6327416	total: 335ms	remaining: 27.5s
12:	learn: 0.6307030	total: 352ms	remaining: 26.7s
13:	learn: 0.6289914	total: 369ms	remaining: 26s
14:	learn: 0.6275704	total: 385ms	remaining: 25.3s
15:	learn: 0.6262539	total: 401ms	remaining: 24.7s
16:	learn: 0.6248881	total: 416ms	remaining: 24.1s
17:	learn: 0.6236571	total: 432ms	remaining: 23.6s
18:	learn: 0.6226932	total: 448ms	remaining: 23.1s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x1defd331850>

In [14]:
# Class probabilities for the validation sample. The result is written into
# the two columns prediction_0 / prediction_1 of val_df in the next cell.
res_val = model.predict_proba(x_val)

In [15]:
# Store the predicted probabilities next to the validation rows, one column
# per column of res_val.
proba_columns = ["prediction_0", "prediction_1"]
val_df[proba_columns] = res_val

In [16]:
# Inspect the validation rows with the two probability columns appended.
val_df.head()

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,VOL,DATETIME,WEEK_DAY,HOUR,CANDLE_COLOR,PIVOT,...,END_CORRECTION_PERC_MA_std,UPGOING_ACTIVE_IMPULSE_MA_mean,UPGOING_ACTIVE_IMPULSE_MA_std,DOWNGOING_ACTIVE_IMPULSE_MA_mean,DOWNGOING_ACTIVE_IMPULSE_MA_std,IS_SUPERACTIVE_IMPULSE_MA_mean,IS_SUPERACTIVE_IMPULSE_MA_std,TARGET_LONG,prediction_0,prediction_1
54676,94500.0,94503.0,94437.0,94467.0,5633.0,2024-04-01 09:00:00,0,9,0,94469.0,...,197948.341,0.06,0.24,0.04,0.198,0.06,0.24,0,0.946,0.054
54677,94467.0,94474.0,94363.0,94365.0,4957.0,2024-04-01 09:05:00,0,9,0,94400.664,...,197948.341,0.04,0.198,0.06,0.24,0.04,0.198,1,0.855,0.145
54678,94364.0,94428.0,94344.0,94408.0,4344.0,2024-04-01 09:10:00,0,9,1,94393.336,...,197948.343,0.04,0.198,0.06,0.24,0.04,0.198,0,0.851,0.149
54679,94408.0,94433.0,94365.0,94422.0,2955.0,2024-04-01 09:15:00,0,9,1,94406.664,...,197948.34,0.04,0.198,0.06,0.24,0.04,0.198,0,0.889,0.111
54680,94419.0,94465.0,94408.0,94434.0,1949.0,2024-04-01 09:20:00,0,9,1,94435.664,...,197948.344,0.04,0.198,0.06,0.24,0.04,0.198,0,0.88,0.12


In [17]:
# Rescale the class-1 probability to a 0-100 score (min-max normalisation
# over the validation sample).
# NOTE(review): the bounds come from this sample only; if max == min the
# division yields NaN/inf scores.
proba_long = val_df["prediction_1"]
proba_min = proba_long.min()
proba_max = proba_long.max()

val_df["score"] = (proba_long - proba_min) / (proba_max - proba_min) * 100

In [36]:
# Compute metrics on the validation sample for a grid of score cut-offs
# (1, 3, ..., 99). A row counts as a predicted "long" when score >= cut-off.
#
# res_df is a dict of equally long lists, one entry per cut-off:
#   "score cutoff" - the threshold
#   "precision"    - tp / (tp + fp)
#   "recall"       - tp / (tp + fn)
#   "tp"/"fp"/"fn" - raw confusion counts

# The label masks do not depend on the cut-off, so they are built once.
is_long = val_df["TARGET_LONG"] == 1
is_not_long = val_df["TARGET_LONG"] == 0

res_df = {
    "score cutoff": [],
    "precision": [],
    "recall": [],
    "tp": [],
    "fp": [],
    "fn": []
}

for cutoff in range(1, 100, 2):
    at_or_above = val_df["score"] >= cutoff
    below = val_df["score"] < cutoff

    tp = int((at_or_above & is_long).sum())
    fp = int((at_or_above & is_not_long).sum())
    fn = int((below & is_long).sum())

    # A cut-off that selects no rows (or a sample without positives) makes the
    # ratio undefined; record NaN instead of raising ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else float("nan")
    recall = tp / (tp + fn) if (tp + fn) else float("nan")

    res_df["score cutoff"].append(cutoff)
    res_df["precision"].append(precision)
    res_df["recall"].append(recall)
    res_df["tp"].append(tp)
    res_df["fp"].append(fp)
    res_df["fn"].append(fn)

In [37]:
# Turn the collected metrics into a table ordered from the highest precision
# to the lowest.
res_df = pd.DataFrame(res_df).sort_values(by="precision", ascending=False)

In [38]:
# Show the cut-offs with the highest precision (res_df is sorted by precision,
# descending). Read precision together with the tp / fp counts: at high
# cut-offs it can rest on very few flagged rows.
res_df.head()

Unnamed: 0,score cutoff,precision,recall,tp,fp,fn
48,97,0.654,0.005,17,9,3457
47,95,0.635,0.014,47,27,3427
46,93,0.534,0.02,70,61,3404
44,89,0.506,0.048,167,163,3307
45,91,0.502,0.033,115,114,3359
