<a href="https://colab.research.google.com/github/velblu/ML-datasets/blob/master/new_start.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Prerequisitions
Importing right packages..

In [2]:
!python --version

Python 3.9.16


In [3]:
!pip install icecream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting icecream
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting asttokens>=2.0.1
  Downloading asttokens-2.2.1-py2.py3-none-any.whl (26 kB)
Collecting executing>=0.3.1
  Downloading executing-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting colorama>=0.3.9
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: executing, colorama, asttokens, icecream
Successfully installed asttokens-2.2.1 colorama-0.4.6 executing-1.2.0 icecream-2.1.3


In [4]:
# basics
import pandas
import pandas as pd
import numpy as np
import re
import pickle
from tqdm import tqdm # aby progress bar był dla pętli
# from magic-config import Config # do tworzenia config'ów (colab pro)
from typing import Tuple
from pydantic import BaseSettings, Field
from dataclasses import dataclass

# debug, loggin
from icecream import ic

# datetime
import datetime as dt
from dateutil import parser
from dateutil.parser import parse

#EDA reports
from pandas_profiling import ProfileReport
# import sweetviz as sv
# from autoviz.AutoViz_Class import AutoViz_Class

# sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.decomposition import PCA

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, f1_score, precision_score, \
                            recall_score
from sklearn.model_selection import GridSearchCV, train_test_split, \
                                    KFold, StratifiedKFold

#xgboost
from xgboost import XGBClassifier

#hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, partial

# drawings
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

.. and mounting drive.. 

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


.. and global configuration settings

In [6]:
class GlobalSettings(BaseSettings):
    log_level: str = "INFO"
    in_folder: str = 'drive/My Drive/blueOracle/'
    out_folder: str = 'drive/My Drive/blueOracle/out/'

    intervals: list = ['M15', 'M30', 'H01', 'D01', 'W01', 'M01']
    no_session_days = ['2015-01-01', '2015-01-06', '2015-04-03', '2015-04-06', '2015-05-01', '2015-06-04', '2015-11-11', '2015-12-24', '2015-12-25', '2015-12-31', 
                      '2016-01-01', '2016-01-06', '2016-03-25', '2016-03-28', '2016-05-03', '2016-05-26', '2016-08-15', '2016-11-01', '2016-11-11', '2016-12-26', 
                      '2017-01-06', '2017-04-14', '2017-04-17', '2017-05-01', '2017-05-03', '2017-06-15', '2017-08-15', '2017-11-01', '2017-12-25', '2017-12-26', 
                      '2018-01-01', '2018-01-02', '2018-03-30', '2018-04-02', '2018-05-01', '2018-05-03', '2018-05-31', '2018-08-15', '2018-11-01',  '2018-11-12', '2018-12-24', '2018-12-25', '2018-12-26', '2018-12-31', 
                      '2019-01-01', '2019-04-19', '2019-04-22', '2019-05-01', '2019-05-03', '2019-06-20', '2019-08-15', '2019-11-01', '2019-11-11', '2019-12-24', '2019-12-25', '2019-12-26', '2019-12-31', 
                      '2020-01-01', '2020-01-06', '2020-04-10', '2020-04-13', '2020-05-01', '2020-06-11', '2020-11-11', '2020-12-24', '2020-12-25', '2020-12-31', 
                      '2021-01-01', '2021-01-06', '2021-04-02', '2021-04-05', '2021-05-03', '2021-06-03', '2021-11-01', '2021-11-11', '2021-12-24', '2021-12-31', 
                      '2022-01-06', '2022-04-15', '2022-04-18', '2022-05-03', '2022-06-16', '2022-08-15', '2022-11-01', '2022-11-11', '2022-12-26', 
                      '2022-01-06', '2023-04-07', '2023-04-10', '2023-05-01', '2023-05-03', '2023-06-08', '2023-08-15', '2023-11-01', '2023-12-25', '2023-12-26']

settings = GlobalSettings()

#2. Preparing dataset
Lecimy w pewnej określonej kolejności, tzn utworzę funkcje (testując je na CDR),
a później utworzę funkcję która zaciągnie więcej danych (z innych indektów)


In [7]:
# downloading file with data 
df = pd.read_csv(settings.out_folder + 'asset_df_after_examination.csv', 
                       sep=';')

In [8]:
# 
interval = 'D01'
cols = ['D01', 'D01open', 'D01high', 'D01low', 'D01close',  'D01volume', 'D01_peak_fala', 'D01_auto_fala' ]
print(interval, cols)

df = df[cols]
df.info()

D01 ['D01', 'D01open', 'D01high', 'D01low', 'D01close', 'D01volume', 'D01_peak_fala', 'D01_auto_fala']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49632 entries, 0 to 49631
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   D01            49632 non-null  object 
 1   D01open        49632 non-null  float64
 2   D01high        49632 non-null  float64
 3   D01low         49632 non-null  float64
 4   D01close       49632 non-null  float64
 5   D01volume      49632 non-null  float64
 6   D01_peak_fala  49467 non-null  float64
 7   D01_auto_fala  49500 non-null  float64
dtypes: float64(7), object(1)
memory usage: 3.0+ MB


In [9]:
df.drop_duplicates(inplace=True, 
                   ignore_index=True)
df.head(10)

Unnamed: 0,D01,D01open,D01high,D01low,D01close,D01volume,D01_peak_fala,D01_auto_fala
0,2015-04-24,20.5,20.68,19.65,20.0,238045.0,,
1,2015-04-27,20.0,20.29,19.6,19.76,351917.0,,-2.0
2,2015-04-28,19.88,20.91,19.7,20.65,212433.0,1.0,1.0
3,2015-04-29,21.0,21.11,20.4,20.48,93365.0,2.0,1.0
4,2015-04-30,20.69,20.69,20.01,20.69,93782.0,-1.0,1.0
5,2015-05-04,20.75,21.11,20.75,21.0,125221.0,-1.0,1.0
6,2015-05-05,21.0,21.03,20.65,20.75,101422.0,-1.0,2.0
7,2015-05-06,20.8,20.92,20.4,20.65,141332.0,-1.0,-1.0
8,2015-05-07,20.51,20.6,20.3,20.45,93047.0,-2.0,-2.0
9,2015-05-08,20.6,21.0,20.45,20.74,186063.0,1.0,1.0


In [10]:
df.rename(columns={'D01open':'open',
                   'D01high':'high',
                   'D01low':'low',
                   'D01close':'close',
                   'D01volume':'volume',
                   'D01_peak_fala':'peak_fala',
                   'D01_auto_fala':'auto_fala',
                   'D01': 'interval'},
          inplace=True)

In [11]:
cols = ['open', 'high', 'low', 'close']
for col in cols:
  df[col] = df[col].mul(100).astype(int)

In [12]:
df['SecuritiesCode'] = 'CDR'
df

Unnamed: 0,interval,open,high,low,close,volume,peak_fala,auto_fala,SecuritiesCode
0,2015-04-24,2050,2068,1964,2000,238045.0,,,CDR
1,2015-04-27,2000,2029,1960,1976,351917.0,,-2.0,CDR
2,2015-04-28,1988,2091,1970,2065,212433.0,1.0,1.0,CDR
3,2015-04-29,2100,2111,2039,2048,93365.0,2.0,1.0,CDR
4,2015-04-30,2069,2069,2001,2069,93782.0,-1.0,1.0,CDR
...,...,...,...,...,...,...,...,...,...
1499,2021-04-26,17348,17390,16510,16898,689756.0,-2.0,-1.0,CDR
1500,2021-04-27,16600,17334,16600,16830,614501.0,1.0,-2.0,CDR
1501,2021-04-28,17000,17470,16860,17250,666270.0,,,CDR
1502,2021-04-29,17500,17700,17060,17524,524520.0,,,CDR


In [13]:
class PreviousPeriodsJoiner(BaseEstimator, TransformerMixin):
    """

    Transformer which adds ohlcv data from previous periods. 

    :param periods_num: number of previous periods to add to the set
    :param ohlcv_col: OHLCV choice to add
    """
    periods_num: int
    ohlcv_col: str

    def __init__(self, periods_num: int = 30, ohlcv_col: str = 'Close'):
        self.periods_num = periods_num
        self.ohlcv_col = ohlcv_col

    def fit(self, X: pandas.DataFrame, y=None):
        """
        Doing nothing.. 

        :param X: Dataset to work on
        :param y: Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`.
        :return: Fitted transformer.
        """
        return self

    def transform(self, X: pandas.DataFrame, y=None) -> pandas.DataFrame:
        """
        Adding values from previous periods.

        :param X: Dataset to work on
        :param y: Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`.
        :return: Input with dropped columns.
        """
        
        if self.periods_num > 0: 
          X_out = X.copy()

          #index of a ohlcv column
          index_no = X_out.columns.get_loc(self.ohlcv_col) 
          
          for period_num in range(self.periods_num, -1, -1):
            # name of new column
            col_name = self.ohlcv_col + '_' + str(period_num) 
            # values of new column
            col_to_add = X_out \
                  .groupby('SecuritiesCode')[self.ohlcv_col] \
                  .shift(period_num)
            # inserting new column                  
            X_out.insert(index_no, col_name, col_to_add)

            index_no += 1 # index for next new column has to change by 1
          
          return X_out    
        
        else:
          return X           

In [14]:
test_data = df.copy()
test_transformer = PreviousPeriodsJoiner(periods_num=5, ohlcv_col='high')
test_transformer = test_transformer.fit(X=test_data)
test_data = test_transformer.transform(X=test_data)
print(test_data.info())
print(test_data[['high_5', 'high_4', 'high_3', 'high_2','high_1','high_0', 'high']].loc[6])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1504 entries, 0 to 1503
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   interval        1504 non-null   object 
 1   open            1504 non-null   int64  
 2   high_5          1499 non-null   float64
 3   high_4          1500 non-null   float64
 4   high_3          1501 non-null   float64
 5   high_2          1502 non-null   float64
 6   high_1          1503 non-null   float64
 7   high_0          1504 non-null   int64  
 8   high            1504 non-null   int64  
 9   low             1504 non-null   int64  
 10  close           1504 non-null   int64  
 11  volume          1504 non-null   float64
 12  peak_fala       1499 non-null   float64
 13  auto_fala       1500 non-null   float64
 14  SecuritiesCode  1504 non-null   object 
dtypes: float64(8), int64(5), object(2)
memory usage: 176.4+ KB
None
high_5    2029.0
high_4    2091.0
high_3    2111.0
high_2  

In [17]:
# data preparation pipeline
data_preparation_pipeline = Pipeline(steps=[
    ('add_prev_high', PreviousPeriodsJoiner(periods_num=2, ohlcv_col='high')),
    ('add_prev_low', PreviousPeriodsJoiner(periods_num=2, ohlcv_col='low')),
    ('add_prev_open', PreviousPeriodsJoiner(periods_num=2, ohlcv_col='open')),
    ('add_prev_close', PreviousPeriodsJoiner(periods_num=2, ohlcv_col='close')),
    ('add_prev_volume', PreviousPeriodsJoiner(periods_num=2, ohlcv_col='volume'))
])

In [18]:
%%time
# .. and testing.. 
data = data_preparation_pipeline.fit_transform(X=df)
data.head(10)

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 29.6 ms


Unnamed: 0,interval,open_2,open_1,open_0,open,high_2,high_1,high_0,high,low_2,...,close_1,close_0,close,volume_2,volume_1,volume_0,volume,peak_fala,auto_fala,SecuritiesCode
0,2015-04-24,,,2050,2050,,,2068,2068,,...,,2000,2000,,,238045.0,238045.0,,,CDR
1,2015-04-27,,2050.0,2000,2000,,2068.0,2029,2029,,...,2000.0,1976,1976,,238045.0,351917.0,351917.0,,-2.0,CDR
2,2015-04-28,2050.0,2000.0,1988,1988,2068.0,2029.0,2091,2091,1964.0,...,1976.0,2065,2065,238045.0,351917.0,212433.0,212433.0,1.0,1.0,CDR
3,2015-04-29,2000.0,1988.0,2100,2100,2029.0,2091.0,2111,2111,1960.0,...,2065.0,2048,2048,351917.0,212433.0,93365.0,93365.0,2.0,1.0,CDR
4,2015-04-30,1988.0,2100.0,2069,2069,2091.0,2111.0,2069,2069,1970.0,...,2048.0,2069,2069,212433.0,93365.0,93782.0,93782.0,-1.0,1.0,CDR
5,2015-05-04,2100.0,2069.0,2075,2075,2111.0,2069.0,2111,2111,2039.0,...,2069.0,2100,2100,93365.0,93782.0,125221.0,125221.0,-1.0,1.0,CDR
6,2015-05-05,2069.0,2075.0,2100,2100,2069.0,2111.0,2103,2103,2001.0,...,2100.0,2075,2075,93782.0,125221.0,101422.0,101422.0,-1.0,2.0,CDR
7,2015-05-06,2075.0,2100.0,2080,2080,2111.0,2103.0,2092,2092,2075.0,...,2075.0,2065,2065,125221.0,101422.0,141332.0,141332.0,-1.0,-1.0,CDR
8,2015-05-07,2100.0,2080.0,2051,2051,2103.0,2092.0,2060,2060,2065.0,...,2065.0,2045,2045,101422.0,141332.0,93047.0,93047.0,-2.0,-2.0,CDR
9,2015-05-08,2080.0,2051.0,2060,2060,2092.0,2060.0,2100,2100,2039.0,...,2045.0,2074,2074,141332.0,93047.0,186063.0,186063.0,1.0,1.0,CDR


In [28]:
X_out: pd.DataFrame = data.copy()

# określam budowę świec - bieżącej i poprzedzającej 
candles: pd.DataFrame = X_out['interval'].to_frame().copy()

for period_sufix in ['_1', '_0']:
  candles['cien_gorny'+period_sufix] = X_out['high'+period_sufix] 
  candles['korpus_max'+period_sufix] = np.where(
                    X_out['open'+period_sufix] > X_out['close'+period_sufix], 
                    X_out['open'+period_sufix], 
                    X_out['close'+period_sufix])
  candles['korpus_min'+period_sufix] = np.where(
                    X_out['open'+period_sufix] > X_out['close'+period_sufix], 
                    X_out['close'+period_sufix], 
                    X_out['open'+period_sufix])
  candles['cien_dolny'+period_sufix] = X_out['low'+period_sufix] 
candles.head()

Unnamed: 0,interval,cien_gorny_0,korpus_max_0,korpus_min_0,cien_dolny_0,cien_gorny_1,korpus_max_1,korpus_min_1,cien_dolny_1
0,2015-04-24,2068,2050,2000,1964,,,,
1,2015-04-27,2029,2000,1976,1960,2068.0,2050.0,2000.0,1964.0
2,2015-04-28,2091,2065,1988,1970,2029.0,2000.0,1976.0,1960.0
3,2015-04-29,2111,2100,2048,2039,2091.0,2065.0,1988.0,1970.0
4,2015-04-30,2069,2069,2069,2001,2111.0,2100.0,2048.0,2039.0


In [30]:
# Reguła  korpusów 
# wzrostowa: Jeśli w fali wzrostowej maksima korpusów są coraz wyżej 
# to fala się nie zmienia. 
candles['wzrost_regula_cieni'] = np.where(
                            candles['korpus_max_1'] < candles['korpus_max_0'],
                            True,
                            False)

# spadkowa: Jeśli w fali spadkowej minima korpusów są coraz niżej
# to fala się nie zmienia. 

candles['spadek_regula_cieni'] = np.where(
                            candles['korpus_min_1'] > candles['korpus_min_0'],
                            True,
                            False)
candles.head()

Unnamed: 0,interval,cien_gorny_0,korpus_max_0,korpus_min_0,cien_dolny_0,cien_gorny_1,korpus_max_1,korpus_min_1,cien_dolny_1,regula_cieni_wzrost,regula_cieni_spadek
0,2015-04-24,2068,2050,2000,1964,,,,,False,False
1,2015-04-27,2029,2000,1976,1960,2068.0,2050.0,2000.0,1964.0,False,True
2,2015-04-28,2091,2065,1988,1970,2029.0,2000.0,1976.0,1960.0,True,False
3,2015-04-29,2111,2100,2048,2039,2091.0,2065.0,1988.0,1970.0,True,False
4,2015-04-30,2069,2069,2069,2001,2111.0,2100.0,2048.0,2039.0,False,False


In [1]:
class WavesSetter(BaseEstimator, TransformerMixin):
    """
    Transformer określający fale średniego rzędu.

    Weryfikacja bieżącej i poprzedzającej świecy w celu ustalenia
    czy nastąpiła zmiana fali. 

    :param ma_size: Size of SMA and EMA
    :param ohlcv_col: Choice from OHLCV value
    """
    ohlcv_col: str
    ma_size: int

    def __init__(self):
        pass

    def fit(self, X: pandas.DataFrame, y=None):
        """
        Nic tutaj nie robię.. 

        :param X: Dataset to work on
        :param y: Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`.
        :return: Fitted transformer.
        """
        return self

    def transform(self, X: pandas.DataFrame, y=None) -> pandas.DataFrame:
        """
        Weryfikacja kolejnych reguł oraz określenie na ich podstawie fal. 

        :param X: Dataset to work on
        :param y: Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`.
        :return: Input with added column.
        """
        X_out: pd.DataFrame = X.copy()
        
        # określam budowę świec - bieżącej i poprzedzającej 
        candels: pd.DataFrame = X_out['interval'].copy()

        for period_sufix in ['_0', '_1']:
          candels['cien_gorny'+period_sufix] = X_out['high'+period_sufix] 
          candels['korpus_max'+period_sufix] = \
              np.where(X_out['open'+period_sufix] > X_out['close'+period_sufix], 
              X_out['open'+period_sufix], X_out['close'+period_sufix])
          candels['korpus_min'+period_sufix] = \
              np.where(X_out['open'+period_sufix] > X_out['close'+period_sufix], 
              X_out['close'+period_sufix], X_out['open'+period_sufix])
          candels['cien_dolny'+period_sufix] = X_out['low'+period_sufix] 



        # reguła korpusów 

        relation = pd.Series(None, dtype='float64')

        # choosing right columns - only choosen OHLCV from periods < ma_size
        cols = [column for column in X_out.columns 
                  if re.search(r''+re.escape(self.ohlcv_col)+'-', column) is not None]
        cols = [column for column in cols 
                  if int(re.search(r'(?<='+re.escape(self.ohlcv_col)+'-)\d+', column).group()) < self.ma_size]

        # calculating ema for Volume
        col_zero_name: str = self.ohlcv_col + '-0'

        ema = X_out[cols] \
              .transform(lambda row: row.ewm(self.ma_size, axis=1).mean())[col_zero_name] \
              .rename('ema')
        
        sma = X_out[cols] \
              .transform(lambda row: row.rolling(self.ma_size, axis=1).mean())[col_zero_name] \
              .rename('sma')

        relation = (ema - sma) / ema
        relation.rename(f'{self.ohlcv_col}_SMA{self.ma_size}vsEWM{self.ma_size}',
                             inplace=True)

        return X_out.join(relation)

NameError: ignored

In [42]:
def regula_korpusow_wzrostowa(swieca_1: dict,
                              swieca_0: dict) -> bool:

  """
  Reguła  korpusów - wzrostowa: Jeśli w fali wzrostowej maksima korpusów 
  są coraz wyżej to fala się nie zmienia. 

  :param swieca_0: Bieżąca świeca - jej ohlc w słowniku
  :param swieca_0: Poprzedzająca świeca - jej ohlc w słowniku
  :return: Fitted transformer.
  """                          

  return upBody_1 < upBody_0

In [None]:
def regula_korpusow_spadkowa(high_1: float,   #cien_gorny
                            upBody_1: float,  #korpus_max
                            lowBody_1: float, #korpus_min
                            low_1: float,     #cien_dolny
                            high_0: float,
                            upBody_0: float, 
                            lowBody_0: float, 
                            low_0: float) -> bool:

  """
  Reguła  korpusów - spadkowa: Jeśli w fali wzrostowej minima korpusów 
  są coraz niżej to fala się nie zmienia. 

  :param X: Dataset to work on
  :param y: Ignored. This parameter exists only for compatibility with :class:`~sklearn.pipeline.Pipeline`.
  :return: Fitted transformer.
  """                          

  return upBody_1 < upBody_0

In [15]:
p = 0 < 1
p

True

In [38]:
data['regula_cieni_w'] = 

Unnamed: 0,D01,D01open-30,D01open-29,D01open-28,D01open-27,D01open-26,D01open-25,D01open-24,D01open-23,D01open-22,...,D01volume-5,D01volume-4,D01volume-3,D01volume-2,D01volume-1,D01volume-0,D01volume,D01_peak_fala,D01_auto_fala,SecuritiesCode
0,2015-04-24,,,,,,,,,,...,,,,,,238045.0,238045.0,,,CDR
1,2015-04-27,,,,,,,,,,...,,,,,238045.0,351917.0,351917.0,,-2.0,CDR
2,2015-04-28,,,,,,,,,,...,,,,238045.0,351917.0,212433.0,212433.0,1.0,1.0,CDR
3,2015-04-29,,,,,,,,,,...,,,238045.0,351917.0,212433.0,93365.0,93365.0,2.0,1.0,CDR
4,2015-04-30,,,,,,,,,,...,,238045.0,351917.0,212433.0,93365.0,93782.0,93782.0,-1.0,1.0,CDR
5,2015-05-04,,,,,,,,,,...,238045.0,351917.0,212433.0,93365.0,93782.0,125221.0,125221.0,-1.0,1.0,CDR
6,2015-05-05,,,,,,,,,,...,351917.0,212433.0,93365.0,93782.0,125221.0,101422.0,101422.0,-1.0,2.0,CDR
7,2015-05-06,,,,,,,,,,...,212433.0,93365.0,93782.0,125221.0,101422.0,141332.0,141332.0,-1.0,-1.0,CDR
8,2015-05-07,,,,,,,,,,...,93365.0,93782.0,125221.0,101422.0,141332.0,93047.0,93047.0,-2.0,-2.0,CDR
9,2015-05-08,,,,,,,,,,...,93782.0,125221.0,101422.0,141332.0,93047.0,186063.0,186063.0,1.0,1.0,CDR
