<a href="https://colab.research.google.com/github/velblu/ML-datasets/blob/master/data_examination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Prerequisites
Importing the required packages..

In [None]:
# IPython shell escape: print the Python version of the notebook runtime.
!python --version

Python 3.8.10


In [None]:
# basics
import pandas
import pandas as pd
import numpy as np
import re
import pickle
from tqdm import tqdm # aby progress bar był dla pętli
# from magic-config import Config # do tworzenia config'ów (colab pro)
from typing import Tuple
from pydantic import BaseSettings, Field

# datetime
import datetime as dt
from dateutil import parser
from dateutil.parser import parse

#EDA reports
from pandas_profiling import ProfileReport
# import sweetviz as sv
# from autoviz.AutoViz_Class import AutoViz_Class

# sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.decomposition import PCA

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, \
                            roc_auc_score, f1_score, precision_score, \
                            recall_score
from sklearn.model_selection import GridSearchCV, train_test_split, \
                                    KFold, StratifiedKFold

#xgboost
from xgboost import XGBClassifier

#hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, partial

# drawings
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

.. and mounting drive.. 

In [None]:
# Colab-only step: mount Google Drive at /content/drive/ so the notebook can
# reach files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


.. and global configuration settings

In [None]:
class GlobalSettings(BaseSettings):
    """Notebook-wide configuration: log level, data folders and column naming.

    Every field is declared with an explicit type annotation so pydantic
    registers it as a regular settings field instead of inferring it from
    the default value.
    """
    log_level: str = "INFO"
    # folders are relative to the notebook's working directory
    in_folder: str = 'drive/My Drive/blueOracle/'
    out_folder: str = 'drive/My Drive/blueOracle/out/'
    # interval codes: used as column-name prefixes and as the index keys
    intervals: list = ['M01','W01','D01','H01','M30','M15']
    # OHLCV column-name suffixes; the order (open, high, close, low, volume)
    # must match the order of the value lists stored in AddData
    ohlcv_cols: list = ['open', 'high', 'close', 'low', 'volume']
settings = GlobalSettings()

# 2. Adding missing data
Na początku dodatkowe dane :) 

In [None]:
class AddData(BaseSettings):
  # Hand-entered OHLCV values used to fill gaps in the loaded dataset.
  # Each entry is [timestamp, [values...]]; `intervals_data` pairs an interval
  # code with the list of entries that should be written for it.

  # which intervals are patched for this symbol, and their data

  # value order: open, high, close, low, volume
  d01 = [['2021-04-28', [170, 174.7, 172.5, 168.6, 666270]],
        ['2021-04-29', [175, 177, 175.24, 170.60, 524520]],
        ['2021-04-30', [175.76, 179.74, 174.12, 174, 480140]]]

  # H01 timestamps with empty value lists (placeholders only); h01 is not
  # referenced by `intervals_data` below, so nothing is written for H01
  h01 = [['2021-02-08 10:00:00', []],
         ['2021-02-08 11:00:00', []], 
         ['2021-02-08 12:00:00', []], 
         ['2021-02-08 13:00:00', []],
         ['2021-02-08 14:00:00', []], 
         ['2021-02-08 15:00:00', []], 
         ['2021-02-08 16:00:00', []], 
         ['2021-02-08 17:00:00', []],

         ['2021-04-29 09:00:00', []], 
         ['2021-04-29 10:00:00', []], 
         ['2021-04-29 11:00:00', []], 
         ['2021-04-29 12:00:00', []], 
         ['2021-04-29 13:00:00', []], 
         ['2021-04-29 14:00:00', []], 
         ['2021-04-29 15:00:00', []], 
         ['2021-04-29 16:00:00', []], 
         ['2021-04-29 17:00:00', []],

         ['2021-04-30 09:00:00', []], 
         ['2021-04-30 10:00:00', []], 
         ['2021-04-30 11:00:00', []], 
         ['2021-04-30 12:00:00', []], 
         ['2021-04-30 13:00:00', []], 
         ['2021-04-30 14:00:00', []], 
         ['2021-04-30 15:00:00', []],
         ['2021-04-30 16:00:00', []], 
         ['2021-04-30 17:00:00', []]]

  # (interval code, entries) pairs that are actually applied; only D01 here
  intervals_data = [('D01', d01)]

add_data = AddData()

In [None]:
# Load the asset dataset (semicolon-separated CSV) from the output folder.
df_all = pd.read_csv(
    filepath_or_buffer=settings.out_folder + 'asset_df.csv',
    sep=';',
)

# 3. Weryfikacja danych

In [None]:
# df_all.set_index(keys=settings.intervals, inplace=True)
# this is done later - after checking the row counts for the OHLCV columns

Sprawdzenie typów danych

In [None]:
# For each interval, list the columns whose name starts with the interval
# code and print the dtype / non-null summary of just those columns.
for interval in settings.intervals:
  cols = df_all.columns[df_all.columns.str.startswith(interval)].tolist()
  print(interval, cols)

  df_all[cols].info()

M01 ['M01', 'M01open', 'M01high', 'M01low', 'M01close', 'M01volume', 'M01_peak_fala', 'M01_auto_fala', 'M01_SzfBear_EveningStar_szf', 'M01_SzfBear_EveningStar_negated', 'M01_SzfBear_EveningStar_formationMin', 'M01_SzfBear_EngulfingTop_szf', 'M01_SzfBear_EngulfingTop_negated', 'M01_SzfBear_EngulfingTop_formationMin', 'M01_SzfBear_BearishEngulfing_szf', 'M01_SzfBear_BearishEngulfing_negated', 'M01_SzfBear_BearishEngulfing_formationMin', 'M01_SzfBear_DarkCloudCover_szf', 'M01_SzfBear_DarkCloudCover_negated', 'M01_SzfBear_DarkCloudCover_formationMin', 'M01_SzfBear_ShootingStar_szf', 'M01_SzfBear_ShootingStar_negated', 'M01_SzfBear_ShootingStar_formationMin', 'M01_SzfBear_Harami_szf', 'M01_SzfBear_Harami_negated', 'M01_SzfBear_Harami_formationMin', 'M01_SzfBear_InvertedHammer_szf', 'M01_SzfBear_InvertedHammer_negated', 'M01_SzfBear_InvertedHammer_formationMin', 'M01_SzfBear_Hammer_szf', 'M01_SzfBear_Hammer_negated', 'M01_SzfBear_Hammer_formationMin', 'M01_SzfBear_HangingMan_szf', 'M01_SzfBe

Dzielę zatem zestaw danych na 3 części: 
1. OHLCV ze wszystkimi elementami
2. fale 
3. Sygnały Zmiany Fali, podzielone znowu na dwie części: Bear i Bull. 

## 3.1 OHLCV

### 3.1.1 Braki

Czyli sprawdzam braki w danych

In [None]:
# Find the gaps: for each interval, collect the distinct timestamps whose
# "<interval>open" value is missing, then print their count and the list.
for interval in settings.intervals:
  df = df_all[[interval, interval+'open']].copy()

  empty_fields_list = (
      df.loc[df[interval+'open'].isna(), interval]
        .drop_duplicates()
        .to_list()
  )

  print(f'Interwał: {interval}, ilość: {len(empty_fields_list)}')
  print(f'Daty: {empty_fields_list}')
  print()

Interwał: M01, ilość: 0
Daty: []

Interwał: W01, ilość: 0
Daty: []

Interwał: D01, ilość: 3
Daty: ['2021-04-28', '2021-04-29', '2021-04-30']

Interwał: H01, ilość: 26
Daty: ['2021-02-08 10:00:00', '2021-02-08 11:00:00', '2021-02-08 12:00:00', '2021-02-08 13:00:00', '2021-02-08 14:00:00', '2021-02-08 15:00:00', '2021-02-08 16:00:00', '2021-02-08 17:00:00', '2021-04-29 09:00:00', '2021-04-29 10:00:00', '2021-04-29 11:00:00', '2021-04-29 12:00:00', '2021-04-29 13:00:00', '2021-04-29 14:00:00', '2021-04-29 15:00:00', '2021-04-29 16:00:00', '2021-04-29 17:00:00', '2021-04-30 09:00:00', '2021-04-30 10:00:00', '2021-04-30 11:00:00', '2021-04-30 12:00:00', '2021-04-30 13:00:00', '2021-04-30 14:00:00', '2021-04-30 15:00:00', '2021-04-30 16:00:00', '2021-04-30 17:00:00']

Interwał: M30, ilość: 55
Daty: ['2016-07-05 10:30:00', '2016-07-05 13:00:00', '2016-10-21 14:00:00', '2016-11-08 12:00:00', '2019-04-23 16:30:00', '2021-02-08 09:30:00', '2021-02-08 10:00:00', '2021-02-08 10:30:00', '2021-02-08

Dodaję dane .. 

In [None]:
# Write the hand-entered OHLCV values from add_data into df_all.
for interval_data in add_data.intervals_data:
  interval = interval_data[0]  # interval code

  # columns to overwrite; cols2 also carries the interval's own timestamp
  # column so the check printout shows which rows were touched
  cols = [interval + suffix for suffix in settings.ohlcv_cols]
  cols2 = cols + [interval]

  # each entry is [timestamp, [values...]]
  for ohlcv_values in interval_data[1]:
    # overwrite the OHLCV values on every row carrying this timestamp
    for col, value in zip(cols, ohlcv_values[1]):
      print(interval, ohlcv_values[0], col, value)
      df_all.loc[df_all[interval] == ohlcv_values[0], col] = value

    # check: print the rows for this timestamp after the update
    print()
    print(df_all.loc[df_all[interval] == ohlcv_values[0], cols2])
    print()

D01 2021-04-28 D01open 170
D01 2021-04-28 D01high 174.7
D01 2021-04-28 D01close 172.5
D01 2021-04-28 D01low 168.6
D01 2021-04-28 D01volume 666270

       D01open  D01high  D01close  D01low  D01volume         D01
49533    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49534    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49535    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49536    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49537    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49538    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49539    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49540    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49541    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49542    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49543    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49544    170.0    174.7     172.5   168.6   666270.0  2021-04-28
49545   

Weryfikacja wyników

In [None]:
# Re-run the gap report to confirm which "<interval>open" values are still
# missing after the manual data has been written.
for interval in settings.intervals:
  df = df_all[[interval, interval+'open']].copy()

  # timestamps of the rows with a missing open value, de-duplicated
  empty_fields_list = (
      df[interval][df[interval+'open'].isnull()]
      .drop_duplicates()
      .to_list()
  )

  print(f'Interwał: {interval}, ilość: {len(empty_fields_list)}')
  print(f'Daty: {empty_fields_list}')
  print()

Interwał: M01, ilość: 0
Daty: []

Interwał: W01, ilość: 0
Daty: []

Interwał: D01, ilość: 0
Daty: []

Interwał: H01, ilość: 26
Daty: ['2021-02-08 10:00:00', '2021-02-08 11:00:00', '2021-02-08 12:00:00', '2021-02-08 13:00:00', '2021-02-08 14:00:00', '2021-02-08 15:00:00', '2021-02-08 16:00:00', '2021-02-08 17:00:00', '2021-04-29 09:00:00', '2021-04-29 10:00:00', '2021-04-29 11:00:00', '2021-04-29 12:00:00', '2021-04-29 13:00:00', '2021-04-29 14:00:00', '2021-04-29 15:00:00', '2021-04-29 16:00:00', '2021-04-29 17:00:00', '2021-04-30 09:00:00', '2021-04-30 10:00:00', '2021-04-30 11:00:00', '2021-04-30 12:00:00', '2021-04-30 13:00:00', '2021-04-30 14:00:00', '2021-04-30 15:00:00', '2021-04-30 16:00:00', '2021-04-30 17:00:00']

Interwał: M30, ilość: 55
Daty: ['2016-07-05 10:30:00', '2016-07-05 13:00:00', '2016-10-21 14:00:00', '2016-11-08 12:00:00', '2019-04-23 16:30:00', '2021-02-08 09:30:00', '2021-02-08 10:00:00', '2021-02-08 10:30:00', '2021-02-08 11:00:00', '2021-02-08 11:30:00', '2021

### 3.1.2 Wartości

In [None]:
# Move the interval timestamp columns into the (multi-level) index, in place;
# from here on they are no longer regular columns of df_all.
df_all.set_index(keys=settings.intervals, inplace=True)

In [None]:
# Column names for one interval: the interval code followed by each OHLCV
# suffix from the settings.
interval = 'D01'
ohlcv_cols = [interval + suffix for suffix in settings.ohlcv_cols]
print(ohlcv_cols)

['D01open', 'D01high', 'D01close', 'D01low', 'D01volume']


In [None]:
# Summary statistics (one decimal place) and null count for every OHLCV
# column of the selected interval.
print(f'Intervał: {interval}')
for col in ohlcv_cols:
  print(
      f'Kolumna {col}',
      df_all[col].describe().apply("{0:.1f}".format),
      f'Liczę nulle: {df_all[col].isna().sum()}',
      sep='\n',
  )

Intervał: D01
Kolumna D01open
count    49632.0
mean       155.8
std        118.9
min         19.9
25%         40.3
50%        124.1
75%        235.2
max        455.3
Name: D01open, dtype: object
Liczę nulle: 0
Kolumna D01high
count    49632.0
mean       158.7
std        121.3
min         20.3
25%         41.0
50%        126.5
75%        239.0
max        464.2
Name: D01high, dtype: object
Liczę nulle: 0
Kolumna D01close
count    49632.0
mean       155.7
std        118.8
min         19.8
25%         40.2
50%        124.8
75%        235.1
max        460.8
Name: D01close, dtype: object
Liczę nulle: 0
Kolumna D01low
count    49632.0
mean       152.7
std        116.5
min         19.6
25%         39.6
50%        122.2
75%        231.4
max        443.8
Name: D01low, dtype: object
Liczę nulle: 0
Kolumna D01volume
count      49632.0
mean      361493.8
std       468108.7
min        21658.0
25%       158858.8
50%       249197.0
75%       405038.8
max      6857539.0
Name: D01volume, dtype: object
L

In [None]:
# Same per-interval column overview as before, now that the interval
# timestamp columns live in the index rather than among the columns.
for interval in settings.intervals:
  cols = [name for name in df_all.columns if name.startswith(interval)]
  print(interval, cols)

  df_all.loc[:, cols].info()

M01 ['M01open', 'M01high', 'M01low', 'M01close', 'M01volume', 'M01_peak_fala', 'M01_auto_fala', 'M01_SzfBear_EveningStar_szf', 'M01_SzfBear_EveningStar_negated', 'M01_SzfBear_EveningStar_formationMin', 'M01_SzfBear_EngulfingTop_szf', 'M01_SzfBear_EngulfingTop_negated', 'M01_SzfBear_EngulfingTop_formationMin', 'M01_SzfBear_BearishEngulfing_szf', 'M01_SzfBear_BearishEngulfing_negated', 'M01_SzfBear_BearishEngulfing_formationMin', 'M01_SzfBear_DarkCloudCover_szf', 'M01_SzfBear_DarkCloudCover_negated', 'M01_SzfBear_DarkCloudCover_formationMin', 'M01_SzfBear_ShootingStar_szf', 'M01_SzfBear_ShootingStar_negated', 'M01_SzfBear_ShootingStar_formationMin', 'M01_SzfBear_Harami_szf', 'M01_SzfBear_Harami_negated', 'M01_SzfBear_Harami_formationMin', 'M01_SzfBear_InvertedHammer_szf', 'M01_SzfBear_InvertedHammer_negated', 'M01_SzfBear_InvertedHammer_formationMin', 'M01_SzfBear_Hammer_szf', 'M01_SzfBear_Hammer_negated', 'M01_SzfBear_Hammer_formationMin', 'M01_SzfBear_HangingMan_szf', 'M01_SzfBear_Hang