In [None]:
!pip install pandas_market_calendars
!pip install Random-Word-Generator
!pip install RandomWord



In [None]:
import numpy as np
import pandas as pd
import pandas_market_calendars as mcal
from RandomWordGenerator import RandomWord

import tensorflow as tf
from sklearn.svm import SVC

from scipy.stats import rankdata
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix


import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Google Drive folder that holds the input CSV files; the loading cell reads PATH + file name.
PATH = '/content/drive/My Drive/app/NLP/'

### 1. Определение временных параметров модели.

* Определение глубины истории для модели - lookback 
* Определение шага пересчета коэффициентов модели - step

In [None]:
nyse = mcal.get_calendar('NYSE')

# History depth: number of NYSE sessions in the 2016-2018 window.
history_schedule = nyse.schedule(start_date='2016-01-01', end_date='2018-12-31')
lookback = len(history_schedule)
print(f'lookback = {lookback}')

# Quarter boundaries used to estimate how often the model is re-fitted.
calc_dates = ['2019-01-01', '2019-04-01', '2019-07-01', '2019-09-30',
              '2020-01-01', '2020-04-01', '2020-07-01', '2020-09-30']

# Sessions between each pair of consecutive boundaries (both ends inclusive).
qtr_lengths = [
    nyse.schedule(qtr_start, qtr_end).shape[0]
    for qtr_start, qtr_end in zip(calc_dates[:-1], calc_dates[1:])
]

# Re-fit step: average quarter length, truncated to an integer.
step = int(np.mean(qtr_lengths))
print(f'calculation step = {step}')

lookback = 754
calculation step = 63


In [None]:
# Session counts for the train/validation span and for the out-of-sample span.
train_val_schedule = nyse.schedule(start_date='2016-01-01', end_date='2021-01-05')
oos_schedule = nyse.schedule(start_date='2021-01-06', end_date='2021-03-05')

data_days = len(train_val_schedule)
out_of_sample_days = len(oos_schedule)

print(f'число дней для тренировки и валидации модели = {data_days}')
print(f'число дней для тестирования модели = {out_of_sample_days}')

число дней для тренировки и валидации модели = 1261
число дней для тестирования модели = 41


In [None]:
# Date bounds as YYYYMMDD integers. The loading cell filters with strict
# inequalities (columns > start_day and columns < end_day), so both bounds are exclusive.
start_day, end_day = 20160101, 20210305

### 2. Загрузка и отбор данных в соответствии с заданными временными параметрами.

Формирование 3D данных.

In [None]:
# One CSV per price/volume layer; the list order defines the layer index in data_3D.
file_list = ['Open.csv', 'High.csv', 'Low.csv', 'Close.csv', 'Volume.csv']
OPEN, HIGH, LOW, CLOSE = 0, 1, 2, 3          # layer indices, matching file_list order
RANGE_LAYER = len(file_list)                 # (High - Low) / Open
BODY_LAYER = len(file_list) + 1              # (Open - Close) / Open

for i, file in enumerate(file_list):
  df = pd.read_csv(PATH + file, index_col=0)
  # Column labels look like 'YYYY-MM-DD'; turn them into YYYYMMDD integers.
  df.columns = [int(col.replace('-', '')) for col in df.columns]
  in_range = (df.columns > start_day) & (df.columns < end_day)
  # Forward-fill gaps along the date axis (.ffill replaces the deprecated fillna(method='ffill')).
  df = df.loc[:, in_range].ffill(axis=1)
  if i == 0:
    dates = df.columns
    instruments = df.index
    data_3D = np.full((df.shape[0], df.shape[1], len(file_list) + 2), np.nan)
  elif not (df.index.equals(instruments) and df.columns.equals(dates)):
    # Layers are stacked positionally, so differing labels would silently mix instruments or dates.
    raise ValueError(f'{file}: instruments/dates differ from {file_list[0]}')

  data_3D[:, :, i] = df.to_numpy()

# Two derived layers, both relative to Open.
data_3D[:, :, RANGE_LAYER] = (data_3D[:, :, HIGH] - data_3D[:, :, LOW]) / data_3D[:, :, OPEN]   # (High - Low) / Open
data_3D[:, :, BODY_LAYER] = (data_3D[:, :, OPEN] - data_3D[:, :, CLOSE]) / data_3D[:, :, OPEN]  # (Open - Close) / Open

print(f'data_3D.shape = {data_3D.shape}')

data_3D.shape = (492, 1302, 7)


Пример одного слоя данных - данные на конец дня (из Close.csv)

In [None]:
# Rebuild a labelled frame from layer 3 (Close) to preview one slice of the cube.
# Note: this overwrites the `df` left over from the loading loop.
df = pd.DataFrame(data_3D[:, :, 3], index=instruments, columns=dates)
print(f'df.shape = {df.shape}')
df.head(2)

df.shape = (492, 1302)


Unnamed: 0,20160104,20160105,20160106,20160107,20160108,20160111,20160112,20160113,20160114,20160115,20160119,20160120,20160121,20160122,20160125,20160126,20160127,20160128,20160129,20160201,20160202,20160203,20160204,20160205,20160208,20160209,20160210,20160211,20160212,20160216,20160217,20160218,20160219,20160222,20160223,20160224,20160225,20160226,20160227,20160229,...,20210106,20210107,20210108,20210111,20210112,20210113,20210114,20210115,20210119,20210120,20210121,20210122,20210125,20210126,20210127,20210128,20210129,20210201,20210202,20210203,20210204,20210205,20210208,20210209,20210210,20210211,20210212,20210216,20210217,20210218,20210219,20210222,20210223,20210224,20210225,20210226,20210301,20210302,20210303,20210304
MMM,146.82,147.46,144.49,140.97,140.49,140.46,140.86,138.72,141.18,138.69,138.39,136.96,137.76,139.52,137.57,144.78,145.55,147.32,151.0,148.73,147.87,152.52,153.44,153.47,153.9,154.74,152.45,150.67,153.96,155.53,156.62,156.78,156.45,157.7,156.17,157.22,158.99,158.27,158.27,156.87,...,174.19,169.72,166.62,165.2,166.51,166.28,166.41,165.55,169.12,170.22,170.67,169.04,170.39,175.95,186.65,183.42,175.66,174.7,175.02,175.48,177.1,179.01,180.76,180.94,181.08,177.21,178.7,176.63,176.65,179.37,176.54,176.12,176.17,177.63,178.76,175.06,176.41,177.11,178.06,177.63
ABT,42.93,42.92,42.56,41.54,40.67,40.73,41.46,40.28,41.1,40.54,40.43,39.96,39.48,40.03,39.66,40.16,40.47,36.71,37.85,38.45,37.88,38.02,38.01,37.41,37.46,37.0,37.21,36.34,37.13,37.99,38.45,38.31,38.53,39.16,38.39,39.2,39.63,39.52,39.52,38.74,...,110.23,111.3,111.61,110.84,108.84,112.15,110.04,111.3,112.57,112.89,112.95,112.84,114.72,114.73,114.29,120.39,123.59,122.54,120.24,120.22,119.74,124.03,124.5,125.15,125.29,126.84,128.23,128.02,127.24,125.41,123.04,122.55,120.8,122.38,121.58,119.78,122.21,122.53,119.18,116.01


#### Генерация названий инструментов (заменить реальными)

In [None]:
# NOTE(review): this seeds numpy's global RNG; whether RandomWord draws from numpy
# or from the stdlib `random` module is not visible here — confirm the names are reproducible.
np.random.seed(25)

n_instruments = len(instruments)
# Fixed-length 5-character placeholder names, one per instrument.
# NOTE(review): nothing here enforces that the generated names are unique — verify.
rw = RandomWord(max_word_size=5, constant_word_size=True)
instruments_name_list = rw.getList(n_instruments)
instruments_name_list[:5]

['NsimV', 'SxbXD', 'BVQhi', 'WgTgV', 'mkorP']

In [None]:
# Map each real ticker to its generated placeholder name (paired by position).
name_dict = dict(zip(instruments, instruments_name_list))
name_dict['NI']

'YRfZF'

### 3. Трансформация данных

#### Отсутствуют данные на начало периода:

In [None]:
# Instruments whose Open price is missing on the very first date.
no_data = np.isnan(data_3D[:, 0, 0])
instr_no_data = instruments[no_data].to_list()

# Number of missing Open values per affected instrument; the date printed is the
# one at position (count - 1).
missing_counts = np.isnan(data_3D[no_data, :, 0]).sum(axis=1)
for instr, n_missing in zip(instr_no_data, missing_counts):
  print(f'{instr}: \t {dates[n_missing - 1]}')

AMCR: 	 20190611
CTVA: 	 20190524
FTV: 	 20160613
FOXA: 	 20190312
FOX: 	 20190313
LW: 	 20161102
LIN: 	 20161005


In [None]:
# Drop instruments flagged in `no_data` from both the cube and the instrument index.
# NOTE: this overwrites data_3D/instruments in place, so the cell is not idempotent —
# re-running it applies the original-length mask to the already-shortened arrays.
data_3D = data_3D[~no_data, :, :]
instruments = instruments[~no_data]
print(f'data_3D.shape = {data_3D.shape}, instruments.shape = {instruments.shape}')

data_3D.shape = (485, 1302, 7), instruments.shape = (485,)


Содержание слоев:
* Данные первых 4 слоев (Open, High, Low, Close) приводим к состоянию log(x(t) / x(t-2)) - темпы роста / падения Open, High, Low, Close
* Данные 5 слоя (Volume) берутся без изменений.
* Данные 6 и 7 слоев - (High - Low) / Open и (Open - Close) / Open

In [None]:
# float32 cube of the same shape; the first two dates of the price layers stay NaN
# because the 2-step log ratio has no predecessor there.
data_transformed_3D = np.full(data_3D.shape, np.nan, np.float32)

# Layers 0..3 (Open, High, Low, Close): log(x(t) / x(t-2)), computed for all dates at once.
data_transformed_3D[:, 2:, :4] = np.log(data_3D[:, 2:, :4] / data_3D[:, :-2, :4])

# Layers 4..6 (Volume and the two derived ratios) are copied unchanged.
data_transformed_3D[:, :, 4:] = data_3D[:, :, 4:]

data_transformed_3D[:2, :5, 3]

array([[        nan,         nan, -0.01599705, -0.04500985, -0.02807399],
       [        nan,         nan, -0.00865604, -0.0326811 , -0.04542413]],
      dtype=float32)

### 4. Данные для тренировки модели

In [None]:
def get_y_class(y, enforce_class=False):
  """Turn values into binary labels: 1.0 where y > 0, otherwise 0.0.

  Args:
    y: array-like of numbers, any shape. NaN compares False and is labelled 0.0.
    enforce_class: if True, guarantee that both classes occur by labelling the
      largest element 1 and the smallest element 0 (useful when every value has
      the same sign).

  Returns:
    float64 ndarray with the same shape as `y`.
  """
  y = np.asarray(y)
  y_class = np.zeros(y.shape)
  y_class[y > 0] = 1

  if enforce_class and y.size:
    # argmax/argmin return indices into the flattened array, so write through
    # `.flat`; indexing y_class directly would address whole rows for N-D input.
    y_class.flat[y.argmax()] = 1
    y_class.flat[y.argmin()] = 0

  return y_class

In [None]:
# One training set per model: 9 windows of `lookback` dates, each shifted by `step`.
train_set_3D = {}
window_starts = [2 + k * step for k in range(9)]   # index 2 skips the NaN columns of the transform

for i, start_ind in enumerate(window_starts, start=1):
  if i == len(window_starts):
    print('\nOut of sample:')
  end_ind = start_ind + lookback
  target_ind = end_ind + 1

  X_train = data_transformed_3D[:, start_ind:end_ind, :]
  y_train = get_y_class(data_transformed_3D[:, target_ind, 3])     # target comes from layer 3 (Close)
  train_set_3D[i] = (X_train, y_train)

  print(f"{i}: X_train = data_transformed_3D[:, {start_ind}:{end_ind}, :], y_train = get_y_class(data_transformed_3D[:, {target_ind}, 3])")

print(f'\ndata_days = {data_days}')
print(f'out_of_sample_days = {out_of_sample_days}')

1: X_train = data_transformed_3D[:, 2:756, :], y_train = get_y_class(data_transformed_3D[:, 757, 3])
2: X_train = data_transformed_3D[:, 65:819, :], y_train = get_y_class(data_transformed_3D[:, 820, 3])
3: X_train = data_transformed_3D[:, 128:882, :], y_train = get_y_class(data_transformed_3D[:, 883, 3])
4: X_train = data_transformed_3D[:, 191:945, :], y_train = get_y_class(data_transformed_3D[:, 946, 3])
5: X_train = data_transformed_3D[:, 254:1008, :], y_train = get_y_class(data_transformed_3D[:, 1009, 3])
6: X_train = data_transformed_3D[:, 317:1071, :], y_train = get_y_class(data_transformed_3D[:, 1072, 3])
7: X_train = data_transformed_3D[:, 380:1134, :], y_train = get_y_class(data_transformed_3D[:, 1135, 3])
8: X_train = data_transformed_3D[:, 443:1197, :], y_train = get_y_class(data_transformed_3D[:, 1198, 3])

Out of sample:
9: X_train = data_transformed_3D[:, 506:1260, :], y_train = get_y_class(data_transformed_3D[:, 1261, 3])

data_days = 1261
out_of_sample_days = 41


#### Данные для обучения одной модели (из девяти).

In [None]:
# Inspect the training set of model 1: feature cube shape and label shape.
(X_train, y_train) = train_set_3D[1]
print(f'X_train.shape = {X_train.shape}, y_train.shape = {y_train.shape}')

# Class balance of the labels (values and their counts).
np.unique(y_train, return_counts=True)

X_train.shape = (485, 754, 7), y_train.shape = (485,)


(array([0., 1.]), array([101, 384]))

### 5. Данные для предсказаний на основе обученной модели

* После расчета коэффициентов модели (например, модели 1) предсказываются y_val на каждый день следующего квартала.
* Эта процедура повторяется после каждого (из 9) обучения модели с той только разницей, что out_of_sample_days у нас только 40, а не 63.

In [None]:
# Validation windows for every model: the training window slid forward one day at a time.
# Offsets run 1..step_to_use (not 0..step_to_use-1): offset 0 is the model's own training
# window, so including it scores an in-sample prediction. Starting at 1 also makes the
# targets cover exactly the last n_predictions columns, which is what y_true_matrix and
# dates[-n_predictions:] use for scoring.
val_set_3D = {}
start_ind = 2
step_to_use = step
for i in range(1, 10):
  model_val_set = {}
  if i == 9:
    print('\nOut of sample:')
    step_to_use = out_of_sample_days - 1   # the last model only has the out-of-sample days left
  else:
    print(f'\nДанные для прогнозов по модели {i}')
  for j in range(1, step_to_use + 1):
    val_start = start_ind + j
    val_end = val_start + lookback
    target_ind = val_end + 1
    X_val = data_transformed_3D[:, val_start:val_end, :]
    y_val = get_y_class(data_transformed_3D[:, target_ind, 3])   # target comes from layer 3 (Close)
    model_val_set[j] = (X_val, y_val)   # keys stay 1..step_to_use, as read by get_level_predictions
    print(f"{j}: X_val = data_transformed_3D[:, {val_start}:{val_end}, :], y_val = get_y_class(data_transformed_3D[:, {target_ind}, 3])")
  start_ind += step
  val_set_3D[i] = model_val_set

print(f'\ndata_transformed[:, 2:].shape = {data_transformed_3D[:, 2:, :].shape}')


Данные для прогнозов по модели 1
1: X_val = data_transformed_3D[:, 2:756, :], y_val = get_y_class(data_transformed_3D[:, 757, 3])
2: X_val = data_transformed_3D[:, 3:757, :], y_val = get_y_class(data_transformed_3D[:, 758, 3])
3: X_val = data_transformed_3D[:, 4:758, :], y_val = get_y_class(data_transformed_3D[:, 759, 3])
4: X_val = data_transformed_3D[:, 5:759, :], y_val = get_y_class(data_transformed_3D[:, 760, 3])
5: X_val = data_transformed_3D[:, 6:760, :], y_val = get_y_class(data_transformed_3D[:, 761, 3])
6: X_val = data_transformed_3D[:, 7:761, :], y_val = get_y_class(data_transformed_3D[:, 762, 3])
7: X_val = data_transformed_3D[:, 8:762, :], y_val = get_y_class(data_transformed_3D[:, 763, 3])
8: X_val = data_transformed_3D[:, 9:763, :], y_val = get_y_class(data_transformed_3D[:, 764, 3])
9: X_val = data_transformed_3D[:, 10:764, :], y_val = get_y_class(data_transformed_3D[:, 765, 3])
10: X_val = data_transformed_3D[:, 11:765, :], y_val = get_y_class(data_transformed_3D[:, 76

#### Данные для одного предсказания (из 63) одной модели (из девяти).

In [None]:
# Pick one validation sample: val_set_3D is keyed by model index, then by prediction index.
model_ind = 2       # out of 9
prediction_ind = 1  # out of 63

(X_val, y_val) = val_set_3D[model_ind][prediction_ind]
print(f'X_val.shape = {X_val.shape}, y_val.shape = {y_val.shape}')

X_val.shape = (485, 754, 7), y_val.shape = (485,)


### 6. Модель

In [None]:
def build_model():
  """Return a fresh, unfitted polynomial-kernel SVC with probability estimates enabled."""
  svc_params = dict(kernel='poly', gamma='auto', probability=True, random_state=24)
  return SVC(**svc_params)

### 7. Получение прогнозов за весь период наблюдений

In [None]:
# Total number of predictions per instrument (second axis of predictions_3D).
# With the values printed earlier (1261, 754, 41) this is 544 = 8 * 63 + 40.
# NOTE(review): the meaning of the two "- 2" terms is not stated in the notebook;
# presumably the 2 leading NaN columns of the transform plus a target offset — confirm.
n_predictions = data_days - lookback - 2 - 2 + out_of_sample_days 
n_instruments = len(instruments)
n_levels = data_3D.shape[2]

# Predicted probabilities per (instrument, prediction, layer); NaN until a layer is filled in.
predictions_3D = np.full((n_instruments, n_predictions, n_levels), np.nan)

In [None]:
def get_level_predictions(level):
  """Fit one model per training set on a single layer and predict all its validation windows.

  Reads the notebook globals train_set_3D, val_set_3D, n_instruments, n_predictions
  and build_model. Models and validation windows are visited in ascending key order
  and their predictions are written to consecutive columns.

  Args:
    level: index of the layer (last axis of the feature cubes) used as features.

  Returns:
    float array of shape (n_instruments, n_predictions) holding the predicted
    probability from column 1 of predict_proba; columns that are never filled stay NaN.
  """
  predictions = np.full((n_instruments, n_predictions), np.nan)
  n = 0
  # Iterate over what the dicts actually contain instead of repeating the model
  # count and windows-per-model from the cells that built them.
  for i in sorted(train_set_3D):
    X_train_3D, y_train = train_set_3D[i]
    model = build_model()
    model.fit(X_train_3D[:, :, level], y_train)

    for j in sorted(val_set_3D[i]):
      X_val_3D, _ = val_set_3D[i][j]   # stored labels are not needed for prediction
      predictions[:, n] = model.predict_proba(X_val_3D[:, :, level])[:, 1]
      n += 1

  return predictions

In [None]:
# True up/down labels for scoring: layer 3 (Close) over the last n_predictions dates,
# one row per instrument and one column per date.
y_true_matrix = get_y_class(data_transformed_3D[:, -n_predictions:, 3])
y_true_matrix.shape

(485, 544)

In [None]:
def get_prediction_class(y, cut_off_level):
  """Binarise scores: 1.0 where y is strictly above cut_off_level, else 0.0 (same shape, float64)."""
  return np.where(y > cut_off_level, np.ones(y.shape), np.zeros(y.shape))


def get_accuracy_by_date(predictions, cut_off_level=0.5):
  """Accuracy across instruments for every prediction date.

  Reads the notebook globals y_true_matrix, dates and n_predictions.

  Args:
    predictions: array of scores, one row per instrument and one column per date,
      laid out like y_true_matrix.
    cut_off_level: scores strictly above this value are classified as 1.

  Returns:
    DataFrame indexed by integer 'dates' with one float column 'accuracy',
    sorted from best to worst. Also prints mean +- std of the accuracies.
  """
  y_pred_matrix = get_prediction_class(predictions, cut_off_level)
  per_date = [
      accuracy_score(y_true_matrix[:, k], y_pred_matrix[:, k])
      for k in range(y_true_matrix.shape[1])
  ]

  # Build the frame directly so 'accuracy' is a float column; the previous
  # transposed list-of-rows construction produced object-dtype columns.
  date_index = pd.Index(dates[-n_predictions:], name='dates').astype(int)
  accuracies = pd.DataFrame({'accuracy': per_date}, index=date_index)

  mean = round(accuracies['accuracy'].mean(), 3)
  std = round(accuracies['accuracy'].std(), 3)
  print(f'accuracy by date = {mean} +- {std}')

  return accuracies.sort_values(by=['accuracy'], ascending=False)


def get_accuracy_by_instr(predictions, cut_off_level=0.5):
  """Accuracy across prediction dates for every instrument.

  Reads the notebook globals y_true_matrix and instruments.

  Args:
    predictions: array of scores, one row per instrument and one column per date,
      laid out like y_true_matrix.
    cut_off_level: scores strictly above this value are classified as 1.

  Returns:
    DataFrame indexed by 'instruments' with one float column 'accuracy',
    sorted from best to worst. Also prints mean +- std of the accuracies.
  """
  y_pred_matrix = get_prediction_class(predictions, cut_off_level)
  per_instrument = [
      accuracy_score(y_true_matrix[k, :], y_pred_matrix[k, :])
      for k in range(y_true_matrix.shape[0])
  ]

  # Build the frame directly so 'accuracy' is a float column; the previous
  # transposed list-of-rows construction produced object-dtype columns.
  instrument_index = pd.Index(instruments, name='instruments')
  accuracies = pd.DataFrame({'accuracy': per_instrument}, index=instrument_index)

  mean = round(accuracies['accuracy'].mean(), 3)
  std = round(accuracies['accuracy'].std(), 3)
  print(f'accuracy by instrument = {mean} +- {std}')

  return accuracies.sort_values(by=['accuracy'], ascending=False)

#### Уровень 0: 'Open'

In [None]:
# Layer 0 (Open): fit the models on this layer and store the probabilities in the shared cube.
level = 0

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 0 with a 0.41 probability cut-off (the functions default to 0.5).
cut_off_level = 0.41
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.55 +- 0.235
accuracy by instrument = 0.55 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
TRV,0.604779
CDNS,0.599265
C,0.597426
ABBV,0.59375
FANG,0.59375


#### Уровень 1: 'High'

In [None]:
# Layer 1 (High): fit the models on this layer and store the probabilities in the shared cube.
level = 1

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 1 with a 0.41 probability cut-off.
# NOTE(review): the recorded summary (0.55 +- 0.235) and the top dates are identical
# for layers 0-3 — check whether the thresholded predictions are close to constant.
cut_off_level = 0.41
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.55 +- 0.235
accuracy by instrument = 0.55 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
CDNS,0.619485
MTD,0.617647
PKI,0.606618
UAA,0.604779
LOW,0.604779


#### Уровень 2: 'Low'

In [None]:
# Layer 2 (Low): fit the models on this layer and store the probabilities in the shared cube.
level = 2

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 2 with a 0.41 probability cut-off.
cut_off_level = 0.41
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.55 +- 0.235
accuracy by instrument = 0.55 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
CDNS,0.619485
MTD,0.617647
PKI,0.606618
UAA,0.604779
LOW,0.604779


#### Уровень 3: 'Close'

In [None]:
# Layer 3 (Close): fit the models on this layer and store the probabilities in the shared cube.
level = 3

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 3 with a 0.41 probability cut-off.
cut_off_level = 0.41
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.55 +- 0.235
accuracy by instrument = 0.55 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
CDNS,0.619485
MTD,0.617647
PKI,0.606618
UAA,0.604779
LOW,0.604779


#### Уровень 4: 'Volume'

In [None]:
# Layer 4 (Volume): fit the models on this layer and store the probabilities in the shared cube.
# NOTE(review): the transform cell copies Volume unchanged, so the SVM receives raw,
# unscaled volumes here, and no output is recorded for this layer — consider scaling first.
level = 4

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
predictions

In [None]:
# Score layer 4 with the default 0.5 probability cut-off.
cut_off_level = 0.5
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

In [None]:
accuracy_by_date.head()

In [None]:
accuracy_by_instrument.head()

#### Уровень 5: (High - Low) / Open

In [None]:
# Layer 5, (High - Low) / Open: fit the models and store the probabilities in the shared cube.
level = 5

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 5 with a 0.4 probability cut-off.
cut_off_level = 0.4
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.55 +- 0.23
accuracy by instrument = 0.55 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
UAA,0.604779
ENPH,0.604779
CZR,0.595588
MTD,0.59375
IVZ,0.591912


#### Уровень 6: (Open - Close) / Open

In [None]:
# Layer 6, (Open - Close) / Open: fit the models and store the probabilities in the shared cube.
level = 6

predictions = get_level_predictions(level)
predictions_3D[:, :, level] = predictions

In [None]:
# Score layer 6 with a 0.4 probability cut-off.
cut_off_level = 0.4
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.546 +- 0.236
accuracy by instrument = 0.546 +- 0.025


In [None]:
accuracy_by_date.head()

Unnamed: 0_level_0,accuracy
dates,Unnamed: 1_level_1
20200312,1.0
20200225,0.989691
20200630,0.981443
20200309,0.979381
20200227,0.973196


In [None]:
accuracy_by_instrument.head()

Unnamed: 0_level_0,accuracy
instruments,Unnamed: 1_level_1
MTD,0.59375
ALGN,0.591912
PKI,0.590074
BR,0.586397
ADSK,0.586397


### 8. Композиция всех результатов

In [None]:
predictions_3D.shape

(485, 544, 7)

Уберем уровень Volume и проранжируем результаты.

In [None]:
# Rank-average ensemble: on every date rank the instruments by each layer's predicted
# probability, average the ranks across layers, then re-rank the averages.
VOLUME_LEVEL = 4
# Volume is left out, as the text above states; its predictions need separate
# preprocessing, and a layer that was never filled would contribute NaN.
levels_to_combine = [lvl for lvl in range(n_levels) if lvl != VOLUME_LEVEL]

ranked_predictions = np.full((n_instruments, n_predictions, len(levels_to_combine)), np.nan)
for n, lvl in enumerate(levels_to_combine):
  # axis=0 ranks instruments against each other separately for every date column
  ranked_predictions[:, :, n] = rankdata(predictions_3D[:, :, lvl], method='ordinal', axis=0)

ranked_predictions = ranked_predictions.mean(axis=2)

# Ranks run over instruments (1..n_instruments), so that is the normaliser: scores
# land in (0, 1]. Dividing by n_predictions scaled them by the number of dates instead.
predictions = rankdata(ranked_predictions, method='max', axis=0) / n_instruments

In [None]:
predictions.mean()

0.44694990474396257

In [None]:
# Score the combined rank-based predictions with a 0.35 cut-off.
cut_off_level = 0.35
accuracy_by_date = get_accuracy_by_date(predictions, cut_off_level)
accuracy_by_instrument = get_accuracy_by_instr(predictions, cut_off_level)

accuracy by date = 0.511 +- 0.069
accuracy by instrument = 0.511 +- 0.029


### __Выводы:__ 
* SVM дает результаты 0.55 +- 0.235
* Ниже результаты (Open - Close) / Open - accuracy by date = 0.546 +- 0.236 
* Над Volume нужно поработать отдельно. Похоже, нужна какая-то предобработка.
* Композиция через совмещение отранжированных результатов существенно снижает дисперсию (с 0.236 до 0.069), но также и ухудшает результат (с 0.55 до 0.511). Нужно подумать о какой-то другой композиции.
* На этапе сравнения моделей лучше использовать не accuracy, а AUC, т.к. последняя не зависит от точки отсечения.