# Построение модели

## Выбор инструмента для анализа

Попробуем построить модель для торговли GBP_CAD.


Нефть, природный газ, никель, медь, цинк, уран, битум, пшеница, чечевица, рапс, женьшень, латук, огурец, кукуруза, яблоки, клубника, лес

Канада — крупнейший торговый партнёр США


In [26]:
import re
import pandas as pd
from learning.features import get_features
import matplotlib.pylab as plt

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
from finam_data.models import *
from finam_data.utils import collect_instrument_candles
from finam.export import Exporter, LookupComparator

In [4]:
exporter = Exporter()

## Все доступные курсы CAD

In [131]:
cad_available_currencies = exporter.lookup(
    name='Cad',
    market=ExtendedMarket.WORLD_CURRENCIES,
    name_comparator=LookupComparator.CONTAINS,
).name.to_list()
for name in cad_available_currencies:
    print (name)
    i, _ = Instrument.objects.get_or_create(
        name=name,
        market=ExtendedMarket.WORLD_CURRENCIES,
    )
    collect_instrument_candles(i)

Aud/Cad
Usd/Cad
Eur/Cad
Gbp/Cad
Cad/Chf
Cad/Jpy
Cad/Usd
Nzd/Cad


## Товары

In [132]:
cad_available_commodities = exporter.lookup(
    name='Цинк|Никель|Медь|Уран|Битум|Пшеница|Кукуруза',
    market=ExtendedMarket.COMMODITIES,
    name_comparator=LookupComparator.CONTAINS,
).name.to_list()
cad_available_commodities += exporter.lookup(
    name='Brent',
    market=ExtendedMarket.COMMODITIES,
    name_comparator=LookupComparator.EQUALS,
).name.to_list()
for name in cad_available_commodities:
    print (name)
    i, _ = Instrument.objects.get_or_create(
        name=name,
        market=ExtendedMarket.COMMODITIES,
    )
    collect_instrument_candles(i)

Никель
Медь
Цинк
Пшеница
Brent


## Скомпилируем датафрейм

In [133]:
feature_regexp = '^({})_close_finam$'.format('|'.join(cad_available_currencies + cad_available_commodities))
features = get_features(lambda x: re.match(feature_regexp, x))

In [134]:
df = None

for f in features:
    print (f.name)
    tmp_df = f.load()
    df = tmp_df if df is None else pd.concat([df, tmp_df], axis=1, ignore_index=False)

Gbp/Cad_close_finam
Aud/Cad_close_finam
Usd/Cad_close_finam
Eur/Cad_close_finam
Cad/Chf_close_finam
Cad/Jpy_close_finam
Cad/Usd_close_finam
Nzd/Cad_close_finam
Никель_close_finam
Медь_close_finam
Цинк_close_finam
Пшеница_close_finam
Brent_close_finam


## Посмотрим на данные

In [135]:
df.tail()

Unnamed: 0_level_0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Jpy_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-08-07 13:00:00+00:00,1.61836,0.89728,1.33371,1.4946,0.73082,,0.7496,0.85728,14755.0,5626.21576,2258.0,482.0,57.7
2019-08-07 14:00:00+00:00,1.62051,0.89792,1.33235,1.49675,0.72961,,0.7503,0.85842,14630.0,5617.39724,2250.0,479.4,57.0
2019-08-07 15:00:00+00:00,1.61695,0.89912,1.33076,1.49251,0.73139,,0.7512,0.85941,14650.0,5653.773635,2250.0,478.6,56.98
2019-08-07 16:00:00+00:00,1.62058,0.90017,1.33309,1.49771,0.72827,,0.7499,0.86129,14810.0,5660.387525,2251.0,485.2,56.1
2019-08-07 17:00:00+00:00,1.62021,0.90123,1.33303,1.49655,0.72894,,0.75,0.86076,15240.0,5669.206045,2251.0,484.0,56.41


In [105]:
df.describe()

Unnamed: 0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Jpy_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
count,25809.0,25935.0,25976.0,26067.0,25722.0,25165.0,26030.0,25603.0,18186.0,23588.0,3594.0,18957.0,23184.0
mean,1.759104,0.973108,1.312206,1.487154,0.750575,84.970158,0.762365,0.90683,11193.392995,5793.999945,2565.689747,467.70628,57.44794
std,0.125404,0.027959,0.036856,0.043501,0.017212,3.751262,0.021421,0.031552,1827.000673,790.757476,532.209952,39.907992,12.074609
min,1.57424,0.89032,1.20665,1.37698,0.68068,74.927,0.681,0.83039,7580.0,4279.18683,1453.0,389.2,27.4
25%,1.67581,0.95036,1.289865,1.45911,0.74054,82.494,0.7493,0.882985,9805.0,5023.249455,2268.5,432.6,48.63
50%,1.71863,0.97725,1.31358,1.49145,0.75201,84.841,0.761,0.9052,10935.0,5885.259785,2627.0,466.4,55.7
75%,1.78997,0.99421,1.334,1.51301,0.76133,87.361,0.775,0.933295,12610.0,6418.780245,2920.0,502.6,66.8925
max,2.0954,1.03695,1.46848,1.61486,0.79441,95.912,0.8285,0.9901,16380.0,7306.14382,3619.0,586.4,86.49


In [108]:
df.corr()

Unnamed: 0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Jpy_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
Gbp/Cad_close_finam,1.0,-0.1087,0.380836,0.182748,-0.510217,0.318075,-0.374824,-0.255487,-0.413726,-0.516292,-0.685902,0.241228,-0.468789
Aud/Cad_close_finam,-0.1087,1.0,0.052796,-0.077473,-0.175833,-0.229282,-0.043028,0.787378,-0.299226,-0.041554,0.167808,-0.566114,-0.404316
Usd/Cad_close_finam,0.380836,0.052796,1.0,0.043142,-0.708822,-0.33359,-0.998849,0.226459,-0.351686,-0.514408,-0.550574,0.038498,-0.358011
Eur/Cad_close_finam,0.182748,-0.077473,0.043142,1.0,-0.328853,-0.155543,-0.033191,-0.111624,0.438211,0.394862,0.29714,0.325179,0.368683
Cad/Chf_close_finam,-0.510217,-0.175833,-0.708822,-0.328853,1.0,0.370755,0.704463,-0.27353,0.395548,0.519627,0.517626,-0.079428,0.477527
Cad/Jpy_close_finam,0.318075,-0.229282,-0.33359,-0.155543,0.370755,1.0,0.338215,-0.526695,-0.006463,0.229314,-0.010088,0.24561,0.046548
Cad/Usd_close_finam,-0.374824,-0.043028,-0.998849,-0.033191,0.704463,0.338215,1.0,-0.223064,0.34352,0.51588,0.551433,-0.045848,0.344309
Nzd/Cad_close_finam,-0.255487,0.787378,0.226459,-0.111624,-0.27353,-0.526695,-0.223064,1.0,-0.235736,-0.109013,0.158133,-0.547359,-0.349176
Никель_close_finam,-0.413726,-0.299226,-0.351686,0.438211,0.395548,-0.006463,0.34352,-0.235736,1.0,0.76666,0.671555,0.264677,0.867561
Медь_close_finam,-0.516292,-0.041554,-0.514408,0.394862,0.519627,0.229314,0.51588,-0.109013,0.76666,1.0,0.919344,0.059146,0.748001


In [146]:
# Что-то странное с Cad/Jpy. Нет данных. Дропнем пока.
df['Cad/Jpy_close_finam'].dropna().tail()

time
2019-07-09 20:00:00+00:00    82.880
2019-07-09 21:00:00+00:00    82.821
2019-07-09 22:00:00+00:00    82.929
2019-07-09 23:00:00+00:00    82.958
2019-07-10 00:00:00+00:00    82.934
Name: Cad/Jpy_close_finam, dtype: float64

In [148]:
df = df.drop(['Cad/Jpy_close_finam'], axis=1)

In [156]:
df.head(10)

Unnamed: 0_level_0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-08-10 12:00:00+00:00,2.03823,0.97048,1.31744,1.44446,0.74533,0.7588,0.86537,10985.0,5161.03883,1862.5,512.0,48.96
2015-08-10 13:00:00+00:00,2.0363,0.96897,1.31685,1.44088,0.74714,0.7591,0.86439,10980.0,5166.550405,1862.5,511.2,48.94
2015-08-10 14:00:00+00:00,2.03605,0.96823,1.31598,1.43867,0.74838,0.7596,0.86325,11085.0,5201.824485,,513.4,49.13
2015-08-10 15:00:00+00:00,2.03504,0.9674,1.3136,1.43983,0.74924,0.7611,0.86389,11140.0,5247.0194,,520.6,49.85
2015-08-10 16:00:00+00:00,2.03777,0.96718,1.31386,1.4384,0.75086,0.761,0.86329,11170.0,5301.032835,,525.6,49.88
2015-08-10 17:00:00+00:00,2.03605,0.96724,1.31187,1.43918,0.75288,0.762,0.86336,11170.0,5290.009685,1872.0,522.6,50.18
2015-08-10 18:00:00+00:00,2.03232,0.96659,1.30763,1.43564,0.75422,0.7645,0.86218,11180.0,5291.112,,524.0,50.13
2015-08-10 19:00:00+00:00,2.02936,0.96648,1.30554,1.43526,0.75419,0.7658,0.86178,,5284.49811,,523.6,50.25
2015-08-10 20:00:00+00:00,2.02951,0.96559,1.30152,1.43478,0.75494,0.7683,0.86152,,5265.758755,,,50.16
2015-08-10 21:00:00+00:00,2.03022,0.96457,1.30194,1.43511,0.75482,0.7679,0.86118,,5263.554125,,,50.2


In [149]:
df.tail()

Unnamed: 0_level_0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-08-07 13:00:00+00:00,1.61836,0.89728,1.33371,1.4946,0.73082,0.7496,0.85728,14755.0,5626.21576,2258.0,482.0,57.7
2019-08-07 14:00:00+00:00,1.62051,0.89792,1.33235,1.49675,0.72961,0.7503,0.85842,14630.0,5617.39724,2250.0,479.4,57.0
2019-08-07 15:00:00+00:00,1.61695,0.89912,1.33076,1.49251,0.73139,0.7512,0.85941,14650.0,5653.773635,2250.0,478.6,56.98
2019-08-07 16:00:00+00:00,1.62058,0.90017,1.33309,1.49771,0.72827,0.7499,0.86129,14810.0,5660.387525,2251.0,485.2,56.1
2019-08-07 17:00:00+00:00,1.62021,0.90123,1.33303,1.49655,0.72894,0.75,0.86076,15240.0,5669.206045,2251.0,484.0,56.41


In [154]:
# Дропнем все до первой полностью заполненной строки
from learning.preprocessing import drop_until_first_full_field
df = drop_until_first_full_field(df)

In [155]:
df.head()

Unnamed: 0_level_0,Gbp/Cad_close_finam,Aud/Cad_close_finam,Usd/Cad_close_finam,Eur/Cad_close_finam,Cad/Chf_close_finam,Cad/Usd_close_finam,Nzd/Cad_close_finam,Никель_close_finam,Медь_close_finam,Цинк_close_finam,Пшеница_close_finam,Brent_close_finam
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-08-10 12:00:00+00:00,2.03823,0.97048,1.31744,1.44446,0.74533,0.7588,0.86537,10985.0,5161.03883,1862.5,512.0,48.96
2015-08-10 13:00:00+00:00,2.0363,0.96897,1.31685,1.44088,0.74714,0.7591,0.86439,10980.0,5166.550405,1862.5,511.2,48.94
2015-08-10 14:00:00+00:00,2.03605,0.96823,1.31598,1.43867,0.74838,0.7596,0.86325,11085.0,5201.824485,,513.4,49.13
2015-08-10 15:00:00+00:00,2.03504,0.9674,1.3136,1.43983,0.74924,0.7611,0.86389,11140.0,5247.0194,,520.6,49.85
2015-08-10 16:00:00+00:00,2.03777,0.96718,1.31386,1.4384,0.75086,0.761,0.86329,11170.0,5301.032835,,525.6,49.88


In [157]:
# заполним пропущенные значения предыдущими
df = df.fillna(method='bfill')

## Попробуем в лоб обучить градиентный бустинг

In [214]:
X_raw = df
Y_raw = df[['Gbp/Cad_close_finam']]

In [347]:
new_df = df.drop(df.columns, axis=1)
for column in X_raw:
    for i in range(25, 0, -1):
        new_column = '{}_{}'.format(column, i)
        new_df[new_column] = df[column].shift(i-1) - df[column].shift(i)
    new_df[column] = df[column]
new_df = new_df[25:]

Gbp/Cad_close_finam
Aud/Cad_close_finam
Usd/Cad_close_finam
Eur/Cad_close_finam
Cad/Chf_close_finam
Cad/Usd_close_finam
Nzd/Cad_close_finam
Никель_close_finam
Медь_close_finam
Цинк_close_finam
Пшеница_close_finam
Brent_close_finam


In [415]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [425]:
scaler = StandardScaler()
X_norm = scaler.fit_transform(new_df)

In [426]:
X = X_norm[:-1]
Y = Y_raw.diff().apply(lambda x: 1 if x[0] > 0 else 0, axis=1)[25:][1:]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, shuffle=False)

In [427]:
classifier = XGBClassifier()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

In [428]:
print (accuracy_score(Y_train, classifier.predict(X_train)))
print (accuracy_score(Y_test, Y_pred))

0.634701760689903
0.5559796437659033


# TODO
Градиентный бустинг в лоб дал accuracy 0.63 на обучении и 0.55 на тесте.
Несмотря на то, что модель явно переобучилась, для попытки обучиться "в лоб" без подбора параметров и регуляризации - получилось неплохо.

- поиграться с параметрами модели. Попробовать grid search;
- поиграться с глубиной временного окна;
- добавить регуляризацию;
- добавить категориальные признаки (время, месяц, сезон, день недели...);
- попробовать другие модели (в первую очередь LSTM);
- научиться в обучение с подкреплением;
- добавить в модель признаки второй страны (пока добавил только CAD);
- попробовать иначе заполнять NaN. Сначала дропнуть все строки, у которых NaN в таргетной колонке. Потом уже заполнять предыдущими значениями.