In [1]:
%load_ext autoreload

%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn import metrics

from ast import literal_eval

from mlxtend.plotting import plot_decision_regions
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression


import warnings
import numpy as np
from collections import OrderedDict

from lob_data_utils import lob, db_result, model
from lob_data_utils.svm_calculation import lob_svm


sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
data_length = 24000
stocks = ['3459', '2748', '9268']
should_save_fig = True

In [3]:
d_stocks = {}
d_cv_stocks = {}
d_test_stocks = {}
for s in stocks:
    d,  d_test = lob.load_prepared_data(s, length=data_length)
    d.index = pd.to_datetime(d['Unnamed: 0'].values)
    d_test.index = pd.to_datetime(d_test['Unnamed: 0'].values)
    d['prev_queue_imbalance'] = [None] + d['queue_imbalance'].iloc[0:len(d)-1].values.tolist()
    d.dropna(inplace=True)
    d_test['prev_queue_imbalance'] = [None] + d_test['queue_imbalance'].iloc[0:len(d_test)-1].values.tolist()
    d_test.dropna(inplace=True)
    d_stocks[s] = d
    d_test_stocks[s] = d_test

In [4]:
d_stocks['3459'].head()

Unnamed: 0.1,Unnamed: 0,bid,ask,bid_price,ask_price,mid_price,sum_sell_ask,sum_buy_bid,mid_price_indicator,queue_imbalance,prev_queue_imbalance
2013-09-02 08:31:00,2013-09-02 08:31:00,"[(550.0, 4600.0), (590.0, 3500.0), (666.0, 956...","[(749.5, 19522.0), (750.0, 51865.0), (750.5, 1...",748.5,749.5,749.0,19522.0,8078.0,0.0,-0.414638,-0.573878
2013-09-02 08:32:00,2013-09-02 08:32:00,"[(550.0, 4600.0), (590.0, 3500.0), (666.0, 956...","[(749.5, 13371.0), (750.0, 51046.0), (750.5, 1...",748.5,749.5,749.0,13371.0,16818.0,0.0,0.114181,-0.414638
2013-09-02 08:33:00,2013-09-02 08:33:00,"[(550.0, 4600.0), (590.0, 3500.0), (666.0, 956...","[(749.5, 20645.0), (750.0, 51474.0), (750.5, 1...",748.5,749.5,749.0,20645.0,7206.0,0.0,-0.482532,0.114181
2013-09-02 08:34:00,2013-09-02 08:34:00,"[(550.0, 4600.0), (590.0, 3500.0), (666.0, 956...","[(749.5, 14676.0), (750.0, 51474.0), (750.5, 1...",748.5,749.5,749.0,14676.0,7206.0,0.0,-0.341376,-0.482532
2013-09-02 08:35:00,2013-09-02 08:35:00,"[(550.0, 4600.0), (590.0, 3500.0), (666.0, 956...","[(749.0, 9652.0), (749.5, 35846.0), (750.0, 42...",748.0,749.0,748.5,9652.0,5395.0,1.0,-0.282914,-0.341376


## SVM with queue imbalance

In [5]:
df_res = {}
for s in stocks:
     df_res_temp = pd.read_csv('../svm_queue_imbalance/res_svm/svm_linear_{}_len{}.csv'.format(s, data_length))
#     df_res_temp = df_res_temp.append(
#         pd.read_csv('../svm_queue_imbalance/res_svm/svm_sigmoid_{}_len{}.csv'.format(s, data_length)))
    df_res_temp = df_res_temp.append(
        pd.read_csv('../svm_queue_imbalance/res_svm/svm_rbf_{}_len{}.csv'.format(s, data_length)))
    df_res[s] = df_res_temp
    df_res[s].index = list(range(len(df_res[s])))


IndentationError: unindent does not match any outer indentation level (<ipython-input-5-d28b695134df>, line 6)

In [None]:
df_best_svm = pd.DataFrame()
for s in stocks:
    idx_max = df_res[s]['matthews'].idxmax()
    df_best_svm = df_best_svm.append(df_res[s].loc[idx_max])
df_best_svm

In [None]:
print(df_best_svm[['stock', 'kernel', 'C', 'gamma', 'coef0']].to_latex())

In [None]:
 from sklearn import utils

def get_classes_weights(y_train):
    classes = np.unique(y_train)
    class_weight_list = utils.class_weight.compute_class_weight('balanced', classes, y_train)
    class_weights = {classes[0]: class_weight_list[0], classes[1]: class_weight_list[1]}
    return class_weights

def fit_best_svm_classifier(df_best_svm, df, stock=None):
    stock = int(stock)
    gamma = df_best_svm[df_best_svm['stock'] == stock]['gamma'].values[0]
    coef0 = df_best_svm[df_best_svm['stock'] == stock]['coef0'].values[0]
    c = df_best_svm[df_best_svm['stock'] == stock]['C'].values[0]
    kernel = df_best_svm[df_best_svm['stock'] == stock]['kernel'].values[0]

    X = df['queue_imbalance'].values.reshape(-1, 1)
    y = df['mid_price_indicator']
   
    weights = get_classes_weights(y)
    clf = SVC(gamma=gamma, C=c, coef0=coef0, kernel=kernel, random_state=23131, class_weight=weights)
    clf.fit(X, y)
    return clf
    

In [None]:
def get_scores_dict_for_data(functions_to_run, dfs, log_clf, stock):
    scores = {'stock': stock}
    for func_name, func in functions_to_run.items():
        for df_name, df in dfs.items():
            pred = log_clf.predict(df['queue_imbalance'].values.reshape(-1, 1))
            df['pred'] = pred
            scores['{}_{}'.format(df_name, func_name)] = func(df['mid_price_indicator'], pred)
    return scores
            
functions_to_run = {'precision': metrics.precision_score, 'roc_auc': metrics.roc_auc_score,
                   'f1_score': metrics.f1_score, 'recall': metrics.recall_score, 
                   'matthews': metrics.matthews_corrcoef, 'kappa': metrics.cohen_kappa_score}
scores = []
for stock in stocks:
    log_clf = fit_best_svm_classifier(df_best_svm, d_stocks[stock], stock=stock)
    dfs = {'train': d_stocks[stock], 'test': d_test_stocks[stock], }
    res_validation = model.validate_model(
        fit_best_svm_classifier(df_best_svm, d_stocks[stock], stock=stock), 
        d_stocks[stock][['queue_imbalance']], d_stocks[stock]['mid_price_indicator'])
    res = get_scores_dict_for_data(functions_to_run, dfs, log_clf, stock)
    res = {**res, **res_validation}
    scores.append(res)
df_scores = pd.DataFrame(scores, index=stocks)

In [None]:
def convert_scores(df, column):
    scores = []
    for i, row in df.iterrows():
        scores.append(np.mean(row[column]))
    return scores
scores_columns = ['f1', 'kappa', 'matthews', 'precision', 'recall', 'roc_auc', 'train_f1', 'train_kappa',
       'train_matthews', 'train_precision', 'train_recall', 'train_roc_auc']

for col in scores_columns:
    df_scores[col] = convert_scores(df_scores, col)
df_scores

In [None]:
print(df_scores[['train_matthews',  'matthews', 'test_matthews', 'train_roc_auc',  'roc_auc', 'test_roc_auc']].to_latex())

In [None]:
df_scores[['train_precision', 'test_precision', 'train_recall', 'test_recall']]

In [None]:
df_scores[['train_f1', 'test_f1_score', 'train_roc_auc', 'test_roc_auc']]

In [None]:
f, ax = plt.subplots(1, 3, figsize=(27,6))
for i in range(len(stocks)):
    s = stocks[i]
    d_stocks[s]['Predicition of Mid Price Indicator'] = d_stocks[s]['pred']
    d_stocks[s]['Mid Price Indicator'] = d_stocks[s]['mid_price_indicator']
    d_stocks[s][['Predicition of Mid Price Indicator', 'Mid Price Indicator']].plot(
        kind='kde', ax=ax[i])
    ax[i].set_title('Density of Mid Price Indicator and its prediction {} on training data'.format(s))
    ax[i].legend(loc='lower right')
if should_save_fig:
    print('Saving')
    plt.savefig('density_of_mid_price_and_prediction_training_data_svm.png')

In [None]:
for s in stocks:
    d_stocks[s]['queue_imbalance'].plot(kind='kde')
    d_stocks[s]['mid_price_indicator'].plot(kind='kde')

In [None]:
df_scores[['train_matthews', 'matthews', 'test_matthews']]

In [None]:
print('all 3')
df_scores[['train_roc_auc', 'roc_auc', 'test_roc_auc']]

In [None]:
print(df_scores[['train_matthews', 'matthews', 'test_matthews', 
                 'train_roc_auc', 'roc_auc', 'test_roc_auc']].to_latex())

In [None]:
print(df_scores[['f1', 'test_f1_score', 'precision', 'test_precision', 'recall', 'test_recall']].to_latex())

In [None]:
f, ax = plt.subplots(1, 3, figsize=(27,6))
for i in range(len(stocks)):
    stock = stocks[i]
    d_stocks[stock]['Predicition of Mid Price Indicator'] = d_stocks[stock]['pred']
    d_stocks[stock]['Mid Price Indicator'] = d_stocks[stock]['mid_price_indicator']
    sns.heatmap(metrics.confusion_matrix(d_stocks[stock]['mid_price_indicator'], d_stocks[stock]['pred']), 
               ax=ax[i], annot=True, fmt='d', xticklabels=['Negative', 'Positive'], 
                yticklabels=['Negative', 'Positive'])
    
    ax[i].set_title('{}'.format(stock))
    ax[i].set_ylabel('True Mid Price Indicator')
    ax[i].set_xlabel('Predicted Mid Price Indicator')
plt.tight_layout()
if should_save_fig:
    print('Saving figure')
    plt.savefig('svm_confusion_matrix.png')

In [None]:
df_scores.to_csv('res_overview_all_three_svm.csv')

In [None]:
df_best_svm

In [None]:
print('Pivot values')
for i in range(len(stocks)):
    stock = stocks[i]
    df = d_stocks[stock]
    print(np.mean([np.min(df[df['pred'] == 1]['queue_imbalance']), 
                  np.max(df[df['pred'] == 0]['queue_imbalance'])]))

In [None]:
f, ax = plt.subplots(3, 1, figsize=(35, 15), sharex=True)
i = 0

for i in range(len(stocks)):
    s = stocks[i]
    df = d_stocks[s]
    X = d_stocks[s][['queue_imbalance']].values
    y = d_stocks[s]['mid_price_indicator'].values.astype(np.integer)

    clf = fit_best_svm_classifier(df_best_svm, d_stocks[s], stock=s)
    plot_decision_regions(X[0:900], y[0:900], clf=clf,ax=ax[i], colors=','.join(['orange', 'blue']))
    ax[i].set_xlabel('Queue Imbalance')
    ax[i].set_title('SVM Decision Regions for {} on training data'.format(s))
    ax[i].set_xlim(-1.01, 1.01)
plt.tight_layout()
if should_save_fig:
    print('Saving')
    plt.savefig('svm_decision_region.png')