In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, f_classif

In [2]:
# Silence NumPy's divide-by-zero floating-point warnings (noted as being triggered by sklearn).
# np.seterr returns the previous error settings, which the notebook echoes as this cell's output.
np.seterr(divide="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Path template of the persisted feather files; "{}" is replaced by the data set name.
STORAGE_PATH = "../../persisted_data/feather/{}.feather"


def load_data_set(name):
    """Read the feather file belonging to data set `name`.

    Parameters
    ----------
    name : str
        Data set name; it is inserted into STORAGE_PATH to build the file path.

    Returns
    -------
    The object returned by `pd.read_feather` for that path.
    """
    feather_path = STORAGE_PATH.format(name)
    return pd.read_feather(feather_path)

In [4]:
# Future intervals whose data sets are loaded.
# Intervals 20, 50 and 100 were listed as well but are currently disabled.
LOADED_FUTURE_INTERVALS = (10,)

# maps future interval -> data set
all_stocks_sets = {
    interval: load_data_set("all_stocks_10spy_{}shift_normalized".format(interval))
    for interval in LOADED_FUTURE_INTERVALS
}

In [16]:
def calculate_correlations(data_set, label, feature_transform=lambda feature: feature, label_transform=lambda label: label):
    """Rank the columns of `data_set` by their correlation with a label column.

    Every column of `feature_transform(data_set)` is correlated (via
    `DataFrame.corrwith`) with `np.log(label_transform(data_set[label]))`.
    Columns whose name contains "future", as well as "current_price" and
    "index", are removed from the result.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Samples holding both the feature columns and the label column.
    label : str
        Name of the column the features are correlated with.
    feature_transform : callable, optional
        Applied to the whole `data_set` before correlating (identity by default).
    label_transform : callable, optional
        Applied to the label column before the logarithm is taken
        (identity by default).

    Returns
    -------
    pandas.DataFrame
        One row per remaining column, one column (named 0) holding the
        correlation, sorted by absolute correlation in descending order.

    Notes
    -----
    Side effect: the global pandas display options `display.max_rows` and
    `display.float_format` are changed so that the result is rendered
    completely and with three decimals.
    """
    # calculate the correlations for the data set
    transformed_features = feature_transform(data_set)
    log_label = np.log(label_transform(data_set[label]))
    correlations = transformed_features.corrwith(log_label)

    # filter out the future columns and the bookkeeping columns;
    # errors="ignore" keeps data sets without an "index"/"current_price"
    # column from raising a KeyError
    excluded_columns = [column for column in data_set if "future" in column]
    excluded_columns.extend(["current_price", "index"])
    correlations = correlations.drop(labels=excluded_columns, errors="ignore")

    # show the correlations (NOTE: these options are global to the pandas session)
    pd.set_option("display.max_rows", None)
    pd.options.display.float_format = '{:,.3f}'.format
    ranked = correlations.sort_values(key=pd.Series.abs, ascending=False)
    return pd.DataFrame(ranked)


In [17]:
# rank the features of the future-interval-10 data set by their correlation with the (log) future price
calculate_correlations(all_stocks_sets[10], label="future_price")

Unnamed: 0,0
rate_of_change50,-0.105
rate_of_change20,-0.103
rate_of_change100,-0.086
regression_lower20,-0.057
horizontal_lower20,0.054
sma20,0.048
regression_upper20,0.046
lwma50,0.042
lwma20,0.042
bollinger_upper20_2,0.04


In [19]:
# same ranking against future_volatility50, correlating the absolute feature values
calculate_correlations(all_stocks_sets[10], label="future_volatility50", feature_transform=np.abs)

Unnamed: 0,0
bollinger_position50_2,0.51
bollinger_position100_2,0.363
bollinger_threshold50_2,0.345
bollinger_threshold100_2,0.271
chande20,0.258
horizontal_position50,0.24
chande50,0.208
bollinger_position20_2,0.206
horizontal_threshold100,0.194
macd12_26,0.193


In [6]:
def f_test_selection(samples, number_of_features, future_value="future_price"):
    """Score the feature columns with a univariate f-test and return the best ones.

    Parameters
    ----------
    samples : pandas.DataFrame
        Data set holding the feature columns and the `future_value` column.
    number_of_features : int
        How many of the highest scoring features to return.
    future_value : str, optional
        Name of the target column the features are tested against.

    Returns
    -------
    pandas.Series
        f-test scores indexed by feature name, sorted in descending order and
        cut off after `number_of_features` entries.
    """
    # Only the scores are read below, so the selector must not apply its
    # default k=10, which is invalid for data sets with fewer than 10 feature
    # columns; k="all" disables the selection step.
    selector = SelectKBest(f_regression, k="all")

    # select the wanted columns: everything whose name contains neither
    # "future" nor "current"
    # NOTE(review): any other column (e.g. a row-counter column such as
    # "index", if present) is scored as a feature - confirm this is intended.
    features = samples.loc[:, ~samples.columns.str.contains("future|current", regex=True)]

    # perform the f-test
    fit = selector.fit(features, samples[future_value])

    # construct a series for returning the selected columns with their score
    feature_scores = pd.Series(fit.scores_, index=features.columns)
    # sort the features, best score first
    feature_scores = feature_scores.sort_values(ascending=False)
    # take only the best fits
    return feature_scores.iloc[:number_of_features]

In [7]:
# the 30 highest f-test scores for the data set with future interval 10
selected = f_test_selection(all_stocks_sets[10], number_of_features=30)
print(selected)

macd12_26                 161.225
macd_signal12_26           58.909
cci50                      42.892
horizontal_lower20         26.290
horizontal_lower50         10.363
ma_cross50_200              9.854
horizontal_lower200         7.654
regression_threshold20      5.411
regression_threshold100     5.243
chande100                   5.183
horizontal_lower100         4.873
ma_cross20_50               4.699
cci_threshold50             4.125
lwma10                      3.956
regression_position20       3.277
regression_upper20          3.255
regression_position100      3.161
volatility10                3.040
ema10                       2.691
aaron_oscillator40          2.259
bollinger_lower20_2         2.232
rsi20                       2.206
aaron_oscillator25          2.055
horizontal_upper20          1.979
volatility20                1.768
aaron_up40                  1.688
aaron_oscillator15          1.681
aaron_down40                1.649
sma10                       1.626
aaron_down25  

In [8]:
# Classification variant of the feature selection, run on the data set with future interval 10.
samples = all_stocks_sets[10]

# candidate features: every column whose name contains neither "future" nor "current"
features = samples.loc[:, ~samples.columns.str.contains("future|current", regex=True)]
# class labels via np.sign: -1 / 0 / +1 for a future_price below / equal to / above 1
labels = np.sign(samples["future_price"] - 1)

# fit a selector that keeps the 30 features with the highest f_classif score
f_classifier_selector = SelectKBest(f_classif, k=30).fit(features, labels)

# map the positions of the kept features back to their columns
selected_indices = f_classifier_selector.get_support(indices=True)
selected_features = features.iloc[:, selected_indices]
print(selected_features.columns)

Index(['ma_trend50_200', 'macd12_26', 'macd_signal12_26', 'aaron_up15',
       'aaron_up25', 'aaron_up40', 'bollinger_position20_2',
       'bollinger_position50_2', 'bollinger_position100_2',
       'bollinger_position200_2', 'rsi7', 'rsi14', 'rsi20',
       'horizontal_lower20', 'horizontal_position20', 'horizontal_threshold20',
       'regression_position20', 'horizontal_lower50', 'horizontal_position50',
       'horizontal_threshold50', 'horizontal_lower100',
       'horizontal_position100', 'horizontal_threshold100',
       'regression_position100', 'horizontal_lower200',
       'horizontal_position200', 'horizontal_threshold200',
       'regression_position200', 'chande50', 'chande100'],
      dtype='object')
