In [120]:
from pairmaker import select_pairs, calculate_metrics, plot_pairs
import numpy as np
import pandas as pd
import yfinance as yf

In [121]:
# pair lists and price histories for the train (13_16) and test (16_19) files
pairs_train = pd.read_csv("pairs13_16.csv")
pairs_test = pd.read_csv("pairs16_19.csv")
prices_train = pd.read_csv('vbr13_16.csv', index_col='Date')
prices_test = pd.read_csv('vbr16_19.csv', index_col='Date')


# cumulative log returns shifted by +1; diff() leaves an all-NaN first row,
# which dropna(how='all') removes
cumret_train = (
    np.log(prices_train)
    .diff()
    .cumsum()
    .add(1)
    .dropna(how='all')
    .dropna(axis=1)  # train only: discard every column that still contains a NaN
)
cumret_test = (
    np.log(prices_test)
    .diff()
    .cumsum()
    .add(1)
    .dropna(how='all')
)

# rescale by the first remaining row so that every series starts at 1
cumret_train = cumret_train.div(cumret_train.iloc[0])
cumret_test = cumret_test.div(cumret_test.iloc[0])



  result = func(self.values, **kwargs)


In [187]:
# load the four metric tables and index each of them by the "Pair" column
metrics_train_form = pd.read_csv("metric_form_13-16.csv").set_index("Pair")
metrics_train_trade = pd.read_csv("metric_trade_13-16.csv").set_index("Pair")
metrics_test_form = pd.read_csv("metric_form_16-19.csv").set_index("Pair")
metrics_test_trade = pd.read_csv("metric_trade_16-19.csv").set_index("Pair")

In [189]:
# Build the modelling tables: the "form" metrics are the features and the
# zero-crossing count from the "trade" metrics is the target.
# Work on copies: a plain assignment would only alias metrics_*_form, so adding
# the target column would silently modify those source frames as well.
data_train = metrics_train_form.copy()
data_train['Num zero-crossings trade'] = metrics_train_trade['Num zero-crossings']
data_test = metrics_test_form.copy()
data_test['Num zero-crossings trade'] = metrics_test_trade['Num zero-crossings']
# the new column is matched on the "Pair" index and appended as the last column
data_train

Unnamed: 0_level_0,Distance,CADF p-value,ADF p-value,Spread SD,Pearson r,Num zero-crossings,Hurst Exponent,Half-life of mean reversion,% days within historical 2-SD band,Hedge ratio,Num zero-crossings trade
Pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
IEX-GRA,1.632826,0.006148,0.001196,0.059464,0.925808,56,0.378031,17.773619,94.437086,1.444374,0
IEX-ADS,1.007115,0.006580,0.001291,0.036677,0.972442,79,0.357466,16.505057,95.231788,0.743826,2
MOH-SCI,1.819716,0.002281,0.000395,0.066270,0.976015,52,0.304781,12.654336,97.086093,1.373158,9
MOH-FHN,2.578471,0.002251,0.000390,0.093902,0.951233,54,0.356214,14.266102,95.629139,2.089341,4
MOH-POLY,3.428606,0.005746,0.001110,0.124862,0.912004,66,0.338789,14.860935,94.569536,2.704551,0
...,...,...,...,...,...,...,...,...,...,...,...
KRO-MUR,2.774052,0.005237,0.000996,0.101025,0.926225,41,0.440060,20.279975,94.834437,0.972470,8
KRO-SWN,2.681712,0.008362,0.001681,0.097662,0.931234,66,0.426719,21.526618,91.655629,0.554539,14
KRO-HSC,2.518113,0.006980,0.001378,0.091704,0.939630,47,0.394814,18.663369,96.821192,0.875248,4
UBA-IRDM,1.303215,0.008677,0.001748,0.047460,0.769064,66,0.386655,17.538026,96.158940,0.347603,1


In [190]:
# first 10 columns are the features, column 10 (the appended
# 'Num zero-crossings trade') is the target
X_train, y_train = data_train.values[:, :10], data_train.values[:, 10]
X_test, y_test = data_test.values[:, :10], data_test.values[:, 10]

In [191]:


# share of training samples whose target exceeds 6 crossings
np.count_nonzero(y_train > 6) / len(y_train)



0.31547104580812446

In [198]:
from sklearn.preprocessing import Binarizer

# turn the target into 0/1 labels: 1 when the value is above 6, otherwise 0
# (Binarizer learns nothing from the data, so one instance serves both splits)
binarizer = Binarizer(threshold=6)
y_train_bin = binarizer.fit_transform(y_train.reshape(-1, 1))
y_test_bin = binarizer.fit_transform(y_test.reshape(-1, 1))



In [199]:
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC



In [200]:
def top10_accuracy(estimator, X, y, k=10):
    '''
    compute accuracy of the top k predictions (k=10 by default):
    select the k samples with the highest predicted probability of
    belonging to class 1 and return the fraction of them that actually
    belong to class 1

    estimator: fitted classifier exposing predict_proba; column 1 of its
               output is used as the class-1 probability
    X:         samples to score, passed unchanged to predict_proba
    y:         true 0/1 labels, one per sample (flat vector or (n, 1) column)
    k:         number of top-ranked samples to evaluate

    returns a float in [0, 1]; if fewer than k samples are available, all of
    them are evaluated and the fraction is taken over that smaller number
    raises ValueError if k is smaller than 1
    '''
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}')

    # ravel so that flat vectors and (n, 1) columns behave the same and
    # indexing is positional even if y arrives as a pandas Series
    y_true = np.asarray(y).ravel()
    pred_prob = estimator.predict_proba(X)

    # argsort is ascending, so the last k entries are the most confident
    # class-1 predictions; with fewer than k samples the slice keeps them all
    top_ind = np.argsort(pred_prob[:, 1])[-k:]
    if top_ind.size == 0:
        return 0.0

    # averaging over the selected samples (instead of dividing by a fixed 10)
    # keeps the score meaningful for small folds
    return float(np.mean(y_true[top_ind] == 1))


In [201]:
def test_models_class(models, X=X_train, y=y_train_bin.flatten(), n_folds=10, scoring=top10_accuracy, seed=45):
    '''
    run stratified, shuffled k-fold cross-validation for every (name, model)
    pair in models and print the mean and standard deviation of its scores

    models:  iterable of (name, estimator) tuples
    X, y:    features and labels used for cross-validation
    n_folds: number of folds
    scoring: scorer passed on to cross_val_score
    seed:    random_state of the fold splitter

    returns (results, names): the per-fold score arrays and the matching
    model names, in the order the models were given
    '''
    names, results = [], []

    for name, estimator in models:
        # a fresh splitter per model, always configured with the same seed
        splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        fold_scores = cross_val_score(estimator, X, y, cv=splitter, scoring=scoring)
        print(f'{name}: {fold_scores.mean()} ({fold_scores.std()})')
        results.append(fold_scores)
        names.append(name)

    return results, names

In [202]:
# baseline classifiers as (name, estimator) pairs; class weights are balanced
# where the estimator supports it, and SVC needs probability=True so that
# predict_proba is available to the scorer
models = [
    ('LR', LogisticRegression(max_iter=300, class_weight='balanced')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(class_weight='balanced')),
    ('NB', GaussianNB()),
    ('SVM', SVC(class_weight='balanced', probability=True)),
]


In [203]:
# cross-validate every baseline model on the unscaled training features
results, names = test_models_class(models=models, y=y_train_bin.flatten())


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=300).
You might also want to sca

LR: 0.48999999999999994 (0.14456832294800961)
LDA: 0.44000000000000006 (0.11135528725660043)
KNN: 0.5599999999999999 (0.16852299546352714)
CART: 0.42000000000000004 (0.13999999999999999)
NB: 0.48 (0.15999999999999998)
SVM: 0.30000000000000004 (0.1341640786499874)


In [204]:
# wrap every baseline model in a pipeline that Box-Cox transforms the features first
pipelines = []
for name, model in models:
    steps = [('scaler', PowerTransformer(method='box-cox')), (name, model)]
    pipelines.append((f'scaled{name}', Pipeline(steps)))

results, names = test_models_class(pipelines, y=y_train_bin.flatten())


scaledLR: 0.41999999999999993 (0.10770329614269007)
scaledLDA: 0.45 (0.09219544457292887)
scaledKNN: 0.6399999999999999 (0.14966629547095767)
scaledCART: 0.4600000000000001 (0.11999999999999998)
scaledNB: 0.67 (0.12688577540449522)
scaledSVM: 0.61 (0.09433981132056601)


In [205]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# same as the scaled pipelines, with an extra step that keeps only the
# 4 features ranked best by the ANOVA F-test (f_classif)
pipelines = []
for name, model in models:
    steps = [
        ('scaler', PowerTransformer(method='box-cox')),
        ('KBest', SelectKBest(score_func=f_classif, k=4)),
        (name, model),
    ]
    pipelines.append((f'scaled{name}_4ft', Pipeline(steps)))

results, names = test_models_class(pipelines, y=y_train_bin.flatten())


scaledLR_4ft: 0.35 (0.14317821063276354)
scaledLDA_4ft: 0.42000000000000004 (0.132664991614216)
scaledKNN_4ft: 0.7 (0.1341640786499874)
scaledCART_4ft: 0.41000000000000003 (0.12999999999999998)
scaledNB_4ft: 0.67 (0.15524174696260024)
scaledSVM_4ft: 0.5 (0.161245154965971)


In [206]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# ensemble classifiers as (name, estimator) pairs, evaluated on the unscaled features
ensembles = [
    ('ADA', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier(class_weight='balanced')),
    ('ET', ExtraTreesClassifier(class_weight='balanced')),
]

results, names = test_models_class(ensembles, y=y_train_bin.flatten())


ADA: 0.74 (0.10198039027185572)
GBM: 0.7499999999999999 (0.13601470508735444)
RF: 0.77 (0.13453624047073712)
ET: 0.7699999999999999 (0.12688577540449522)


In [207]:
from sklearn.model_selection import GridSearchCV

# candidate values for each extra-trees hyper-parameter
n_estimators = [10, 50, 100, 150]
max_depth = [5, 7, 10, 15]
max_leaf_nodes = [50, 100, 150]
min_samples_leaf = [1, 3, 5, 7, 10, 15]
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_leaf_nodes': max_leaf_nodes,
    'min_samples_leaf': min_samples_leaf,
}

# exhaustive search over the grid, scored with top10_accuracy on the same
# 10-fold stratified split configuration used for model comparison
model = ExtraTreesClassifier(class_weight='balanced')
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=top10_accuracy)
grid_result = grid.fit(X_train, y_train_bin.flatten())


In [208]:
# best mean cross-validated score, then the parameter combination that produced it
print(grid_result.best_score_, grid_result.best_params_, sep='\n')

0.82
{'max_depth': 10, 'max_leaf_nodes': 150, 'min_samples_leaf': 3, 'n_estimators': 150}


In [209]:
from sklearn.linear_model import LogisticRegressionCV

# logistic regression with built-in cross-validated regularisation strength,
# placed behind the Box-Cox scaler and the 4-best-feature selector;
# note that this rebinds `models` to a single-entry list
models = [(
    'scaledLR_4ft_cv',
    Pipeline([
        ('scaler', PowerTransformer(method='box-cox')),
        ('KBest', SelectKBest(score_func=f_classif, k=4)),
        ('LR', LogisticRegressionCV(max_iter=300,
                                    class_weight='balanced',
                                    scoring=top10_accuracy)),
    ]),
)]

results, names = test_models_class(models, y=y_train_bin.flatten())


scaledLR_4ft_cv: 0.31999999999999995 (0.12489995996796797)


In [None]:
# Grid search over C and kernel for an SVC trained on scaled, reduced features.
# Box-Cox scale first, then keep the 4 best features of the *scaled* matrix.
# The selector used to be fit on the raw X_train, which discarded the scaling
# and fed unscaled features to the SVC.
scaler = PowerTransformer(method='box-cox').fit(X_train)
scaledX = scaler.transform(X_train)
scaledX_4ft = SelectKBest(score_func=f_classif, k=4).fit_transform(scaledX, y_train_bin.flatten())
# NOTE: scaler and selector are fit on the whole training set before the folds
# are split, so these CV scores are not directly comparable with the
# pipeline-based scores, where both steps are refit inside every fold.
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=c_values, kernel=kernel_values)
# probability=True is required because top10_accuracy calls predict_proba
model = SVC(class_weight='balanced', probability=True)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=top10_accuracy, cv=kfold)
grid_result = grid.fit(scaledX_4ft, y_train_bin.flatten())


In [None]:
# Grid search over the number of neighbours for KNN on scaled, reduced features.
# Box-Cox scale first, then keep the 4 best features of the *scaled* matrix.
# The selector used to be fit on the raw X_train, which discarded the scaling
# and left the distance-based KNN working on unscaled features.
scaler = PowerTransformer(method='box-cox').fit(X_train)
scaledX = scaler.transform(X_train)
scaledX_4ft = SelectKBest(score_func=f_classif, k=4).fit_transform(scaledX, y_train_bin.flatten())
# NOTE: scaler and selector are fit on the whole training set before the folds
# are split, so these CV scores are not directly comparable with the
# pipeline-based scores, where both steps are refit inside every fold.
# odd neighbour counts from 1 to 21
param_grid = dict(n_neighbors=np.arange(1,22,2))
model = KNeighborsClassifier()
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=45)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=top10_accuracy, cv=kfold)
grid_result = grid.fit(scaledX_4ft, y_train_bin.flatten())


In [None]:
# train ET using the best parameters reported by the ET grid search above
# (max_depth=10, max_leaf_nodes=150, min_samples_leaf=3, n_estimators=150).
# They are hard-coded because grid_result is overwritten by the later SVC/KNN
# searches; the previous values (max_leaf_nodes=50, min_samples_leaf=15,
# n_estimators=10) did not match that result.
# random_state is fixed so the final ranking of test pairs is reproducible.
model = ExtraTreesClassifier(class_weight='balanced', max_depth=10, max_leaf_nodes=150,
                             min_samples_leaf=3, n_estimators=150, random_state=45)
model.fit(X_train, y_train_bin.flatten())



In [None]:
# class probabilities for every test pair
pred_prob = model.predict_proba(X_test)
# rank by class-1 probability, most confident first, and keep the first 10
top10_ind = np.argsort(pred_prob[:, 1])[::-1][:10]
# show the metrics of those pairs
data_test.iloc[top10_ind, :]

In [None]:


# the 50 test pairs with the highest class-1 probability, most confident first
top50_ind = np.argsort(pred_prob[:, 1])[::-1][:50]
top50_pairs = data_test.iloc[top50_ind, :].index

# walk down the ranking and keep a pair only if neither of its stocks has been
# used by an earlier pick; stop as soon as 5 pairs are collected
selected_pairs = []
selected_stocks = []
for pair in top50_pairs:
    # NOTE(review): parse_pair is not imported or defined in the visible cells;
    # presumably it splits a pair name into its two tickers - confirm where it comes from
    s1, s2 = parse_pair(pair)
    if s1 in selected_stocks or s2 in selected_stocks:
        continue

    selected_stocks.extend([s1, s2])
    selected_pairs.append(pair)
    if len(selected_pairs) == 5:
        break



In [None]:
# plot the selected pairs with the helper imported from pairmaker
# NOTE(review): test_form and test_trade are not defined in any visible cell;
# this call will raise NameError on a fresh kernel unless they are created elsewhere - confirm
plot_pairs(selected_pairs, test_form, test_trade)