# Next day prediction 

In [11]:
# ---------------- import
import pandas as pd
import numpy as np
import scipy as sc
from datetime import datetime,timezone
from dateutil import parser
from pprint import pprint

from sklearn import base
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

import sklearn.metrics as metrics


# models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV

In [6]:
# ----------------TimeSeriesSplit Custom
class TimeSeriesSplitCustom(TimeSeriesSplit):
    
    def __init__(self, n_splits,test_n,train_n):
        super(TimeSeriesSplitCustom, self).__init__(n_splits)
        self.test_n = test_n
        self.train_n = train_n

    def split(self, X, y=None, groups=None):
        data_length= len(X) + 1
        n_samples = len(X) + 1
        n_splits = self.n_splits
        n_folds = n_splits + 1
        if n_folds > n_samples:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of samples: {1}.").format(n_folds,
                                                             n_samples))
        indices = np.arange(n_samples)
        test_size = self.test_n 
        test_starts = range(test_size + n_samples % n_folds, n_samples, test_size)
        for test_start in test_starts:
            train_i = indices[:test_start][-self.train_n:]
            test_i =  indices[test_start:test_start + test_size]
            if len(train_i) == self.train_n:
                yield (indices[:test_start][-self.train_n:], indices[test_start:test_start + test_size])

# ----------------DEMO
X = np.arange(1,1000)
t = TimeSeriesSplitCustom(n_splits=3,test_n=6,train_n=6)
for tr, te in t.split(X):
    print(tr, te)

[0 1 2 3 4 5] [ 6  7  8  9 10 11]
[ 6  7  8  9 10 11] [12 13 14 15 16 17]
[12 13 14 15 16 17] [18 19 20 21 22 23]
[18 19 20 21 22 23] [24 25 26 27 28 29]
[24 25 26 27 28 29] [30 31 32 33 34 35]
[30 31 32 33 34 35] [36 37 38 39]


In [331]:
#---------------load data
s = pd.read_csv("stocks_day/stocks_features_FTSE.csv",na_values=['NA'])
s.date = [parser.parse(x) for x in s.date] 

In [332]:
#---------------remove NA data from target
#s=s.sort_values("date",ascending=False).reset_index(drop=True)
s = s[np.isfinite(s.target_pure)]

In [357]:
#------------Transformer training data: cutoff delta stoc movement
class TransformerFilterDelta(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, threshold):
        self.threshold = threshold  
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X=X[abs(X.target_pure) < self.threshold]
        return X
    
#------------Transformer training data: cutoff delta stoc movement
class TransformerFilterDeltaTarget(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, threshold):
        self.threshold = threshold  
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X.ix[abs(X.target_pure) < self.threshold,'target'] = 'noise'
        return X.target

#------------Transformer training n
class TransformerTrainN(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, train_n):
        self.train_n = train_n  
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X[-self.train_n:]
        return X
    
#------------Transformer features select
class TransformerFeatureSelect(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, fselect):
        self.fselect = fselect  
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X=X.filter(regex=("N225|AXJO|GSPC|TODAYOPEN"))
        #X=X.filter(regex=("f."))
        return X

In [445]:
#------------Ridge
pipe1 = Pipeline([
        ('FeatureSelect', TransformerFeatureSelect(fselect=None)),
        #('TrainN', TransformerTrainN(train_n=1000)),
        #('PolyFeatures', PolynomialFeatures()),
        ('StandardScaler', StandardScaler()),
        #('PCA', PCA()),
        ('RC', RidgeClassifier(normalize=False)) 
])

grid_RC = {
           'RC__alpha' : [0,0.25,0.5,0.75,1,2,3,4,5,6,7,8]
            }

#-------------Logistic
pipe2 = Pipeline([
        ('FeatureSelect', TransformerFeatureSelect(fselect=None)),
        ('StandardScaler', StandardScaler()),
        #('PCA', PCA()),
        ('LR', LogisticRegression()) 
])

grid_LR = {
           'LR__penalty' : ['l1','l2'],
           'LR__C' : [0.1,0.3,0.5,0.7,1,2,3],
            'LR__fit_intercept' : [True]
            }

s.shape

(2591, 141)

In [456]:
#---------------create test Set
ts_split = TimeSeriesSplitCustom(n_splits=2,test_n=14,train_n=1500).split(np.arange(len(s)))
backprop = [(train_index, test_index) for train_index, test_index in ts_split ]
len(backprop)

78

In [457]:
TESTDAYS = []
PRED = []
for i in np.arange(0,70):
    
    train_i = backprop[i][0]
    test_i = backprop[i][1]
    train_d = TransformerTrainN(train_n=1400).fit_transform(s.iloc[train_i,])
    test_d = s.iloc[test_i,]
   
    y=TransformerFilterDeltaTarget(threshold=0).fit_transform(train_d.ix[:,['target','target_pure']])
    y=TransformerTrainN(train_n=1400).fit_transform(y)
    
    #cv_time = TimeSeriesSplitCustom(n_splits=20,test_n=50,train_n=1000)
    #pipe1_cv = GridSearchCV(cv=TimeSeriesSplit(n_splits=20), estimator=pipe1, n_jobs=6, param_grid=grid_RC, verbose=0)
    pipe1_cv = GridSearchCV(cv=TimeSeriesSplit(n_splits=20), estimator=pipe2, n_jobs=6,param_grid=grid_LR, verbose=0)
  
    pipe1_cv.fit(X=train_d,y=y)
    pred = pipe1_cv.predict(test_d)
    
    TESTDAYS.extend(np.array(test_d.target))
    PRED.extend(np.array(pred))
    acc = metrics.accuracy_score(TESTDAYS,PRED)
    print(pipe1_cv.best_params_, acc, len(TESTDAYS) )
    print(metrics.classification_report(TESTDAYS,PRED))

{'LR__C': 3, 'LR__fit_intercept': True, 'LR__penalty': 'l2'} 0.571428571429 14
             precision    recall  f1-score   support

  down_stay       0.00      0.00      0.00         3
  up_change       0.73      0.73      0.73        11

avg / total       0.57      0.57      0.57        14

{'LR__C': 0.1, 'LR__fit_intercept': True, 'LR__penalty': 'l1'} 0.678571428571 28
             precision    recall  f1-score   support

  down_stay       0.45      0.62      0.53         8
  up_change       0.82      0.70      0.76        20

avg / total       0.72      0.68      0.69        28

{'LR__C': 0.1, 'LR__fit_intercept': True, 'LR__penalty': 'l1'} 0.690476190476 42
             precision    recall  f1-score   support

  down_stay       0.58      0.69      0.63        16
  up_change       0.78      0.69      0.73        26

avg / total       0.71      0.69      0.69        42

{'LR__C': 0.1, 'LR__fit_intercept': True, 'LR__penalty': 'l1'} 0.625 56
             precision    recall  f1-score

In [458]:
pipe1_cv.predict_proba(test_d)

#cali = CalibratedClassifierCV(pipe1_cv.best_estimator_,cv='prefit').fit(X=train_d,y=y)

array([[ 0.34336289,  0.65663711],
       [ 0.40218346,  0.59781654],
       [ 0.51756036,  0.48243964],
       [ 0.47752618,  0.52247382],
       [ 0.42071967,  0.57928033],
       [ 0.38257718,  0.61742282],
       [ 0.54690485,  0.45309515],
       [ 0.65804945,  0.34195055],
       [ 0.59391744,  0.40608256],
       [ 0.50341161,  0.49658839],
       [ 0.52953768,  0.47046232],
       [ 0.22986657,  0.77013343],
       [ 0.59129094,  0.40870906],
       [ 0.63948276,  0.36051724]])

# Backup Code

In [None]:

# ----------------TimeSeriesSplit Custom
class TimeSeriesSplitCustom(TimeSeriesSplit):
    
    def __init__(self, n_splits,test_n,train_n):
        super(TimeSeriesSplitCustom, self).__init__(n_splits)
        self.test_n = test_n
        self.train_n = train_n

    def split(self, X, y=None, groups=None):
        l = X
        train_n = self.train_n
        test_n = self.test_n
        w_n = train_n+test_n
        stride=test_n
        windows = [[w[:train_n],w[train_n:]] for w in [l[o:o+w_n] for o in [i for i in range(0,len(l)-w_n+1,stride)]]]
        for x in windows:
            yield (x[0],x[1])

# ----------------DEMO
X = np.arange(1,30)
for tr, te in TimeSeriesSplitCustom(n_splits=2,test_n=5,train_n=10).split(X):
    print(tr, te)