In [43]:
%matplotlib inline
import pandas as pd
import numpy as np
import talib

import matplotlib.pyplot as plt

from plotly.subplots import make_subplots

import plotly.express as px

import plotly.graph_objects as go

In [44]:
pd.set_option('display.max_columns', None)

In [45]:
df = pd.read_csv('INTC.csv')

df.Date = pd.to_datetime(df.Date)

df = df[(df.Date >= '2018-01-01')]

df.sort_values(by='Date', inplace=True)

df['sma_15'] = talib.SMA(df['Adj Close'], timeperiod=15)

df['sma_15_diff'] = df['sma_15'].diff()

df['Trend'] = 'no'

start_ = min(df.index) + 20

for d, row in df[df.index > start_].iterrows():
    if sum(df[(df.index <= d) & (df.index > d - 5)].sma_15_diff < 0) == 5 and row['sma_15'] > row['Adj Close']:
        df.loc[d, 'Trend'] = 'down'
    elif sum(df[(df.index <= d) & (df.index > d - 5)].sma_15_diff > 0) == 5 and row['sma_15'] < row['Adj Close']:
        df.loc[d, 'Trend'] = 'up'

df['Trading Signal'] = 0

hold = 0

for d, row in df[df.index > start_].iterrows():
    try:
        min_ = min(df[(df.index >= d) & (df.index < d + 3)]['Adj Close'])
        max_ = max(df[(df.index >= d) & (df.index < d + 3)]['Adj Close'])
    except:
        print("Can't look ahead")
        break
    if max_ != min_:
        if row['Trend'] == 'up':
            hold = 0.5
            df.loc[d, 'Trading Signal'] = (row['Adj Close'] - min_)*0.5/(max_ - min_) + hold

        elif row['Trend'] == 'down':
            hold = 0
            df.loc[d, 'Trading Signal'] = (row['Adj Close'] - min_)*0.5/(max_ - min_) + hold
        else:
            df.loc[d, 'Trading Signal'] = (row['Adj Close'] - min_)*0.5/(max_ - min_) + hold

In [46]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sma_15,sma_15_diff,Trend,Trading Signal
10219,2020-09-24,48.529999,49.639999,48.419998,49.16,49.16,29343400,49.658667,-0.206,no,0.5
10220,2020-09-25,48.959999,50.279999,48.75,49.939999,49.939999,26633400,49.628667,-0.03,no,0.5
10221,2020-09-28,50.509998,51.459999,50.16,51.43,51.43,29652200,49.718666,0.09,no,0.703391
10222,2020-09-29,51.310001,51.740002,50.950001,51.189999,51.189999,19558200,49.870666,0.152,no,0.5
10223,2020-09-30,51.240002,52.380001,51.150002,51.779999,51.779999,27711300,50.014666,0.144,no,0.0


In [47]:
fig = go.Figure()

# fig = px.scatter(df, x="Date", y="Adj Close", color="Trading Signal",
#                  title="Numeric 'size' values mean continous color")

# fig1 = go.Candlestick(x=df['Date'],
#                 open=df['Open'],
#                 high=df['High'],
#                 low=df['Low'],
#                 close=df['Close'])

# Add traces
fig.add_trace(go.Scatter(x=df['Date'], y=df['Adj Close'],
                    mode='lines',
                    name='markers'))

# fig.add_trace(fig1)

fig2 = go.Scatter(
    x=df['Date'], y=df['Adj Close'],
    mode='markers',
    marker=dict(
        size=8,
        color=df["Trading Signal"], #set color equal to a variable
        colorscale='rdylgn', # one of plotly colorscales
        showscale=True, reversescale=True
    ), name='Trading Signal'
)

fig.add_trace(fig2)

fig.update_layout(showlegend=False)

fig.update_xaxes(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = "Price",
        title_standoff = 25)

fig.show()

In [48]:
def assignFactor(df):
                
    df['priceROC'] = (df['Adj Close'] - df['Adj Close'].shift(1))/df['Adj Close'].shift(1)
    df['volumeROC'] = (df['Volume'] - df['Volume'].shift(1))/df['Volume'].shift(1)

    
    df['RSIClose'] = talib.RSI(df['Adj Close'])
    df['RSIVolume'] = talib.RSI(df['Volume'])
    
    df['WILLR'] = talib.WILLR(df.High.values, df.Low.values, df.Close.values)
    
    macd, macdsignal, macdhist = talib.MACD(df['Adj Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    df['macd'] = macd
    df['macdsignal'] = macdsignal
    
    slowk, slowd = talib.STOCH(df.High.values, df.Low.values, df.Close.values, 5, 3, 0, 3, 0)

    df['slowk'], df['slowd'] = slowk, slowd
    
    df['ADX'] = talib.ADX(df['High'], df['Low'], df['Close'], timeperiod=14)
    
    # Volume indicator
    # CMF = n-day Sum of [(((C - L) - (H - C)) / (H - L)) x Vol] / n-day Sum of Vol
    cmf_hold = ( ( df['Close'] - df['Low'] ) - ( df['High'] - df['Close'] ) ) / ( df['High'] - df['Low'] ) * df['Volume']
    df['CMF'] = cmf_hold.rolling(14).sum()/df['Volume'].rolling(14).sum()
    del cmf_hold
    
    df['ChaikinOscillator'] = talib.ADOSC(df['High'], df['Low'], df['Close'], 
                                          df['Volume'], fastperiod=3, slowperiod=10)
    
    df['14DayStDevPrice'] = talib.STDDEV(df['Adj Close'], timeperiod=14, nbdev=1)

    df['upperbandPrice'], df['middlebandPrice'], df['lowerbandPrice'] = talib.BBANDS(df['Adj Close'], 
                                                            timeperiod=14, nbdevup=1, nbdevdn=1, matype=0)
    
    # How many standard deviations is the current price from 80-day moving average
    df['offBy14DayStDevPrice'] = ( df['Adj Close'] - df['middlebandPrice'] ) / df['14DayStDevPrice']
    
    df['14DayStDevVolume'] = talib.STDDEV(df['Volume'], timeperiod=14, nbdev=1)

    df['upperbandVolume'], df['middlebandVolume'], df['lowerbandVolume'] = talib.BBANDS(df['Volume'], 
                                                            timeperiod=14, nbdevup=1, nbdevdn=1, matype=0)
    
    # How many standard deviations is the current Volume from 80-day moving average
    df['offBy14DayStDevVolume'] = ( df['Volume'] - df['middlebandVolume'] ) / df['14DayStDevVolume']
    
    df.dropna(inplace=True)
    
    return df


In [49]:
df = assignFactor(df)

In [50]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,sma_15,sma_15_diff,Trend,Trading Signal,priceROC,volumeROC,RSIClose,RSIVolume,WILLR,macd,macdsignal,slowk,slowd,ADX,CMF,ChaikinOscillator,14DayStDevPrice,upperbandPrice,middlebandPrice,lowerbandPrice,offBy14DayStDevPrice,14DayStDevVolume,upperbandVolume,middlebandVolume,lowerbandVolume,offBy14DayStDevVolume
9565,2018-02-20,45.459999,46.709999,45.389999,46.32,43.620476,25512000,42.876189,-0.208635,no,1.0,0.016681,0.187975,52.115881,48.108101,-40.638008,0.42095,0.603686,81.0763,83.178569,17.79553,-0.055685,9881602.0,1.202,43.880995,42.678995,41.476994,0.783262,11940760.0,45758020.0,33817260.0,21876500.0,-0.695538
9566,2018-02-21,46.669998,47.060001,45.919998,45.939999,43.262623,27527500,42.717903,-0.158286,no,0.536271,-0.008204,0.079002,50.411962,48.776146,-39.346843,0.414525,0.565854,69.304998,78.359243,16.676241,-0.086803,926864.3,1.02897,43.5818,42.552831,41.523861,0.689809,11834090.0,44644740.0,32810660.0,20976570.0,-0.446436
9567,2018-02-22,46.139999,46.560001,45.619999,45.799999,43.130775,16946400,42.59136,-0.126543,no,0.5,-0.003048,-0.384383,49.766355,45.466866,-34.60873,0.39425,0.531533,62.863797,71.081698,15.849094,-0.0733,-6147703.0,0.883884,43.333871,42.449987,41.566103,0.770224,12496990.0,44458070.0,31961090.0,19464100.0,-1.201465
9568,2018-02-23,46.34,47.790001,46.310001,47.73,44.948299,26040900,42.616541,0.025181,no,0.5,0.04214,0.536663,57.791081,48.689179,-1.043496,0.518859,0.528998,66.899356,66.35605,15.240155,0.040684,-869049.8,1.083569,43.660754,42.577185,41.493616,2.188245,12564480.0,44338270.0,31773790.0,19209310.0,-0.456278
9569,2018-02-26,48.200001,49.130001,48.099998,49.110001,46.247875,26992300,42.821898,0.205357,no,0.5,0.028913,0.036535,62.414488,49.028494,-0.282087,0.714245,0.566048,79.083156,69.615436,15.480782,0.215157,9624332.0,1.402141,44.308255,42.906114,41.503973,2.383327,11780130.0,42062730.0,30282600.0,18502470.0,-0.279309


# Split into Training, Validation and Testing Set

In [52]:
# ['sma_15', 'macd', 'macdsignal', 'slowk', 'slowd', 'RSI', 'WILLR', 'CMF', 'ChaikinOscillator'] 'macd', 'macdsignal'

# features = ['slowk', 'slowd', 'macd', 'macdsignal', 'RSIClose', 'RSIVolume', 'WILLR', 'CMF', 'priceROC', 'volumeROC', 'ADX', 
#             'offBy14DayStDevPrice', 'offBy14DayStDevVolume', 'ChaikinOscillator']

features = ['RSIClose', 'RSIVolume', 'WILLR', 'CMF', 'priceROC', 'volumeROC', 'ADX', 
            'offBy14DayStDevPrice', 'offBy14DayStDevVolume']

print('Number of features: ', len(features))

df_train = df[(df.Date >= '2019-01-01') & (df.Date < '2020-01-01') ]
y_train = df_train['Trading Signal'].values
df_train = df_train[features]

print(len(df_train))
print(len(y_train))

df_valid = df[(df.Date >= '2020-01-01') & (df.Date < '2020-05-01') ]
y_valid = df_valid['Trading Signal'].values
df_valid = df_valid[features]

print(len(df_valid))
print(len(y_valid))

df_test = df[(df.Date >= '2020-05-01') ]
y_test = df_test['Trading Signal'].values
df_test = df_test[features]

print(len(df_test))
print(len(y_test))

Number of features:  9
252
252
83
83
106
106


## Transform features by scaling each feature in the training set to be between 0 and 1

In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [54]:
scaler.fit(df_train)
X_train = scaler.transform(df_train)

X_valid = scaler.transform(df_valid)

X_test = scaler.transform(df_test)

# Gradient Boosting

In [55]:
import random
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

### First we will use random search cross validation to narrow down the range of combinations of parameters where we need to optimize.

In [56]:
best_mse = 1e99

for i in range(500):
    loss = random.choice(['ls', 'lad', 'huber', 'quantile'])
    learning_rate = np.random.random(1)[0]
    n_estimators = np.random.randint(2, 50)
    criterion = random.choice(['friedman_mse', 'mse', 'mae'])
    min_samples_split = np.random.randint(2, 50)
    min_samples_leaf = np.random.randint(2, 50)
    max_depth = np.random.randint(2, 50)
        
    gbr = GradientBoostingRegressor(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, 
                         criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                        max_depth=max_depth)
    
    gbr.fit(X_train, y_train)
    
    print('Done iteration: ', i)
    
    y_pred = gbr.predict(X_valid)
    mse = mean_squared_error(y_valid, y_pred)
    
    print('mse: ', mse)
    
    if mse < best_mse:
        best_mse = mse
        best_loss= loss
        best_learning_rate = learning_rate
        best_n_estimators = n_estimators
        best_criterion = criterion
        best_min_samples_split = min_samples_split
        best_min_samples_leaf = min_samples_leaf
        best_max_depth = max_depth
        best_gbr = gbr


Done iteration:  0
mse:  0.11438755604434603
Done iteration:  1
mse:  0.09186539815137498
Done iteration:  2
mse:  0.09390049727188676
Done iteration:  3
mse:  0.3156964242109716
Done iteration:  4
mse:  0.3156964242109716
Done iteration:  5
mse:  0.10781309359843692
Done iteration:  6
mse:  0.09599141088208543
Done iteration:  7
mse:  0.09253008470103885
Done iteration:  8
mse:  0.3156964242109716
Done iteration:  9
mse:  0.11049127033704684
Done iteration:  10
mse:  0.12496719754318306
Done iteration:  11
mse:  0.10923074294803829
Done iteration:  12
mse:  0.11073126961370212
Done iteration:  13
mse:  0.10918977640911048
Done iteration:  14
mse:  0.1306106358336951
Done iteration:  15
mse:  0.09564551335930953
Done iteration:  16
mse:  0.10229181932338634
Done iteration:  17
mse:  0.10734223775778531
Done iteration:  18
mse:  0.145900736218211
Done iteration:  19
mse:  0.1548066688147421
Done iteration:  20
mse:  0.3156964242109716
Done iteration:  21
mse:  0.12100039141748949
Done i

Done iteration:  181
mse:  0.10006791408812128
Done iteration:  182
mse:  0.09680826790313803
Done iteration:  183
mse:  0.10002662428613003
Done iteration:  184
mse:  0.09975288547141153
Done iteration:  185
mse:  0.09296014209101852
Done iteration:  186
mse:  0.11749137198797392
Done iteration:  187
mse:  0.10157243913996306
Done iteration:  188
mse:  0.11618390873799293
Done iteration:  189
mse:  0.3156964242109716
Done iteration:  190
mse:  0.10770542001638811
Done iteration:  191
mse:  0.10069491621525815
Done iteration:  192
mse:  0.1366000683465535
Done iteration:  193
mse:  0.3156964242109716
Done iteration:  194
mse:  0.1052457816614093
Done iteration:  195
mse:  0.10654991091149679
Done iteration:  196
mse:  0.09870414524033692
Done iteration:  197
mse:  0.3156964242109716
Done iteration:  198
mse:  0.3156964242109716
Done iteration:  199
mse:  0.10263931149689379
Done iteration:  200
mse:  0.10302757118473699
Done iteration:  201
mse:  0.3156964242109716
Done iteration:  202

Done iteration:  362
mse:  0.10913531248112211
Done iteration:  363
mse:  0.0840329988325934
Done iteration:  364
mse:  0.0928653646749096
Done iteration:  365
mse:  0.11207175790491324
Done iteration:  366
mse:  0.3156964242109716
Done iteration:  367
mse:  0.3156964242109716
Done iteration:  368
mse:  0.3156964242109716
Done iteration:  369
mse:  0.3156964242109716
Done iteration:  370
mse:  0.11530741128423011
Done iteration:  371
mse:  0.3156964242109716
Done iteration:  372
mse:  0.09956949747677235
Done iteration:  373
mse:  0.10000701976055958
Done iteration:  374
mse:  0.11650956254150159
Done iteration:  375
mse:  0.09427986101250461
Done iteration:  376
mse:  0.09780063667528663
Done iteration:  377
mse:  0.08769957170406129
Done iteration:  378
mse:  0.3156964242109716
Done iteration:  379
mse:  0.08741203354018116
Done iteration:  380
mse:  0.11177964300695611
Done iteration:  381
mse:  0.3156964242109716
Done iteration:  382
mse:  0.3156964242109716
Done iteration:  383
ms

In [57]:
best_learning_rate

0.20047471514358128

In [58]:
y_pred = best_gbr.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)

mse

0.0840329988325934

### Grid Search with Cross Validation

In [59]:
params = ['max_depth', 'min_samples_leaf', 'min_samples_split', 
         'n_estimators']

param_grid = {}

for p in params:
    val = eval(f'best_{p}')
    if val > 5:
        param_grid[p] = np.arange(val - 4, val + 6, 2)
    else:
        param_grid[p] = np.arange(2, 6)

print(param_grid)
        
# Use gridsearch to find the best parameters
i=1
# for criterion in param_grid['criterion']:
# for max_features in param_grid['max_features']:
for max_depth in param_grid['max_depth']:
    for min_samples_leaf in param_grid['min_samples_leaf']:
        for min_samples_split in param_grid['min_samples_split']:
            for n_estimators in param_grid['n_estimators']:

                gbr = GradientBoostingRegressor(loss=best_loss, learning_rate=best_learning_rate, 
                    n_estimators=n_estimators, criterion=best_criterion, min_samples_split=min_samples_split, 
                    min_samples_leaf=min_samples_leaf, max_depth=max_depth)

                gbr.fit(X_train, y_train)
                
                print('Done iteration: ', i)

                y_pred = gbr.predict(X_valid)
                mse = mean_squared_error(y_valid, y_pred)

                print('mse: ', mse)

                if mse < best_mse:
                    best_mse = mse
                    best_gbr = gbr
                i+=1

{'max_depth': array([2, 3, 4, 5]), 'min_samples_leaf': array([ 3,  5,  7,  9, 11]), 'min_samples_split': array([11, 13, 15, 17, 19]), 'n_estimators': array([27, 29, 31, 33, 35])}
Done iteration:  1
mse:  0.09199487500227459
Done iteration:  2
mse:  0.09379029760157125
Done iteration:  3
mse:  0.09332451074482036
Done iteration:  4
mse:  0.0991187719575957
Done iteration:  5
mse:  0.09888326828046504
Done iteration:  6
mse:  0.09199487500227459
Done iteration:  7
mse:  0.09379029760157126
Done iteration:  8
mse:  0.09332451074482036
Done iteration:  9
mse:  0.0991187719575957
Done iteration:  10
mse:  0.09888564103379925
Done iteration:  11
mse:  0.09199487500227459
Done iteration:  12
mse:  0.09379029760157125
Done iteration:  13
mse:  0.09332451074482036
Done iteration:  14
mse:  0.0991187719575957
Done iteration:  15
mse:  0.09888564103379925
Done iteration:  16
mse:  0.09199487500227459
Done iteration:  17
mse:  0.09379029760157125
Done iteration:  18
mse:  0.09332451074482036
Done 

Done iteration:  188
mse:  0.08768702118833896
Done iteration:  189
mse:  0.0886269998311964
Done iteration:  190
mse:  0.09017879416931236
Done iteration:  191
mse:  0.08620448672805012
Done iteration:  192
mse:  0.0863529623821234
Done iteration:  193
mse:  0.08734332165289958
Done iteration:  194
mse:  0.08830172986052677
Done iteration:  195
mse:  0.08985891843893756
Done iteration:  196
mse:  0.08894240743688132
Done iteration:  197
mse:  0.08833807441098636
Done iteration:  198
mse:  0.08890635564595302
Done iteration:  199
mse:  0.08910629047203858
Done iteration:  200
mse:  0.0894075702934068
Done iteration:  201
mse:  0.08762625587897845
Done iteration:  202
mse:  0.08847138074920424
Done iteration:  203
mse:  0.08960140123337827
Done iteration:  204
mse:  0.0918373476475505
Done iteration:  205
mse:  0.09218069809313356
Done iteration:  206
mse:  0.08762625587897845
Done iteration:  207
mse:  0.08847138074920424
Done iteration:  208
mse:  0.08960140123337827
Done iteration:  

Done iteration:  372
mse:  0.09361229139740022
Done iteration:  373
mse:  0.09472460514455945
Done iteration:  374
mse:  0.09538205195053324
Done iteration:  375
mse:  0.09467868238090628
Done iteration:  376
mse:  0.09813423442986181
Done iteration:  377
mse:  0.09848753059820968
Done iteration:  378
mse:  0.0979489551030768
Done iteration:  379
mse:  0.09883011807383286
Done iteration:  380
mse:  0.1005717759046161
Done iteration:  381
mse:  0.11295181935659787
Done iteration:  382
mse:  0.11440385007247506
Done iteration:  383
mse:  0.11389919935376923
Done iteration:  384
mse:  0.1133396440943988
Done iteration:  385
mse:  0.1137832905828998
Done iteration:  386
mse:  0.10669139984817771
Done iteration:  387
mse:  0.0952420174118151
Done iteration:  388
mse:  0.09533874696955708
Done iteration:  389
mse:  0.0966304095635458
Done iteration:  390
mse:  0.10609315375655916
Done iteration:  391
mse:  0.09305060327727699
Done iteration:  392
mse:  0.10127285948146911
Done iteration:  39

In [60]:
best_mse

0.08351767905704212

In [61]:
y_pred = best_gbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

mse

0.11994190543777422

In [62]:
best_gbr.fit(X_train, y_train)

y_pred = best_gbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

mse

0.11994190543777421

In [63]:
df_test = df[(df.Date >= '2020-05-01') ]
df_test['pred'] = y_pred

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=df_test['Date'], y=df_test['Adj Close'],
                    mode='lines',
                    name='markers'))

# fig.add_trace(fig1)

fig2 = go.Scatter(
    x=df_test['Date'], y=df_test['Adj Close'],
    mode='markers',
    marker=dict(
        size=8,
        color=df_test["pred"], #set color equal to a variable
        colorscale='rdylgn', # one of plotly colorscales
        showscale=True, reversescale=True, cmax=1, cmin=0
    ), name='Trading Signal'
)

fig.add_trace(fig2)

fig.update_layout(showlegend=False)

fig.update_xaxes(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = "Price",
        title_standoff = 25)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Now that we have optimized parameters for the GBR algorithm, lets retrain the algorithm on the most recent year before the start of the test data.

In [111]:
df_train = df[(df.Date >= '2019-05-01') & (df.Date < '2020-05-01') ]
y_train = df_train['Trading Signal'].values
df_train = df_train[features]

print(len(df_train))
print(len(y_train))

df_test = df[(df.Date >= '2020-05-01') ]
y_test = df_test['Trading Signal'].values
df_test = df_test[features]

print(len(df_test))
print(len(y_test))

253
253
106
106


In [112]:
scaler.fit(df_train)
X_train = scaler.transform(df_train)

X_test = scaler.transform(df_test)

In [113]:
best_gbr.fit(X_train, y_train)

y_pred = best_gbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

mse

0.1311829431391249

In [114]:
df_test = df[(df.Date >= '2020-05-01') ]
df_test['pred'] = y_pred

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=df_test['Date'], y=df_test['Adj Close'],
                    mode='lines',
                    name='markers'))

# fig.add_trace(fig1)

fig2 = go.Scatter(
    x=df_test['Date'], y=df_test['Adj Close'],
    mode='markers',
    marker=dict(
        size=8,
        color=df_test["pred"], #set color equal to a variable
        colorscale='rdylgn', # one of plotly colorscales
        showscale=True, reversescale=True, cmax=1, cmin=0
    ), name='Trading Signal'
)

fig.add_trace(fig2)

fig.update_layout(showlegend=False)

fig.update_xaxes(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = "Price",
        title_standoff = 25)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [115]:
# fig.write_html('final_results.html')

# Cross-Validation

In [116]:
from sklearn.model_selection import GridSearchCV

In [125]:
params = ['max_depth', 'min_samples_leaf', 'min_samples_split', 
         'n_estimators']

param_grid = {}

for p in params:
    val = eval(f'best_{p}')
    if val > 5:
        param_grid[p] = np.arange(val - 4, val + 6, 2)
    else:
        param_grid[p] = np.arange(2, 6)

param_grid['n_estimators'] = np.array([30, 35, 40, 45])
# param_grid['loss'] = ['ls', 'lad', 'huber', 'quantile']
param_grid['loss'] = ['huber']
param_grid['criterion'] = ['friedman_mse', 'mse', 'mae']
param_grid['learning_rate'] = [0.01, 0.1]
print(param_grid)



{'max_depth': array([2, 3, 4, 5]), 'min_samples_leaf': array([ 3,  5,  7,  9, 11]), 'min_samples_split': array([11, 13, 15, 17, 19]), 'n_estimators': array([30, 35, 40, 45]), 'loss': ['huber'], 'criterion': ['friedman_mse', 'mse', 'mae'], 'learning_rate': [0.01, 0.1]}


In [126]:
# Create a classifier object with the classifier and parameter candidates
reg = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=param_grid, n_jobs=-1, cv=5, verbose=2, 
                   scoring='neg_mean_squared_error')

# Train the classifier on data1's feature and target data
reg.fit(X_train, y_train)   

Fitting 5 folds for each of 2400 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 2328 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 3788 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 5568 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 6762 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 7976 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 9354 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 10892 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 11821 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 12000 out of 12000 | elapsed:  5.0min finished

The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-s

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter...
             iid='warn', n_jo

In [127]:
# View the accuracy score
print('Best score for data1:', reg.best_score_) 

Best score for data1: -0.0816051992944665


In [128]:
# Apply the classifier trained using data1 to data2, and view the accuracy score
reg.score(X_train, y_train)  

-0.05208880000819362

In [129]:
# View the best parameters for the model found using grid search
# ['max_depth', 'min_samples_leaf', 'min_samples_split', 'n_estimators']

print('Best max_depth:',reg.best_estimator_.max_depth) 
print('Best min_samples_leaf:',reg.best_estimator_.min_samples_leaf)
print('Best min_samples_split:',reg.best_estimator_.min_samples_split)
print('Best n_estimators:',reg.best_estimator_.n_estimators)
print('Best loss:',reg.best_estimator_.loss)
print('Best learning_rate:',reg.best_estimator_.learning_rate)


Best max_depth: 2
Best min_samples_leaf: 5
Best min_samples_split: 13
Best n_estimators: 35
Best loss: huber
Best learning_rate: 0.1


In [130]:
# Train a new classifier using the best parameters found by the grid search
best_gbr = GradientBoostingRegressor(max_depth=reg.best_estimator_.max_depth, 
                          min_samples_leaf=reg.best_estimator_.min_samples_leaf, 
                          min_samples_split=reg.best_estimator_.min_samples_leaf, 
                          n_estimators=reg.best_estimator_.n_estimators,
                          loss=reg.best_estimator_.loss, 
                          learning_rate=reg.best_estimator_.learning_rate
                         ).fit(X_train, y_train)

In [131]:
best_gbr.fit(X_train, y_train)

y_pred = best_gbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

mse

0.13713981974650635

In [132]:
df_test = df[(df.Date >= '2020-05-01') ]
df_test['pred'] = y_pred

# fig = go.Figure()

fig = make_subplots(rows=2, cols=1, 
                    shared_xaxes=True, 
                    vertical_spacing=0.02)

# Add traces
fig.add_trace(go.Scatter(x=df_test['Date'], y=df_test['Adj Close'],
                    mode='lines',
                    name='markers'))

# fig.add_trace(fig1)

fig2 = go.Scatter(
    x=df_test['Date'], y=df_test['Adj Close'],
    mode='markers',
    marker=dict(
        size=8,
        color=df_test["pred"], #set color equal to a variable
        colorscale='rdylgn', # one of plotly colorscales
        showscale=True, reversescale=True, cmax=1, cmin=0
    ), name='Trading Signal'
)

fig.add_trace(fig2)

fig.add_trace(go.Scatter(x=df_test['Date'], y=df_test['Volume'],
                    mode='lines',
                    name='Volume'), row=2, col=1)

fig.update_layout(showlegend=False)

fig.update_xaxes(
        title_text = "Date",
        title_font = {"size": 20},
        title_standoff = 25)

fig.update_yaxes(
        title_text = "Price",
        title_standoff = 25)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

