In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Mute sklearn warnings
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Show charts when running kernel
init_notebook_mode(connected=True)

# Change default background color for all visualizations
layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(250,250,250,0.8)')
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

  from pandas import MultiIndex, Int64Index


In [2]:
#reading sample dataset
df = pd.read_csv('NIFTY_50__EQ__INDICES__NSE__MINUTE.csv')

# df['Date'] = pd.to_datetime(df['Date'])
# df = df[(df['Date'].dt.year >= 2010)].copy()
# df.index = range(len(df))

df.head()



Unnamed: 0,timestamp,open,high,low,close,volume
0,2017-01-02 09:15:00+05:30,8210.1,8211.7,8189.0,8189.55,0.0
1,2017-01-02 09:16:00+05:30,8188.75,8193.95,8188.75,8189.95,0.0
2,2017-01-02 09:17:00+05:30,8190.15,8190.75,8173.7,8173.7,0.0
3,2017-01-02 09:18:00+05:30,8173.35,8177.55,8169.15,8177.55,0.0
4,2017-01-02 09:19:00+05:30,8177.85,8178.15,8173.45,8174.4,0.0


In [3]:

Data = df.set_index(pd.DatetimeIndex(df['timestamp'])) #converting timestamp column to datetimeindex datatype and setting as index
from collections import OrderedDict
data = Data.resample('1d').agg(
    OrderedDict([
        ('open', 'first'),
        ('high', 'max'),
        ('low', 'min'),
        ('close', 'last'),
        ('volume', 'sum'),
    ]) 
) #a method that can be used to summarize data by date 
data.dropna(inplace=True) #dropping null values
data['Date'] = data.index

In [4]:
df = data

In [5]:
fig = make_subplots(rows=2, cols=1)
#plotting open-high-low-close and scatter plot for volume 
fig.add_trace(go.Ohlc(x=df.Date,
                      open=df.open,
                      high=df.high,
                      low=df.low,
                      close=df.close,
                      name='Price'), row=1, col=1)

fig.add_trace(go.Scatter(x=df.Date, y=df.volume, name='Volume'), row=2, col=1)

fig.update(layout_xaxis_rangeslider_visible=False)
fig.show()

In [6]:
##calculating exponential moving average and simple moving average and plotting them for comparison
df['EMA_9'] = df['close'].ewm(9).mean().shift()
df['SMA_5'] = df['close'].rolling(5).mean().shift()
df['SMA_10'] = df['close'].rolling(10).mean().shift()
df['SMA_15'] = df['close'].rolling(15).mean().shift()
df['SMA_30'] = df['close'].rolling(30).mean().shift()

fig = go.Figure()
fig.add_trace(go.Scatter(x=df.Date, y=df.EMA_9, name='EMA 9'))
fig.add_trace(go.Scatter(x=df.Date, y=df.SMA_5, name='SMA 5'))
fig.add_trace(go.Scatter(x=df.Date, y=df.SMA_10, name='SMA 10'))
fig.add_trace(go.Scatter(x=df.Date, y=df.SMA_15, name='SMA 15'))
fig.add_trace(go.Scatter(x=df.Date, y=df.SMA_30, name='SMA 30'))
fig.add_trace(go.Scatter(x=df.Date, y=df.close, name='Close', opacity=0.2))
fig.show()

In [7]:

# Relative Strength Index

# RSI indicator to predict whether a stock is overbought/oversold.
def relative_strength_idx(df, n=14):
    close = df['close']
    delta = close.diff()#calculating consecutive differences of values of closing price to get delta
    delta = delta[1:]#ignoring row 1 as the value is nan
    pricesUp = delta.copy()
    pricesDown = delta.copy()
    pricesUp[pricesUp < 0] = 0 #marking rise in price with comparison to previous value
    pricesDown[pricesDown > 0] = 0#marking fall in price with comparison to previous value
    rollUp = pricesUp.rolling(n).mean() #calculating rolling avg of priceup as rollup
    rollDown = pricesDown.abs().rolling(n).mean()#calculating rolling avg of pricedown as rolldown
    rs = rollUp / rollDown #calculating relative strength 
    rsi = 100.0 - (100.0 / (1.0 + rs))#calculating relative strength index
    return rsi

df['RSI'] = relative_strength_idx(df).fillna(0) #filling nan values with 0
#plotting scatterplot for relative strength index
fig = go.Figure(go.Scatter(x=df.Date, y=df.RSI, name='RSI'))
fig.show()

In [8]:
 #MACD - moving average convergence diverge - shows relationship between 2 moving averages
EMA_12 = pd.Series(df['close'].ewm(span=12, min_periods=12).mean())
EMA_26 = pd.Series(df['close'].ewm(span=26, min_periods=26).mean())
df['MACD'] = pd.Series(EMA_12 - EMA_26)
df['MACD_signal'] = pd.Series(df.MACD.ewm(span=9, min_periods=9).mean())

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(x=df.Date, y=df.close, name='close'), row=1, col=1)
fig.add_trace(go.Scatter(x=df.Date, y=EMA_12, name='EMA 12'), row=1, col=1)
fig.add_trace(go.Scatter(x=df.Date, y=EMA_26, name='EMA 26'), row=1, col=1)
fig.add_trace(go.Scatter(x=df.Date, y=df['MACD'], name='MACD'), row=2, col=1)
fig.add_trace(go.Scatter(x=df.Date, y=df['MACD_signal'], name='Signal line'), row=2, col=1)
fig.show()

In [9]:
#Shift label column

# Because to predict the next day price, after calculating all features for day Di
# , shift Close price column by -1 rows. After doing that, for day Di we have features from the same timestamp e.g. RSIi, but the price Ci+1 from upcoming day.
df['Close'] = df['close'].shift(-1)

In [10]:
#Drop invalid samples

# Because of calculating moving averages and shifting label column, few rows will have invalid values i.e. we haven't calculated SMA10
# for the first 10 days. Moreover, after shifting Close price column, last row price is equal to 0 which is not true. Removing these samples should help.

df = df.iloc[33:] # Because of moving averages and MACD line
df = df[:-1]      # Because of shifting close price

df.index = range(len(df))

In [11]:
#spliting to train and test and validate
test_size  = 0.15
valid_size = 0.15

test_split_idx  = int(df.shape[0] * (1-test_size))
valid_split_idx = int(df.shape[0] * (1-(valid_size+test_size)))

train_df  = df.loc[:valid_split_idx].copy()
valid_df  = df.loc[valid_split_idx+1:test_split_idx].copy()
test_df   = df.loc[test_split_idx+1:].copy()

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.Date, y=train_df.Close, name='Training'))
fig.add_trace(go.Scatter(x=valid_df.Date, y=valid_df.Close, name='Validation'))
fig.add_trace(go.Scatter(x=test_df.Date,  y=test_df.Close,  name='Test'))
fig.show()

In [12]:
#dropping below listed column from train test and validate columns
drop_cols = ['close']

train_df = train_df.drop(drop_cols, 1)
valid_df = valid_df.drop(drop_cols, 1)
test_df  = test_df.drop(drop_cols, 1)



In [13]:
train_df.head()

Unnamed: 0,open,high,low,volume,Date,EMA_9,SMA_5,SMA_10,SMA_15,SMA_30,RSI,MACD,MACD_signal,Close
0,8883.7,8885.9,8804.75,0.0,2017-02-17 00:00:00+05:30,8689.918887,8777.25,8775.4,8734.933333,8557.245,67.095811,78.507448,83.985408,8884.25
1,8818.55,8885.35,8809.85,0.0,2017-02-20 00:00:00+05:30,8702.985415,8781.83,8783.24,8746.6,8575.46,81.144269,82.490546,83.650472,8903.25
2,8890.75,8920.5,8861.05,0.0,2017-02-21 00:00:00+05:30,8721.577256,8798.27,8792.055,8763.303333,8596.661667,74.310501,86.085217,84.183181,8930.1
3,8931.45,8960.6,8905.5,0.0,2017-02-22 00:00:00+05:30,8740.163243,8822.09,8805.305,8786.416667,8618.951667,75.578533,89.935288,85.418492,8941.2
4,8952.55,8981.9,8928.1,0.0,2017-02-23 00:00:00+05:30,8759.549996,8862.47,8820.735,8800.723333,8640.278333,75.26493,92.756006,86.971365,8896.45


In [14]:
#getting x_train y_train x_test y_test x_valid y_valid
y_train = train_df['Close'].copy()
X_train = train_df.drop(['Close'], 1)

y_valid = valid_df['Close'].copy()
X_valid = valid_df.drop(['Close'], 1)

y_test  = test_df['Close'].copy()
X_test  = test_df.drop(['Close'], 1)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype                                
---  ------       --------------  -----                                
 0   open         579 non-null    float64                              
 1   high         579 non-null    float64                              
 2   low          579 non-null    float64                              
 3   volume       579 non-null    float64                              
 4   Date         579 non-null    datetime64[ns, pytz.FixedOffset(330)]
 5   EMA_9        579 non-null    float64                              
 6   SMA_5        579 non-null    float64                              
 7   SMA_10       579 non-null    float64                              
 8   SMA_15       579 non-null    float64                              
 9   SMA_30       579 non-null    float64                              
 10  RSI          579 non-null 

In [None]:
#fine tunning

parameters = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.001, 0.005, 0.01, 0.05],
    'max_depth': [8, 10, 12, 15],
    'gamma': [0.001, 0.005, 0.01, 0.02],
    'random_state': [42]
}

#defining model
eval_set = [(X_train, y_train), (X_valid, y_valid)]
model = xgb.XGBRegressor(eval_set=eval_set, objective='reg:squarederror', verbose=False)
#finding best parameters using grid search
clf = GridSearchCV(model, parameters)
#fitting model with train test data
clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

In [None]:
#defining model with best parameters using grid search 
model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror')
model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

In [None]:


plot_importance(model);



In [None]:

#predecting values for x_test
y_pred = model.predict(X_test)
print(f'y_true = {np.array(y_test)[:5]}')
print(f'y_pred = {y_pred[:5]}')



In [None]:
#checking mean squared error
print(f'mean_squared_error = {mean_squared_error(y_test, y_pred)}')

In [None]:
#comparing predicted results with actual values
predicted_prices = df.loc[test_split_idx+1:].copy()
predicted_prices['Close'] = y_pred

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(x=df.Date, y=df.Close,
                         name='Truth',
                         marker_color='LightSkyBlue'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=predicted_prices.Close,
                         name='Prediction',
                         marker_color='MediumPurple'), row=1, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_test,
                         name='Truth',
                         marker_color='LightSkyBlue',
                         showlegend=False), row=2, col=1)

fig.add_trace(go.Scatter(x=predicted_prices.Date,
                         y=y_pred,
                         name='Prediction',
                         marker_color='MediumPurple',
                         showlegend=False), row=2, col=1)

fig.show()