In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Chart drawing
import plotly as py
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Silence FutureWarning / DeprecationWarning noise (mostly raised through sklearn)
from warnings import simplefilter
for _muted_category in (FutureWarning, DeprecationWarning):
    simplefilter(action='ignore', category=_muted_category)

# Render plotly charts inline while the kernel is running
init_notebook_mode(connected=True)

# Register a template with custom background colours and make it the default
# for every figure created from here on
layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(250,250,250,0.8)',
)
fig = go.Figure(layout=layout)
templated_fig = pio.to_templated(fig)
pio.templates['my_template'] = templated_fig.layout.template
pio.templates.default = 'my_template'

  from pandas import MultiIndex, Int64Index


In [None]:
# Load the sample dataset from the CSV file next to the notebook
csv_path = 'NSEI.csv'
df = pd.read_csv(csv_path)

# Disabled: optional restriction of the data to 2010 and later
# df['Date'] = pd.to_datetime(df['Date'])
# df = df[(df['Date'].dt.year >= 2010)].copy()
# df.index = range(len(df))

df.head()



In [None]:
# Index the frame by a DatetimeIndex built from 'Date'
# (the 'Date' column itself stays in the frame as a regular column)
date_index = pd.DatetimeIndex(df['Date'])
df = df.set_index(date_index)

# Disabled: daily OHLCV resampling of a raw `Data` frame
# from collections import OrderedDict
# data = Data.resample('1d').agg(
#     OrderedDict([
#         ('open', 'first'),
#         ('high', 'max'),
#         ('low', 'min'),
#         ('close', 'last'),
#         ('volume', 'sum'),
#     ])
# )
# data.dropna(inplace=True)
# data['Date'] = data.index

In [None]:
# df = data

In [None]:
# price_shift = previous row's closing price (first row becomes NaN)
previous_close = df['close'].shift(1)
df['price_shift'] = previous_close

In [None]:
# Copy the frame's index into a regular 'timestamp' column
df = df.assign(timestamp=df.index)

In [None]:
# Derive the weekday name (e.g. 'Monday') from each row's timestamp
weekday_names = df['timestamp'].dt.day_name()
df['dayoftheWeek'] = weekday_names

In [None]:
# Preview the first rows after adding price_shift, timestamp and dayoftheWeek
df.head()

In [None]:
# target = 1 when the close is strictly above the previous close (price_shift), else 0.
# Rows where price_shift is NaN compare as False and therefore get 0.
closed_higher = df['close'] > df['price_shift']
df['target'] = closed_higher.astype(int)

In [None]:
# Drop the first row: its price_shift is NaN because there is no earlier close
df = df.iloc[1:]

In [None]:
# Preview the frame after adding the target and removing the first row
df.head()

In [None]:
# Group rows by weekday name and summarise the target per weekday.
# The statistics are computed on the 'target' column only and the resulting
# columns are given flat string names (target_mean, target_count, target_std,
# target_median, target_skew). A dict-of-lists aggregation would instead
# produce two-level columns, which cannot be merged cleanly into the
# single-level `df` (warning on pandas 1.x, MergeError on pandas 2.x).
# NOTE(review): these statistics are computed over the whole dataset, so they
# also see rows that later end up in the validation/test splits (leakage).
grouped_by_user_df = df.groupby('dayoftheWeek')
user_answers_df = (
    grouped_by_user_df['target']
    .agg(['mean', 'count', 'std', 'median', 'skew'])
    .add_prefix('target_')
)

In [None]:
# Attach the per-weekday statistics to every row (left join on the weekday name)

df = pd.merge(df, user_answers_df, how='left', on='dayoftheWeek')

In [None]:
# Display only weekday rows. A row is a weekday when it is neither Saturday
# nor Sunday; joining the two inequality tests with `|` would be True for
# every row and filter nothing.
df[~df['dayoftheWeek'].isin(['Saturday', 'Sunday'])]

In [None]:
# Two stacked panels: OHLC price on top, traded volume underneath
fig = make_subplots(rows=2, cols=1)

price_trace = go.Ohlc(
    x=df['Date'],
    open=df['open'],
    high=df['high'],
    low=df['low'],
    close=df['close'],
    name='Price',
)
volume_trace = go.Scatter(x=df['Date'], y=df['volume'], name='Volume')

fig.add_trace(price_trace, row=1, col=1)
fig.add_trace(volume_trace, row=2, col=1)

# Hide the range slider that Ohlc traces add by default
fig.update(layout_xaxis_rangeslider_visible=False)
fig.show()

In [None]:
# Exponential and simple moving averages of the close, plotted for comparison.
# Every average is shifted down one row, so a row only carries averages of
# closes from earlier rows.
# NOTE(review): ewm(9) passes 9 as `com` (centre of mass), not `span`;
# confirm this is the intended smoothing for a column named EMA_9.
close_prices = df['close']
df['EMA_9'] = close_prices.ewm(9).mean().shift()
for window in (5, 10, 15, 30):
    df[f'SMA_{window}'] = close_prices.rolling(window).mean().shift()

fig = go.Figure()
average_traces = [
    ('EMA_9', 'EMA 9'),
    ('SMA_5', 'SMA 5'),
    ('SMA_10', 'SMA 10'),
    ('SMA_15', 'SMA 15'),
    ('SMA_30', 'SMA 30'),
]
for column, label in average_traces:
    fig.add_trace(go.Scatter(x=df.Date, y=df[column], name=label))
# Faded close price as a visual reference
fig.add_trace(go.Scatter(x=df.Date, y=df.close, name='Close', opacity=0.2))
fig.show()

In [None]:

# Relative Strength Index

# RSI indicator to predict whether a stock is overbought/oversold.
def relative_strength_idx(df, n=14):
    """Compute an RSI series from the ``close`` column using simple rolling means.

    Args:
        df: DataFrame containing a ``close`` column.
        n: Rolling window length used for the average gain and average loss.

    Returns:
        A Series of RSI values. It starts at the second row of ``df`` (the
        first difference is undefined and is skipped), and the first ``n - 1``
        entries are NaN because the rolling window is not yet full.
    """
    # Row-to-row change of the close; the first difference is NaN, so skip it
    changes = df['close'].diff().iloc[1:]
    # Gains keep the positive moves, losses keep the size of the negative moves
    gains = changes.clip(lower=0)
    losses = changes.clip(upper=0).abs()
    average_gain = gains.rolling(n).mean()
    average_loss = losses.rolling(n).mean()
    relative_strength = average_gain / average_loss
    return 100.0 - (100.0 / (1.0 + relative_strength))

# Compute the RSI and replace NaNs in the returned series with 0.
# The returned series does not cover the first row of df, so after index
# alignment that row's RSI is still NaN.
rsi_values = relative_strength_idx(df)
df['RSI'] = rsi_values.fillna(0)

# Line chart of the RSI over time
rsi_trace = go.Scatter(x=df.Date, y=df.RSI, name='RSI')
fig = go.Figure(rsi_trace)
fig.show()

In [None]:
# MACD (moving average convergence/divergence): difference between a 12-span
# and a 26-span EMA of the close, plus a 9-span EMA of that difference as the
# signal line. min_periods leaves NaN until each window has enough rows.
close_prices = df['close']
EMA_12 = close_prices.ewm(span=12, min_periods=12).mean()
EMA_26 = close_prices.ewm(span=26, min_periods=26).mean()
df['MACD'] = EMA_12 - EMA_26
df['MACD_signal'] = df['MACD'].ewm(span=9, min_periods=9).mean()

# Top panel: close with both EMAs; bottom panel: MACD and its signal line
fig = make_subplots(rows=2, cols=1)
top_panel = [(df.close, 'close'), (EMA_12, 'EMA 12'), (EMA_26, 'EMA 26')]
bottom_panel = [(df['MACD'], 'MACD'), (df['MACD_signal'], 'Signal line')]
for values, label in top_panel:
    fig.add_trace(go.Scatter(x=df.Date, y=values, name=label), row=1, col=1)
for values, label in bottom_panel:
    fig.add_trace(go.Scatter(x=df.Date, y=values, name=label), row=2, col=1)
fig.show()

In [None]:
# Shift label column

# To predict the next day's price, the close is shifted up by one row after all
# features for day D_i have been computed. Row D_i then holds features from its
# own timestamp (e.g. RSI_i) together with the close C_{i+1} of the next row.
next_close = df['close'].shift(periods=-1)
df['Close'] = next_close

In [None]:
# Drop invalid samples

# The moving averages and the MACD lines are undefined for the earliest rows,
# and after shifting the label column the last row has no next-day close.
# Both ends are trimmed and the index is renumbered from 0.

# First 33 rows: moving averages / MACD warm-up; last row: no shifted close
trimmed = df.iloc[33:-1]
df = trimmed.reset_index(drop=True)

In [None]:
# Preview the frame after trimming invalid rows and renumbering the index
df.head()

In [None]:
# Recompute price_shift (previous row's close) on the trimmed frame;
# the new first row has no predecessor and becomes NaN
previous_close = df['close'].shift(1)
df['price_shift'] = previous_close

In [None]:
# Preview the frame after recomputing price_shift
df.head()

In [None]:
# Indicator columns for which lagged copies are created in the next cell
ls = [
    "EMA_9",
    "SMA_5",
    "SMA_10",
    "SMA_15",
    "SMA_30",
    "RSI",
    "MACD",
    "MACD_signal",
]

In [None]:
# Add lagged copies of every indicator listed in `ls`:
# column "<name>_<lag>" holds the indicator value from `lag` rows earlier,
# for lag = 1..29. The shift amount must be the lag itself; shifting by a
# constant 1 would make all 29 columns of an indicator identical.
lag_columns = {
    f"{name}_{lag}": df[name].shift(periods=lag)
    for name in ls
    for lag in range(1, 30)
}
# Add all lag columns in one concat instead of inserting them one by one
# (avoids a heavily fragmented frame). Existing columns with the same names
# are dropped first so that re-running the cell does not duplicate them.
df = pd.concat(
    [df.drop(columns=list(lag_columns), errors='ignore'), pd.DataFrame(lag_columns)],
    axis=1,
)

In [None]:
# Discard the first 30 rows (positional slice); presumably these are the rows
# left incomplete by the lag columns and the recomputed price_shift
df = df.iloc[30:]

In [None]:
# Preview the frame after adding the lag columns and dropping the first 30 rows
df.head()

In [None]:
# List every column currently in the frame (features, helper columns and labels)
df.columns.tolist()

In [None]:
# Chronological split into train / validation / test (70% / 15% / 15%).
test_size  = 0.15
valid_size = 0.15

# Split points are row counts, so they are applied positionally with .iloc.
# Label-based .loc would be offset here because the index no longer starts
# at 0 after the earlier row drop.
n_rows = df.shape[0]
test_split_idx  = int(n_rows * (1 - test_size))
valid_split_idx = int(n_rows * (1 - (valid_size + test_size)))

train_df  = df.iloc[:valid_split_idx].copy()
valid_df  = df.iloc[valid_split_idx:test_split_idx].copy()
test_df   = df.iloc[test_split_idx:].copy()

# Every row must land in exactly one split
assert len(train_df) + len(valid_df) + len(test_df) == n_rows

# Visual check of the three consecutive segments
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.Date, y=train_df.Close, name='Training'))
fig.add_trace(go.Scatter(x=valid_df.Date, y=valid_df.Close, name='Validation'))
fig.add_trace(go.Scatter(x=test_df.Date,  y=test_df.Close,  name='Test'))
fig.show()

In [None]:
# Remove columns that must not be used as model features:
# - raw prices/volume, the shifted label 'Close', price_shift and 'Date'
# - 'timestamp' (datetime) and 'dayoftheWeek' (string): neither XGBoost nor
#   AdaBoost can fit on datetime/object columns
drop_cols = ['close','open','high','volume','Close','low','price_shift','Date',
             'timestamp','dayoftheWeek']

# `columns=` keyword instead of a positional axis argument, which pandas 2.0 no
# longer accepts
train_df = train_df.drop(columns=drop_cols)
valid_df = valid_df.drop(columns=drop_cols)
test_df  = test_df.drop(columns=drop_cols)



In [None]:
# Preview the training frame after the non-feature columns were dropped
train_df.head()

In [None]:
# Separate the label ('target') from the feature matrix for each split.
# `columns=` keyword instead of a positional axis argument, which pandas 2.0 no
# longer accepts.
y_train = train_df['target'].copy()
X_train = train_df.drop(columns=['target'])

y_valid = valid_df['target'].copy()
X_valid = valid_df.drop(columns=['target'])

y_test  = test_df['target'].copy()
X_test  = test_df.drop(columns=['target'])

# Summary of the feature columns and their dtypes
X_train.info()

In [None]:
#fine tuning

# Hyper-parameter grid explored exhaustively by the search below
parameters = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.001, 0.005, 0.01, 0.05],
    'max_depth': [8, 10, 12, 15],
    'gamma': [0.001, 0.005, 0.01, 0.02],
    'random_state': [42]
}

# Evaluation sets, reused when the final model is fitted in the next cell
eval_set = [(X_train, y_train), (X_valid, y_valid)]

# Base estimator for the search. `eval_set` and `verbose` are arguments of
# `fit`, not of the constructor, so they are not passed here: the constructor
# would only forward them to the booster as unknown parameters.
model = xgb.XGBClassifier(objective='binary:logistic')

# Find the best parameter combination with cross-validated grid search.
# NOTE(review): the rows are time-ordered; GridSearchCV's default folds are not
# time-aware. Consider cv=TimeSeriesSplit(...) if look-ahead is a concern.
clf = GridSearchCV(model, parameters)
# Fit the search on the training split only
clf.fit(X_train, y_train)

print(f'Best params: {clf.best_params_}')
print(f'Best validation score = {clf.best_score_}')

In [None]:
# Rebuild the classifier with the best grid-search parameters and fit it on the
# training split, tracking metrics on the train/validation evaluation sets
best_model_params = dict(clf.best_params_, objective='binary:logistic')
model = xgb.XGBClassifier(**best_model_params)
model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

In [None]:


# Plot the feature importances of the fitted XGBoost model;
# the trailing ';' suppresses the returned object's repr in the cell output
plot_importance(model);



In [None]:

# Predict class labels for X_test and compare the first five with the truth
y_pred = model.predict(X_test)
true_head = np.array(y_test)[:5]
pred_head = y_pred[:5]
print(f'y_true = {true_head}')
print(f'y_pred = {pred_head}')



In [None]:

from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier


In [None]:


# Accuracy of the tuned XGBoost model on the test set
y_pred = model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(xgb_accuracy))



In [None]:
# Per-class precision / recall / F1 for the current predictions
from sklearn.metrics import classification_report
xgb_report = classification_report(y_test, y_pred, labels=[0, 1])
print(xgb_report)

In [None]:

# Create a classifier and fit the data
# Comparison model: AdaBoost with default settings and a fixed random seed.
# NOTE(review): this rebinds `clf`, which held the GridSearchCV object until now.
clf = AdaBoostClassifier(random_state=1)
clf.fit(X_train, y_train)



In [None]:


# Accuracy of the AdaBoost classifier on the test set
y_pred = clf.predict(X_test)
ada_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(ada_accuracy))



In [None]:
# Per-class precision / recall / F1 for the latest predictions
from sklearn.metrics import classification_report
ada_report = classification_report(y_test, y_pred, labels=[0, 1])
print(ada_report)

In [None]:
# Mean squared error between true and predicted labels; for 0/1 class labels
# this equals the fraction of misclassified rows
mse = mean_squared_error(y_test, y_pred)
print(f'mean_squared_error = {mse}')