# BSE-SENSEX - Bombay Stock Exchange

1. Import Dataset
2. Generate 10 technical indicators
3. Feature Engineering: Scale data, split data into 50% training - 50% hold-out
4. Apply SVM models and evaluate accurate rate

In [2]:
import talib as ta
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import itertools

import warnings
warnings.filterwarnings('ignore')

In [3]:
def min_max_scaling(column):
    min_value = column.min()
    max_value = column.max()
    scaled_column = 2 * (column - min_value) / (max_value - min_value) - 1
    return scaled_column

In [4]:
def technical_indicators_2(df):
    Open = df.open
    high = df.high
    low =  df.low
    close = df.close
    df['1SMA'] =  ta.SMA(close, 10)   
    df['1WMA'] =  ta.WMA(close, 10)  
    df['2CCI'] =  ta.CCI(high, low, close, timeperiod = 10)
    df['2MACD'],_,_ = ta.MACD(close)           
    df['2MOM'] =  ta.MOM(close, timeperiod = 10)
    df['2SLOWK'], df['2SLOWD'] = ta.STOCHF(high, low, close, fastk_period = 10, fastd_period = 1 )
    df['2RSI'] =  ta.RSI(close,9)
    df['2WILLR'] = ta.WILLR(high, low, close, timeperiod = 10)
    return(df)

## 1. Import dataset

In [18]:
filepath = './'
data = pd.read_csv(filepath + 'SENSEX.csv', index_col = False)
data.columns = [x.lower().replace(' ','') for x in data.columns]
data['year'] = data.date.str[-4:].astype(int)
#data.drop('date', axis = 1, inplace = True)
data.dropna(axis  = 0, inplace = True)
data['direction'] = np.where(data['close'].shift(-1) > data['close'], 1, 0)

In [19]:
print('- Number of trading days: ',len(data))
print('- Number of increase days: ', data.direction.sum(), f' equivalent to {data.direction.sum()/len(data):.2f} of dataset')

- Number of trading days:  2497
- Number of increase days:  1356  equivalent to 0.54 of dataset


In [20]:
data.head()

Unnamed: 0,date,open,high,low,close,year,direction
0,1-January-2003,3383.85,3396.8,3381.07,3390.12,2003,0
1,2-January-2003,3399.72,3407.24,3363.11,3365.06,2003,0
2,3-January-2003,3380.44,3386.97,3351.89,3357.54,2003,0
3,6-January-2003,3364.72,3364.72,3330.02,3334.89,2003,0
4,7-January-2003,3350.18,3352.43,3320.42,3330.5,2003,1


## 2. Generate technical indicators

In [21]:
## Generate technical indicators
df = technical_indicators_2(data.copy())
df['4ADOSC'] = (df.high - df.close.shift(1))/(df.high - df.low)
df['2SLOWD'] = df['2SLOWK'].rolling(10).mean()

## Prepare model data
cols = ['1SMA', '1WMA', '2MOM', '2SLOWK', '2SLOWD', '2RSI', '2MACD', '2WILLR', '2CCI', '4ADOSC']
df.dropna(axis  = 0, inplace = True)
df['direction'] = np.where(df['close'].shift(-1) > df['close'], 1, 0)

In [39]:
df[cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1SMA,2464.0,12622.403948,5268.350615,2957.107,7302.92375,14054.776,17253.65375,20649.144
1WMA,2464.0,12632.205648,5267.129094,2952.826727,7320.908545,14056.534,17264.565,20751.636909
2MOM,2464.0,65.331997,674.283428,-4143.39,-200.475,123.38,419.8425,2451.83
2SLOWK,2464.0,60.075804,31.909306,1.100746,30.644378,68.371267,89.248858,100.0
2SLOWD,2464.0,60.062236,23.279765,9.107829,41.140346,63.258869,81.495061,96.424305
2RSI,2464.0,55.831556,16.439297,11.294137,43.473603,56.9202,68.893313,93.820389
2MACD,2464.0,44.868492,252.597712,-1146.294812,-62.395078,78.949022,181.347403,921.159745
2WILLR,2464.0,-39.924196,31.909306,-98.899254,-69.355622,-31.628733,-10.751142,-0.0
2CCI,2464.0,23.126473,104.332764,-247.486551,-69.494101,48.762908,107.563216,333.333333
4ADOSC,2464.0,0.603373,0.535121,-1.828686,0.251579,0.57851,0.96584,3.707986


## 3. Feature Engineering: 
* Scale data to range [-1,1]
* Split 50-50 train-test data for each year and remain the original proportion of incease/decrease directions 

In [24]:
### Normalize and Split data 
cols = ['1SMA', '1WMA', '2MOM', '2SLOWK', '2SLOWD', '2RSI', '2MACD', '2WILLR', '2CCI', '4ADOSC'] 
# add 'year', 'direction'
cols_full = ['year', 'direction', '1SMA', '1WMA', '2MOM', '2SLOWK', '2SLOWD', '2RSI', '2MACD', '2WILLR', '2CCI', '4ADOSC'] 
X = df[cols_full]
## Normalize
X[cols] = X[cols].apply(min_max_scaling)

## Split
X_train = pd.DataFrame(columns = cols)
X_test = pd.DataFrame(columns = cols)
y_train, y_test = pd.Series(), pd.Series()
for year in (X.year.unique()):
    Xyear = X[X.year == year]
    X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(Xyear[cols], Xyear.direction
                                                        , train_size=0.5, stratify=Xyear.direction, random_state=42)
    X_train = pd.concat([X_train, X_train_0])    
    X_test = pd.concat([X_test, X_test_0])    
    y_train = pd.concat([y_train, y_train_0])
    y_test = pd.concat([y_test, y_test_0])

In [32]:
X_train.head()

Unnamed: 0,1SMA,1WMA,2MOM,2SLOWK,2SLOWD,2RSI,2MACD,2WILLR,2CCI,4ADOSC
189,-0.847275,-0.841391,0.323116,0.834014,0.156747,0.300636,0.185239,0.834014,0.224073,-0.113761
200,-0.795387,-0.790101,0.370993,0.904707,0.907311,0.677564,0.278585,0.904707,0.247408,0.130002
108,-0.971807,-0.967327,0.329451,0.896444,0.887702,0.810696,0.175762,0.896444,0.367588,-0.030245
146,-0.918044,-0.913583,0.293995,0.710828,0.101103,0.482025,0.180321,0.710828,0.224789,0.055333
34,-0.964582,-0.964111,0.263733,0.679465,0.128233,0.078367,0.088265,0.679465,0.292553,0.272422


In [33]:
y_train

189     1
200     0
108     0
146     1
34      1
       ..
2320    1
2428    0
2258    0
2349    0
2249    1
Length: 1229, dtype: int64

In [28]:
print('- Number of increase days in train: ', y_train.sum(), f', equivalent to {y_train.sum()/len(y_train):.2f} of dataset')

- Number of increase days in train:  671 , equivalent to 0.55 of dataset


In [27]:
print('- Number of increase days in test: ', y_test.sum(), f', equivalent to {y_test.sum()/len(y_test):.2f} of dataset')

- Number of increase days in test:  671 , equivalent to 0.54 of dataset


## 4. Train model

## SVM Poly kernel

In [36]:
### Model
test_acc_scores, train_acc_scores = [],[]
C = (0.1,1,5,10,100)
degree = (1,2,3,4)
for c in C:
    for d in degree:
        svc = SVC(kernel = 'poly', C = c, degree = d)
        cls = svc.fit(X_train, y_train)
        y_pred = cls.predict(X_test)
        test_acc_scores.append(accuracy_score(y_test, y_pred))
        y_pred = cls.predict(X_train)
        train_acc_scores.append(accuracy_score(y_train, y_pred))

In [37]:
combinations = list(itertools.product(C, degree))
pd.DataFrame({'Parameters C,d':combinations, 'Test Accucary':test_acc_scores, 'Train Accuracy':train_acc_scores})

Unnamed: 0,"Parameters C,d",Test Accucary,Train Accuracy
0,"(0.1, 1)",0.54332,0.545972
1,"(0.1, 2)",0.54332,0.545972
2,"(0.1, 3)",0.545749,0.551668
3,"(0.1, 4)",0.547368,0.557364
4,"(1, 1)",0.54332,0.545972
5,"(1, 2)",0.54332,0.548413
6,"(1, 3)",0.546559,0.569569
7,"(1, 4)",0.548178,0.575264
8,"(5, 1)",0.54332,0.545972
9,"(5, 2)",0.552227,0.57201


In [38]:
print('Test accuracy: ',max(test_acc_scores))

Test accuracy:  0.5522267206477732


## SVM RBF kernel

In [31]:
C = (0.1,1,5,10,100)
gammas = np.arange(0.5,10.1,0.5)
test_acc_scores, train_acc_scores = [],[]
for c in C:
    for gamma in gammas:
        svc = SVC(kernel = 'rbf', C = c, gamma = gamma)
        cls = svc.fit(X_train, y_train)
        y_pred = cls.predict(X_test)
        test_acc_scores.append(accuracy_score(y_test, y_pred))
        
        y_pred = cls.predict(X_train)
        train_acc_scores.append(accuracy_score(y_train, y_pred))

In [34]:
combinations = list(itertools.product(C, gammas))
pd.DataFrame({'Parameters C,gamma':combinations, 'Test Accucary':test_acc_scores, 'Train Accuracy':train_acc_scores})

Unnamed: 0,"Parameters C,gamma",Test Accucary,Train Accuracy
0,"(0.1, 0.5)",0.543320,0.545972
1,"(0.1, 1.0)",0.543320,0.545972
2,"(0.1, 1.5)",0.542510,0.547600
3,"(0.1, 2.0)",0.542510,0.546786
4,"(0.1, 2.5)",0.543320,0.545972
...,...,...,...
95,"(100, 8.0)",0.510121,1.000000
96,"(100, 8.5)",0.516599,1.000000
97,"(100, 9.0)",0.513360,1.000000
98,"(100, 9.5)",0.514980,1.000000


In [35]:
print('Test accuracy: ',max(test_acc_scores))

Test accuracy:  0.5522267206477732
