### Applying different models on SPX data - supervised learning 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn. ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectPercentile
from sklearn import metrics

### SPX Weekly 

In [2]:
df = pd.read_csv('data/spx_weekly.csv')
df.set_index('Date', inplace=True)
df['Direction_num'] = df.Direction.map({'D':0, 'U':1})
df.fillna(0, inplace=True)
# X = df[['Range', 'Volume', 'Unemp Claims', 'Ret Sales', 'Ind Prod', 'NFP']]
X = df[['Range', 'Volume', 'Ret Sales']]
y = df['Direction_num']
df.tail()

Unnamed: 0_level_0,Adj Close,Open,High,Low,Range,Volume,Unemp Claims,Ret Sales,Ind Prod,NFP,Direction,Direction_num
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12/11/2017,2675.810059,2652.189941,2679.629883,2651.469971,28.159912,19343950000,0.0,0.0,0.0,0.0,U,1.0
12/18/2017,2683.340088,2685.919922,2694.969971,2676.110107,18.859864,16007500000,0.0,0.0,0.0,0.0,D,0.0
12/25/2017,2673.610107,2679.090088,2692.120117,2673.610107,18.51001,8767680000,0.0,0.0,0.0,0.0,U,1.0
1/1/2018,2743.149902,2683.72998,2743.449951,2682.360107,61.089844,13827790000,0.0,0.0,0.0,0.0,0,0.0
1/5/2018,2743.149902,2731.330078,2743.449951,2727.919922,15.530029,1998927821,0.0,0.0,0.0,0.0,0,0.0


### SPX Daily

In [3]:
pd.options.mode.chained_assignment = None
df = pd.read_csv('data/spx_daily.csv')
df.set_index('Date', inplace=True)
# df_red = df.iloc[12330:]
df_red = df.iloc[14465:]
df_red['Direction_num'] = df_red.Direction.map({'D':0, 'U':1})
df_red.fillna(0, inplace=True)
X = df_red[['Open', 'High', 'Low', 'Close', 'Volume', 'Range', 'MA21', 'MA50', 'Engulf']]
y = df_red['Direction_num']
df_red.head()
# df_red.describe()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,MA21,MA50,Engulf,Direction,Direction_num
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
7/2/2007,1504.660034,1519.449951,1504.660034,1519.430054,2648990000,14.789917,1513.684286,1509.923398,0.0,U,1
7/3/2007,1519.119995,1526.01001,1519.119995,1524.869995,1560790000,6.890015,1513.002854,1510.802197,0.0,U,1
7/5/2007,1524.859985,1526.569946,1517.719971,1525.400024,2622950000,8.849975,1512.738572,1511.701997,1.0,U,1
7/6/2007,1524.959961,1532.400024,1520.469971,1530.439941,2441520000,11.930053,1513.360474,1512.402395,0.0,U,1
7/9/2007,1530.430054,1534.26001,1527.449951,1531.849976,2715330000,6.810059,1515.319045,1513.154395,0.0,D,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)
# print(y_train.shape)
# print(y_test.shape)

### Dimensionality Reduction - 1

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('tree - Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('tree - Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))
print (X_train.shape)

pca = PCA(n_components=2, whiten='True')
X_dimred = pca.fit(X).transform(X)
pca.explained_variance_

X_train, X_test, y_train, y_test = train_test_split(X_dimred, y, test_size=0.33, random_state=10)

tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('tree - Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('tree - Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))
print (X_train.shape)

tree - Test data accuracy = 49.83%
tree - Training data accuracy = 100.00%
(1774, 9)
tree - Test data accuracy = 50.86%
tree - Training data accuracy = 100.00%
(1774, 2)


### Dimensionality Reduction - 2

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('tree - Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('tree - Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))
print (X_train.shape)

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

# X_train, X_test, y_train, y_test = train_test_split(X_dimred, y, test_size=0.33, random_state=10)

tree = DecisionTreeClassifier()
tree.fit (X_train_selected, y_train)
print ('tree - Test data accuracy = {:.2f}%'.format(tree.score (X_test_selected, y_test)*100))
print ('tree - Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))

tree - Test data accuracy = 50.97%
tree - Training data accuracy = 100.00%
(1774, 9)
tree - Test data accuracy = 48.91%


ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 9 

### All Models

In [7]:
mnb = MultinomialNB()
mnb.fit (X_train, y_train)
print ('MNB - Test data accuracy = {:.2f}%'.format(mnb.score (X_test, y_test)*100))
print ('MNB - Training data accuracy = {:.2f}%'.format(mnb.score (X_train, y_train)*100))
print ('-----------------------------')
svc = SVC()
svc.fit (X_train, y_train)
print ('SVC - Test data accuracy = {:.2f}%'.format(svc.score (X_test, y_test)*100))
print ('SVC - Training data accuracy = {:.2f}%'.format(svc.score (X_train, y_train)*100))
print ('-----------------------------')
rand_forrest = RandomForestClassifier()
rand_forrest.fit (X_train, y_train)
print ('Rand_forerst - Test data accuracy = {:.2f}%'.format(rand_forrest.score (X_test, y_test)*100))
print ('Rand_forerst - Training data accuracy = {:.2f}%'.format(rand_forrest.score (X_train, y_train)*100))
print ('-----------------------------')
logreg = LogisticRegression()
logreg.fit (X_train, y_train)
print ('logreg - Test data accuracy = {:.2f}%'.format(logreg.score (X_test, y_test)*100))
print ('logreg - Training data accuracy = {:.2f}%'.format(logreg.score (X_train, y_train)*100))
print ('-----------------------------')
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit (X_train, y_train)
y_pred_class = knn.predict (X_test)
print ('knn - Test data accuracy = {:.2f}%'.format(knn.score (X_test, y_test)*100))
print ('knn - Training data accuracy = {:.2f}%'.format(knn.score (X_train, y_train)*100))
print ('-----------------------------')
tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('tree - Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('tree - Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))
print ('-----------------------------')
gboost = GradientBoostingClassifier()
gboost.fit (X_train, y_train)
print ('gboost - Test data accuracy = {:.2f}%'.format(gboost.score (X_test, y_test)*100))
print ('gboost - Training data accuracy = {:.2f}%'.format(gboost.score (X_train, y_train)*100))
print ('-----------------------------')
# bg = BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators = 20, oob_score=True)
bg = BaggingClassifier(svc, max_samples= 0.5, max_features = 1.0, n_estimators = 20, oob_score=True)
# bg = BaggingClassifier(knn, max_samples= 0.5, max_features = 2, n_jobs= 2, oob_score=True)
bg.fit(X_train,y_train)
print ('bg - Test data accuracy = {:.2f}%'.format(bg.score (X_test, y_test)*100))
print ('bg - Training data accuracy = {:.2f}%'.format(bg.score (X_train, y_train)*100))
print ('-----------------------------')
adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)
adb.fit(X_train,y_train)
print ('adb - Test data accuracy = {:.2f}%'.format(adb.score (X_test, y_test)*100))
print ('adb - Training data accuracy = {:.2f}%'.format(adb.score (X_train, y_train)*100))
print ('-----------------------------')
mnb = MultinomialNB()
svm = SVC()
# svm = SVC(kernel = 'poly', degree = 2 )
rf = RandomForestClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()
evc = VotingClassifier( estimators= [('svm',svm),('lr',lr)], voting = 'hard')
evc.fit(X_train,y_train)
print ('evc - Test data accuracy = {:.2f}%'.format(evc.score (X_test, y_test)*100))
print ('evc - Training data accuracy = {:.2f}%'.format(evc.score (X_train, y_train)*100))
print ('-----------------------------')
# pca = PCA(n_components=5, whiten=True)
# pca.fit(X_train, y_train)
# print ('pca - Test data accuracy = {:.2f}%'.format(pca.score (X_test, y_test)*100))
# print ('pca - Training data accuracy = {:.2f}%'.format(pca.score (X_train, y_train)*100))

MNB - Test data accuracy = 46.51%
MNB - Training data accuracy = 51.13%
-----------------------------
SVC - Test data accuracy = 54.51%
SVC - Training data accuracy = 99.72%
-----------------------------
Rand_forerst - Test data accuracy = 50.51%
Rand_forerst - Training data accuracy = 97.86%
-----------------------------
logreg - Test data accuracy = 54.51%
logreg - Training data accuracy = 54.00%
-----------------------------
knn - Test data accuracy = 51.77%
knn - Training data accuracy = 100.00%
-----------------------------
tree - Test data accuracy = 51.09%
tree - Training data accuracy = 100.00%
-----------------------------
gboost - Test data accuracy = 48.57%
gboost - Training data accuracy = 75.37%
-----------------------------
bg - Test data accuracy = 54.51%
bg - Training data accuracy = 63.42%
-----------------------------
adb - Test data accuracy = 51.20%
adb - Training data accuracy = 100.00%
-----------------------------


  if diff:


evc - Test data accuracy = 54.51%
evc - Training data accuracy = 99.72%
-----------------------------


  if diff:


### Multinominal Naive Bayes
***Added y_pred_class for prediction***

In [8]:
mnb = MultinomialNB()
mnb.fit (X_train, y_train)
y_pred_class = mnb.predict (X_test)
print ('Training data accuracy = {:.2f}%'.format(metrics.accuracy_score (y_test, y_pred_class)*100))
print ('Test data accuracy = {:.2f}%'.format(mnb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(mnb.score (X_train, y_train)*100))

Training data accuracy = 46.51%
Test data accuracy = 46.51%
Training data accuracy = 51.13%


### Gaussian Naive Bayes

In [9]:
gnb = MultinomialNB()
gnb.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(gnb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(gnb.score (X_train, y_train)*100))

Test data accuracy = 46.51%
Training data accuracy = 51.13%


### Support Vector Machine

In [10]:
svc = SVC()
svc.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(svc.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(svc.score (X_train, y_train)*100))

Test data accuracy = 54.51%
Training data accuracy = 99.72%


### Random Forrest

In [11]:
rand_forrest = RandomForestClassifier()
rand_forrest.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(rand_forrest.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(rand_forrest.score (X_train, y_train)*100))

Test data accuracy = 49.49%
Training data accuracy = 97.97%


### Logistic Regression

In [12]:
logreg = LogisticRegression()
logreg.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(logreg.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(logreg.score (X_train, y_train)*100))

Test data accuracy = 54.51%
Training data accuracy = 54.00%


### KNeighborsClassifier

In [13]:
knn = KNeighborsClassifier()
knn.fit (X_train, y_train)
y_pred_class = knn.predict (X_test)
print ('Test data accuracy = {:.2f}%'.format(knn.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(knn.score (X_train, y_train)*100))

Test data accuracy = 50.51%
Training data accuracy = 68.94%


### Decision Tree

In [14]:
tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))

Test data accuracy = 52.34%
Training data accuracy = 100.00%


### Gradient Boosting

In [15]:
gboost = GradientBoostingClassifier()
gboost.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(gboost.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(gboost.score (X_train, y_train)*100))

Test data accuracy = 48.57%
Training data accuracy = 75.37%


### Bagging

In [16]:
bg = BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators = 20)
bg.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(bg.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(bg.score (X_train, y_train)*100))

Test data accuracy = 49.71%
Training data accuracy = 94.08%


### Boosting - ADA Boost

In [17]:
adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)
adb.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(adb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(adb.score (X_train, y_train)*100))

Test data accuracy = 50.97%
Training data accuracy = 100.00%


### Voting Classifier - Multiple Model Ensemble

In [18]:
mnb = MultinomialNB()
svm = SVC()
# svm = SVC(kernel = 'poly', degree = 2 )
rf = RandomForestClassifier()
lr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()

evc = VotingClassifier( estimators= [('svm',svm),('lr',lr)], voting = 'hard')
evc.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(evc.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(evc.score (X_train, y_train)*100))

  if diff:


Test data accuracy = 54.51%
Training data accuracy = 99.72%


  if diff:
