### Applying different models on SPX data - supervised learning 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn. ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn import metrics

In [3]:
df = pd.read_csv('data/spx.csv')
df.set_index('Date', inplace=True)
df['Direction_num'] = df.Direction.map({'D':0, 'U':1})
df.fillna(0, inplace=True)
df.tail()

Unnamed: 0_level_0,SPX,Unemp Claims,Ret Sales,Ind Prod,NFP,Direction,Direction_num
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10/30/2017,2587.840088,229000,432041,106.145,147013,U,1.0
11/6/2017,2582.300049,239000,435470,106.3969,147241,D,0.0
11/13/2017,2578.850098,252000,435470,106.3969,147241,D,0.0
11/20/2017,2602.419922,240000,435470,106.3969,147241,U,1.0
11/27/2017,2642.219971,238000,435470,106.3969,147241,U,1.0


In [4]:
# X = df[['SPX','Unemp Claims', 'Ret Sales', 'Ind Prod', 'NFP']]
X = df[['SPX','Ind Prod','NFP']]
y = df['Direction_num']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)

### Multinominal Naive Bayes
***Added y_pred_class for prediction***

In [59]:
mnb = MultinomialNB()
mnb.fit (X_train, y_train)
y_pred_class = mnb.predict (X_test)
print ('Training data accuracy = {:.2f}%'.format(metrics.accuracy_score (y_test, y_pred_class)*100))
print ('Test data accuracy = {:.2f}%'.format(mnb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(mnb.score (X_train, y_train)*100))

Training data accuracy = 50.38%
Test data accuracy = 50.38%
Training data accuracy = 53.20%


### Gaussian Naive Bayes

In [60]:
gnb = MultinomialNB()
gnb.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(gnb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(gnb.score (X_train, y_train)*100))

Test data accuracy = 50.38%
Training data accuracy = 53.20%


### Support Vector Machine

In [62]:
svc = SVC()
svc.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(svc.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(svc.score (X_train, y_train)*100))

Test data accuracy = 60.31%
Training data accuracy = 98.72%


### Random Forrest

In [63]:
rand_forrest = RandomForestClassifier()
rand_forrest.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(rand_forrest.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(rand_forrest.score (X_train, y_train)*100))

Test data accuracy = 54.20%
Training data accuracy = 96.16%


### Logistic Regression

In [64]:
logreg = LogisticRegression()
logreg.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(logreg.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(logreg.score (X_train, y_train)*100))

Test data accuracy = 60.31%
Training data accuracy = 56.78%


### KNeighborsClassifier

In [50]:
knn = KNeighborsClassifier()
knn.fit (X_train, y_train)
y_pred_class = knn.predict (X_test)
print ('Test data accuracy = {:.2f}%'.format(knn.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(knn.score (X_train, y_train)*100))

Test data accuracy = 51.15%
Training data accuracy = 68.54%


### Decision Tree

In [65]:
tree = DecisionTreeClassifier()
tree.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(tree.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(tree.score (X_train, y_train)*100))

Test data accuracy = 51.91%
Training data accuracy = 100.00%


### Gradient Boosting

In [66]:
gboost = GradientBoostingClassifier()
gboost.fit (X_train, y_train)
print ('Test data accuracy = {:.2f}%'.format(gboost.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(gboost.score (X_train, y_train)*100))

Test data accuracy = 54.96%
Training data accuracy = 84.91%


### Bagging

In [67]:
bg = BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators = 20)
bg.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(bg.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(bg.score (X_train, y_train)*100))

Test data accuracy = 56.49%
Training data accuracy = 91.82%


### Boosting - ADA Boost

In [5]:
adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)
adb.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(adb.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(adb.score (X_train, y_train)*100))

Test data accuracy = 52.67%
Training data accuracy = 100.00%


### Voting Classifier - Multiple Model Ensemble

In [None]:
lr = LogisticRegression()
dt = DecisionTreeClassifier()
svm = SVC(kernel = 'poly', degree = 2 )
evc = VotingClassifier( estimators= [('lr',lr),('dt',dt),('svm',svm)], voting = 'hard')
evc.fit(X_train,y_train)
print ('Test data accuracy = {:.2f}%'.format(evc.score (X_test, y_test)*100))
print ('Training data accuracy = {:.2f}%'.format(evc.score (X_train, y_train)*100))