## Stock Classification
predict whether the stock price will go up(+1) or go down(-1).



- 3 stock to predict: Apple Inc.(AAPL), JP Morgan Chase & Co.(JPM), Pfizer Inc.(PFE)
- Predictive Model: Linear SVM, RBF SVM, Linear Regression, Logistic Regression, K-NN, Random Forest, Ensemble learning using Bagging Classifier with Decision Tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [2]:
def stock_hist(df,ticker):
    stock = df[df['Ticker']== ticker]
    stock = stock.reset_index()
    stock = stock.drop(columns = "index")
    return stock

In [3]:
def train_test_split(df):
    X = df[['Open','High','Low']]
    y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)

    df_size = df.shape[0]
    n = int(df_size * 0.8)
    X_train, X_test = X[:n], X[n:]
    y_train, y_test = y[:n], y[n:]
    
    return X_train, y_train, X_test, y_test

In [4]:
def prediction(x_train, y_train, x_test, y_test):
    
    svm_rbf_classifier = SVC(kernel='rbf',gamma='scale')
    svm_rbf_model = svm_rbf_classifier.fit(x_train, y_train)

    lr_model=LinearRegression()
    lr_model.fit(x_train,y_train)

    logreg_model=LogisticRegression()
    logreg_model.fit(x_train,y_train)

    knn_model=KNeighborsClassifier()
    knn_model.fit(x_train,y_train)

    rf_model = RandomForestClassifier(random_state=5)
    rf_model.fit(x_train, y_train)

    dt = DecisionTreeClassifier()
    el_model = BaggingClassifier(base_estimator=dt, n_estimators=100, random_state=7)
    train_results = cross_val_score(el_model, x_train, y_train, cv=5)
    test_results = cross_val_score(el_model, x_test, y_test, cv=5)

    # calculate error
    svm_rbf_error = svm_rbf_model.score(x_train, y_train)
    lr_error = lr_model.score(x_train, y_train)
    logreg_error = logreg_model.score(x_train, y_train)
    knn_error = knn_model.score(x_train, y_train)
    rf_train_accuracy = rf_model.score(x_train, y_train)

    svm_rbf_test_error = svm_rbf_model.score(x_test, y_test)
    lr_test_error = lr_model.score(x_test, y_test)
    logreg_test_error = logreg_model.score(x_test, y_test)
    knn_test_error = knn_model.score(x_test, y_test)
    rf_test_accuracy = rf_model.score(x_test, y_test)

    error = [[1-svm_rbf_error, 1-lr_error, 1-logreg_error, 1-knn_error, 1-rf_train_accuracy, 1-train_results.mean()],
             [1-svm_rbf_test_error, 1-lr_test_error, 1-logreg_test_error, 1-knn_test_error, 1-rf_test_accuracy, 1-test_results.mean()]]
    cols = ['RBF SVM','LinReg','LogReg','K-NN','RandForest','EnsembleL']
    index = ['Train Error','Test Error']
    error_df = pd.DataFrame(error, columns = cols, index = index)
    return error_df

#### Load S&P 500 data

In [5]:
df = pd.read_csv("sp500_all_5y.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

### 1. Apple Inc. stock (Ticker: AAPL)

In [6]:
apple = stock_hist(df,"AAPL")
apple

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,AAPL,2015-11-16,25.588095,26.245143,25.500796,26.231359,152426800.0
1,AAPL,2015-11-17,26.401363,26.431230,26.033784,26.118788,110467600.0
2,AAPL,2015-11-18,26.594344,26.991788,26.534612,26.945841,186698800.0
3,AAPL,2015-11-19,27.026249,27.510994,26.824081,27.288149,173183200.0
4,AAPL,2015-11-20,27.384641,27.550051,27.304233,27.407616,137148400.0
...,...,...,...,...,...,...,...
1254,AAPL,2020-11-09,120.500000,121.989998,116.050003,116.320000,154515300.0
1255,AAPL,2020-11-10,115.550003,117.589996,114.129997,115.970001,138023400.0
1256,AAPL,2020-11-11,117.190002,119.629997,116.440002,119.489998,112295000.0
1257,AAPL,2020-11-12,119.620003,120.529999,118.570000,119.209999,103162300.0


In [7]:
X_train, y_train, X_test, y_test = train_test_split(apple)

In [8]:
error_table1 = prediction(X_train, y_train, X_test, y_test)
error_table1



Unnamed: 0,RBF SVM,LinReg,LogReg,K-NN,RandForest,EnsembleL
Train Error,0.463754,0.996503,0.453823,0.298908,0.02284,0.527245
Test Error,0.452381,1.053467,0.472222,0.460317,0.452381,0.501119


### 2. JP Morgan Chase & Co. (Ticker: JPM)

In [9]:
jpm = stock_hist(df,"JPM")
jpm

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,JPM,2015-11-16,56.543356,57.510797,56.292858,57.450333,11041100.0
1,JPM,2015-11-17,57.588538,57.649003,56.914787,57.122093,12354800.0
2,JPM,2015-11-18,57.424418,58.348667,57.329401,58.262283,12911400.0
3,JPM,2015-11-19,58.374583,58.581890,57.908138,58.443687,12839500.0
4,JPM,2015-11-20,58.728727,58.832384,58.201818,58.340027,11209600.0
...,...,...,...,...,...,...,...
1254,JPM,2020-11-09,113.160004,118.900002,110.349998,116.900002,47792400.0
1255,JPM,2020-11-10,117.349998,117.449997,114.690002,116.519997,20539300.0
1256,JPM,2020-11-11,116.889999,116.959999,113.959999,114.779999,14443400.0
1257,JPM,2020-11-12,112.529999,113.959999,112.059998,113.370003,22063000.0


In [10]:
X_train, y_train, X_test, y_test = train_test_split(jpm)

In [11]:
error_table2 = prediction(X_train, y_train, X_test, y_test)
error_table2



Unnamed: 0,RBF SVM,LinReg,LogReg,K-NN,RandForest,EnsembleL
Train Error,0.486594,0.99962,0.486594,0.295929,0.034757,0.513364
Test Error,0.503968,1.000779,0.503968,0.511905,0.52381,0.460078


### 3. Pfizer Inc. stock (Ticker: PFE)

In [12]:
pfe = stock_hist(df,"PFE")
pfe

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume
0,PFE,2015-11-16,27.603836,27.703339,27.238989,27.496040,25637100.0
1,PFE,2015-11-17,27.529212,27.744801,27.255577,27.255577,29692200.0
2,PFE,2015-11-18,27.429705,27.719921,27.305324,27.620419,35963300.0
3,PFE,2015-11-19,27.189238,27.338492,26.492715,26.774641,90712700.0
4,PFE,2015-11-20,27.015110,27.023401,26.600514,26.683432,64599600.0
...,...,...,...,...,...,...,...
1254,PFE,2020-11-09,41.860001,41.990002,38.380001,39.200001,218362300.0
1255,PFE,2020-11-10,40.450001,40.540001,38.459999,38.680000,75988300.0
1256,PFE,2020-11-11,38.880001,40.200001,37.900002,38.500000,55959200.0
1257,PFE,2020-11-12,38.279999,38.340000,37.240002,37.549999,44371800.0


In [13]:
X_train, y_train, X_test, y_test = train_test_split(pfe)

In [14]:
error_table3 = prediction(X_train, y_train, X_test, y_test)
error_table3



Unnamed: 0,RBF SVM,LinReg,LogReg,K-NN,RandForest,EnsembleL
Train Error,0.46574,0.998834,0.480636,0.318769,0.030785,0.501527
Test Error,0.559524,1.024887,0.555556,0.571429,0.472222,0.547854


Reference:    
https://blog.quantinsti.com/random-forest-algorithm-in-python/    
https://www.datacamp.com/community/tutorials/ensemble-learning-python