# Predict market movement using Support Vector Machine

Disclaimer
DO NOT use this as your investment strategy! 

In [40]:
import numpy as np 
import os
from pandas_datareader import data, wb # Datareader library to read external data
import pandas as pd
import plotly.offline as py # Plotly, interactive plots library 
import plotly.graph_objs as go
from sklearn import svm # Scikit-learn, machine learning library in Python
from sklearn.externals import joblib
import warnings

# Suppress every Python warning for the rest of the session (presumably to keep
# the notebook output uncluttered).
warnings.filterwarnings('ignore')

# Enable Plotly's offline rendering inside the notebook.
py.init_notebook_mode(connected=True)

# Api key to connect to Quandl, comes with free account.
# NOTE(review): a credential is committed in source here; it should be rotated
# and supplied through the environment instead. `setdefault` keeps the notebook
# runnable as-is while letting an already exported QUANDL_API_KEY take
# precedence instead of being silently overwritten.
os.environ.setdefault("QUANDL_API_KEY", 'Hke1fVG4oAyCbYqHTAxd')

## Data collection and inspection

In [49]:
# Import the reader module under its own alias inside this cell. The result
# below is bound to the name `data`, which replaces the `data` module imported
# at the top of the notebook; without the alias this cell fails when it is run
# a second time because `data` is then a DataFrame, not the reader module.
from pandas_datareader import data as pdr_data

# Define the instruments to download. We would like to see Apple stock (AAPL).
tickers = ['AAPL']

# We would like all available data from 01/01/2011 until 12/31/2016.
start_date = '2011-01-01'
end_date = '2016-12-31'

# Use pandas_datareader's DataReader to load the desired data from Quandl.
# The result keeps the name `data` because the following cells refer to it.
data = pdr_data.DataReader(tickers, 'quandl', start_date, end_date)

In [51]:
# Display the first five rows of the downloaded frame.
data.head(n=5)

Attributes,Open,High,Low,Close,Volume,ExDividend,SplitRatio,AdjOpen,AdjHigh,AdjLow,AdjClose,AdjVolume
Symbols,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2016-12-30,116.65,117.2,115.43,115.82,30586265.0,0.0,1.0,115.209202,115.752409,114.004271,114.389454,30586265.0
2016-12-29,116.45,117.1095,116.4,116.73,15039519.0,0.0,1.0,115.011672,115.663027,114.96229,115.288214,15039519.0
2016-12-28,117.52,118.0166,116.2,116.76,20905892.0,0.0,1.0,116.068456,116.558923,114.76476,115.317843,20905892.0
2016-12-27,116.52,117.8,116.49,117.26,18296855.0,0.0,1.0,115.080808,116.344998,115.051178,115.811668,18296855.0
2016-12-23,115.59,116.52,115.59,116.52,14249484.0,0.0,1.0,114.162295,115.080808,114.162295,115.080808,14249484.0


## Train the model and predict

In [52]:
# Split the dataset into train and validate sets.
# The downloaded frame is ordered newest-first, so sort it chronologically
# before slicing by position. Otherwise the first 1200 rows would be the most
# recent ones and the model would be trained on the future and "validated" on
# the past.
chronological = data.sort_index()
train_set = chronological.iloc[:1200]
out_sample_test = chronological.iloc[1200:]

In [32]:
def _lagged_returns(frame, lags):
    """Build the log-return / lagged-return table shared by training and prediction.

    Args:
        frame: DataFrame with a date index and an 'AdjClose' column (row order
            does not matter; rows are sorted by index here).
        lags: Number of lagged-return feature columns to create.

    Returns:
        A ``(data, cols)`` tuple. ``data`` is a chronologically sorted DataFrame
        holding the adjusted close, 'returns', 'lag_1' .. 'lag_<lags>' and
        'direction', with rows containing NaN removed. ``cols`` is the list of
        lag column names, in order.
    """
    # shift(k) looks k rows *up*, so rows must be oldest-first for shift(1) to
    # mean "previous day". Without the sort, newest-first input would invert
    # the sign of every return and turn the lag columns into future returns.
    data = pd.DataFrame(frame['AdjClose']).sort_index()

    # Log returns
    data['returns'] = np.log(data / data.shift(1))

    cols = []
    # Create one column per lag holding the return observed `lag` periods earlier
    for lag in range(1, lags + 1):
        col = 'lag_%s' % lag
        data[col] = data['returns'].shift(lag)
        cols.append(col)

    # Direction of the returns: 1 (up), -1 (down), or 0 when the return is exactly zero
    data['direction'] = np.sign(data['returns'])
    # The first `lags` + 1 rows lack a full history and are dropped
    data.dropna(inplace=True)

    return data, cols


def svm_train(train, model_name='AAPL_prices', lags=14):
    """Fit a support vector classifier that predicts the sign of the next return.

    Args:
        train: DataFrame with a date index and an 'AdjClose' column.
        model_name: Base name of the file the fitted model is saved to
            ('<model_name>.sav' in the current working directory).
        lags: Number of preceding return signs used as input features.

    Returns:
        The fitted ``svm.SVC`` model.
    """
    data, cols = _lagged_returns(train, lags)

    # Define Support vector machine model with appropriate settings
    model = svm.SVC(C=100, probability=True, verbose=True)
    # Fit the model, signs of the lagged returns as input, signs of the returns as output
    model.fit(np.sign(data[cols]), data['direction'])

    # Save model
    filename = '%s.sav' % model_name
    joblib.dump(model, filename)

    return model


def svm_predict(model, validate, lags=14):
    """Apply a fitted model to new prices and compute the resulting strategy returns.

    Args:
        model: Object with a ``predict`` method accepting the sign of the
            lagged returns (one row per day, one column per lag).
        validate: DataFrame with a date index and an 'AdjClose' column.
        lags: Number of lagged returns to feed the model; must match the value
            used for training.

    Returns:
        A chronologically sorted DataFrame with the adjusted close, 'returns',
        the lag columns, 'direction', the predicted 'position' and 'strategy'
        (position multiplied by the realised log return of the same day).
    """
    data, cols = _lagged_returns(validate, lags)

    # Use the validation data to make predictions
    prediction = model.predict(np.sign(data[cols]))

    data['position'] = prediction
    data['strategy'] = data['position'] * data['returns']

    return data
       

In [53]:
# The magic (or blackbox)
# Train on the training slice, predict on the held-out slice, and order the
# predictions by date (oldest first) for plotting.
model_3 = svm_train(train=train_set)
predictions = svm_predict(model=model_3, validate=out_sample_test)
results = predictions.sort_index()

[LibSVM]

In [47]:
# Cumulative log returns of the model-driven strategy versus simply holding the stock.
cumulative_strategy = results['strategy'].cumsum()
cumulative_market = results['returns'].cumsum()

strategy_charts = go.Scatter(
    x=results.index,
    y=cumulative_strategy,
    name="Strategy AAPL",
)
market_charts = go.Scatter(
    x=results.index,
    y=cumulative_market,
    name="Market returns AAPL",
)

# Draw both curves in a single interactive figure.
traces = [strategy_charts, market_charts]
py.iplot(traces)

## Summary
The model looks quite decent. It appears to predict the direction of the movement and yields returns better than the market's. That being said, this is only a theoretical approach: we still need to validate the model and take broker fees, slippage, and all the other 157 factors into account in order to determine whether the strategy is profitable.