In [1]:
# The following code backtests some possible "returns" when implementing a strategy using the 
# actual future data to generate signals. We use the convention CrystalBallStrategy to remind 
# us that these strategies are only possible if we had a crystal ball to tell the future. It 
# is our goal to find one that proves to make money and that we can somewhat accurately predict.

from copy import deepcopy
import pandas as pd
import numpy as np

from src.features.build_features import StockTechnicals

from src.models.backtest_strategy import TradeHoldStrategy

In [2]:
# load data using MSFT 
ticker = "MSFT"
all_daily_data = pd.read_csv(f'../data/{ticker}.csv')

In [3]:
# create a feature matrix and some labels using our handy StockTechnicals class
# we will actually only need labels for this decision, as we're using observed data
technicals = StockTechnicals(all_daily_data)
X = technicals.features
y = technicals.price_will_rise()

  dip[i] = 100 * (self._dip[i]/self._trs[i])
  din[i] = 100 * (self._din[i]/self._trs[i])


In [4]:
### BACKTESTING ###
# import a backtesting library
from pyalgotrade.barfeed import yahoofeed
from pyalgotrade.stratanalyzer import returns, trades

In [5]:
# build our backtesing feed using MSFT daily data
base_feed = yahoofeed.Feed()
base_feed.addBarsFromCSV(f"{ticker}", f'../data/{ticker}.csv')

In [6]:
# As a benchmark, we will buy 100 shares of MSFT on day one and hold. 

In [7]:
# buy one share on day one and hold as a benchmark
benchmark_feed = deepcopy(base_feed)
benchmark_trades = np.ones(len(all_daily_data))
benchmark_strat = TradeHoldStrategy(benchmark_feed, f'{ticker}', benchmark_trades)

# performance metrics for strategy evaluation
ret_analyzer = returns.Returns()
benchmark_strat.attachAnalyzer(ret_analyzer)
tradesAnalyzer = trades.Trades()
benchmark_strat.attachAnalyzer(tradesAnalyzer)

In [8]:
# backtest the strategy 
benchmark_strat.run()
bmk_value = round(benchmark_strat.getResult() - 1000000, 2)
print(f"Final portfolio increase: ${bmk_value}")
print(f"Total trades: {tradesAnalyzer.getCount()}")

Final portfolio increase: $13140.0
Total trades: 0


In [9]:
# Now let's backtest a strategy with the training labels we created in StockTechnicals. The 
# strategy is to BUY 100 shares if we predict the price will rise the following day and HOLD 
# until we predict the market will go down the following day, when we SELL 100 shares and 
# wait until we predict another rise.
# Note that there is some loss as we can't trade after-hours in our sim and open prices do not 
# always match closing prices, but it is still very good (results below)

In [10]:
perf_feed = deepcopy(base_feed)
perf_trades = np.concatenate([np.zeros(len(all_daily_data) - len(y)), y])
perf_strat = TradeHoldStrategy(perf_feed, f'{ticker}', perf_trades)

# performance metrics
ret_analyzer = returns.Returns()
perf_strat.attachAnalyzer(ret_analyzer)
tradesAnalyzer = trades.Trades()
perf_strat.attachAnalyzer(tradesAnalyzer)

In [11]:
# backtest the strategy
perf_strat.run()
perf_strat_value = round(perf_strat.getResult() - 1000000, 2)
print(f"Final portfolio increase: ${perf_strat_value}")
perf_pct_improve = round((perf_strat_value / bmk_value - 1) * 100, 1)
print(f"Percentage gain v. buy-and-hold bmk: {perf_pct_improve}%")
print("Total trades: %d" % (tradesAnalyzer.getCount()))

Final portfolio increase: $67320.0
Percentage gain v. buy-and-hold bmk: 412.3%
Total trades: 778


In [12]:
# As we can see, perfect execution of this strategy would improve our returns by over 400%
# However daily stock movement is notoriously difficult to predict, so we'll explore another
# strategy as well.

In [13]:
# Backtest another strategy. This strategy is to to BUY 100 shares if we predict the N-day SMA 
# will be higher than the current stock price in N days, and HOLD until we predict the N-day SMA 
# will be lower than the current stock price in N days, when we SELL 100 shares and wait until 
# we predict another rise.
N = 26

# generate our labels using our StockTechnicals class
y_sma = technicals.future_sma_higher_than_current_price(days=N)

sma_feed = deepcopy(base_feed)
sma_trades = np.concatenate([np.zeros(len(all_daily_data) - len(y_sma)), y_sma])
sma_strat = TradeHoldStrategy(sma_feed, f'{ticker}', sma_trades)

# performance metrics
ret_analyzer = returns.Returns()
sma_strat.attachAnalyzer(ret_analyzer)
tradesAnalyzer = trades.Trades()
sma_strat.attachAnalyzer(tradesAnalyzer)

In [14]:
# run the N-day sma strategy
sma_strat.run()
sma_value = round(sma_strat.getResult() - 1000000, 2)
print(f"Final portfolio increase: ${sma_value}")
sma_pct_improve = round((sma_value / bmk_value - 1) * 100, 1)
print(f"Percentage gain v. buy-and-hold bmk: {sma_pct_improve}%")
print("Total trades: %d" % (tradesAnalyzer.getCount()))

Final portfolio increase: $36330.0
Percentage gain v. buy-and-hold bmk: 176.5%
Total trades: 148


In [15]:
# As we can see with this strategy, the potential returns are not as high, but 
# it's more conservative and likely a bit easier to predict as it's using a more general metrics
# in the moving average rather than daily close data. We will demonstrate this next.

In [16]:
### EYEBALLING FOR FEASIBILITY ###
# Using a naive, unoptimized Logistic Regression model, we will evaluate benchmark peformances for 
# each of the two strategies outlined above.

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [18]:
# TEST NEXT DAY STRATEGY IN LOGISTIC REGRESSION

# set up X_daily and y_daily (y has trailing nan values, so we truncate here) 
y_daily = y[~np.isnan(y)]
X_daily = X[:len(y_daily)]

# preprocess the data
X_train, X_test, y_train, y_test = train_test_split(X_daily, y_daily, random_state=2, stratify=y_daily)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# run a logistic regression
for c in [0.01, 0.1, 1.0, 10, 100]:
    lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
    lr.fit(X_train_std, y_train)
    y_pred = lr.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.5100671140939598
Accuracy score: 0.508724832214765
Accuracy score: 0.5100671140939598
Accuracy score: 0.5208053691275167
Accuracy score: 0.5248322147651007


In [19]:
# As we can see above, a naive LogisticRegression doesn't do any better than simply guessing
# whether the stock will rise the following day

In [20]:
# TEST N-DAY SMA STRATEGY IN LOGISTIC REGRESSION

y_sma = y_sma[~np.isnan(y_sma)]
X_sma = X[:len(y_sma)]

# preprocess the data
X_train, X_test, y_train, y_test = train_test_split(X_sma, y_sma, random_state=2, stratify=y_sma)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# run a logistic regression
for c in [0.01, 0.1, 1.0, 10, 100]:
    lr = LogisticRegression(C=c, random_state=2, solver='liblinear')
    lr.fit(X_train_std, y_train)
    y_pred = lr.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

Accuracy score: 0.6589986468200271
Accuracy score: 0.6901217861975643
Accuracy score: 0.6820027063599459
Accuracy score: 0.6860622462787551
Accuracy score: 0.6833558863328822


In [21]:
# Compared to predicting the one-day return, predicting the 26-day sma rise is much more
# promising. Our _very_ naive Logistic Regression classifer scores at almost 64%. For this
# reason, we will move forward attempting to predict these labels.

In [22]:
# DELETE ME
# JUST A DEMO
from sklearn.neural_network import MLPClassifier

for a in [0.1, 1.0, 10, 100]:
    mlp = MLPClassifier(hidden_layer_sizes=(7, 3), solver='lbfgs', random_state=2, alpha=a, activation='tanh')
    mlp.fit(X_train_std, y_train)
    y_pred = mlp.predict(X_test_std)

    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Accuracy score: 0.7983761840324763
Accuracy score: 0.8051420838971584
Accuracy score: 0.8105548037889039
Accuracy score: 0.6332882273342354
