# Downloading S&P 500 price data

In [1]:
import yfinance as yf
import pandas as pd
import os

In [2]:
# Reuse the locally cached price history when it exists; otherwise download
# the full history of the S&P 500 index (^GSPC) through yfinance and cache it
# as CSV so later runs do not need to download again.
if not os.path.exists("sp500.csv"):
    sp500 = yf.Ticker("^GSPC").history(period="max")
    sp500.to_csv("sp500.csv")
else:
    sp500 = pd.read_csv("sp500.csv", index_col=0)

In [3]:
# Parse the index into timezone-aware UTC timestamps.
# The cached CSV stores each date with its local UTC offset, and that offset
# changes with daylight saving (-05:00 vs -04:00). pandas cannot turn strings
# with mixed offsets into a single DatetimeIndex unless utc=True is given, and
# without a real DatetimeIndex the later date-string slice
# (sp500.loc["1990-01-01":]) does not work.
sp500.index = pd.to_datetime(sp500.index, utc=True)

In [4]:
# Display the loaded frame (pandas abbreviates a long frame to its first and last rows).
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1950-01-03 00:00:00-05:00,16.660000,16.660000,16.660000,16.660000,1260000,0,0
1950-01-04 00:00:00-05:00,16.850000,16.850000,16.850000,16.850000,1890000,0,0
1950-01-05 00:00:00-05:00,16.930000,16.930000,16.930000,16.930000,2550000,0,0
1950-01-06 00:00:00-05:00,16.980000,16.980000,16.980000,16.980000,2010000,0,0
1950-01-09 00:00:00-05:00,17.080000,17.080000,17.080000,17.080000,2520000,0,0
...,...,...,...,...,...,...,...
2022-10-17 00:00:00-04:00,3638.649902,3689.729980,3638.649902,3677.949951,4352780000,0,0
2022-10-18 00:00:00-04:00,3746.260010,3762.790039,3686.530029,3719.979980,4483740000,0,0
2022-10-19 00:00:00-04:00,3703.110107,3728.580078,3666.510010,3695.159912,4223800000,0,0
2022-10-20 00:00:00-04:00,3689.050049,3736.000000,3656.439941,3665.780029,4496620000,0,0


# Cleaning and visualizing our stock market data

In [None]:
# Line chart of the closing price, with the frame's index on the x-axis.
sp500.plot(kind="line", y="Close", use_index=True)

In [None]:
# Drop the two columns that are not used as predictors.
# errors="ignore" keeps this cell safe to re-run: a second execution (or a data
# source that never had these columns) is a no-op instead of a KeyError.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

# Setting our target for machine learning

In [None]:
# Next row's closing price, aligned to the current row (the final row has no
# following row, so its value is NaN).
sp500["Tomorrow"] = sp500["Close"].shift(periods=-1)

In [None]:
# Binary label: 1 when the next close is higher than the current close, else 0.
# NOTE: the final row's "Tomorrow" is NaN, and a comparison against NaN is
# False, so that row is labelled 0 even though its outcome is unknown.
sp500["Target"] = sp500["Tomorrow"].gt(sp500["Close"]).astype(int)

In [None]:
# Keep only the rows dated 1990-01-01 or later. The .copy() gives an
# independent frame, so the column assignments made further down do not
# operate on a slice of the original data.
sp500 = sp500.loc[slice("1990-01-01", None)].copy()

In [None]:
# Display the frame after the date filter, now including the Tomorrow and Target columns.
sp500

# Training an initial machine learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Raw daily price/volume columns used as inputs for the first model.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Positional split: the final 100 rows are held out for testing and every
# row before them is used for training (no shuffling).
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

In [None]:
from sklearn.metrics import precision_score

# Predict the held-out rows and wrap the result in a Series that shares the
# test index, then report the precision of the predictions against the labels.
preds = pd.Series(model.predict(test[predictors]), index=test.index)
precision_score(test["Target"], preds)

In [None]:
# Put the actual labels and the predictions side by side and plot both series.
combined = pd.concat([test["Target"], preds], axis="columns")
combined.plot()

# Building a Backtesting System

In [None]:
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and predict the rows of ``test``.

    Parameters
    ----------
    train, test : DataFrame
        Frames that contain the ``predictors`` columns and a ``"Target"`` column.
    predictors : list of str
        Names of the feature columns passed to the model.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    DataFrame
        Indexed like ``test`` with two columns: ``"Target"`` (the actual
        labels) and ``"Predictions"`` (the model output).
    """
    features, labels = train[predictors], train["Target"]
    model.fit(features, labels)

    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [None]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk-forward backtest over ``data``.

    For ``i = start, start + step, ...`` the model is trained on rows
    ``[0, i)`` and used to predict rows ``[i, i + step)`` via ``predict``.
    The per-window results are concatenated in order.

    Parameters
    ----------
    data : DataFrame
        Rows to backtest over; must contain whatever ``predict`` needs.
    model : object
        Estimator handed to ``predict`` for every window.
    predictors : list of str
        Feature column names handed to ``predict``.
    start : int, default 2500
        Number of leading rows used to train the first window.
    step : int, default 250
        Number of rows predicted per window.

    Returns
    -------
    DataFrame
        The concatenated outputs of ``predict`` for every window.

    Raises
    ------
    ValueError
        If ``step`` is not positive, or if ``data`` has no rows at or beyond
        position ``start`` (so no test window can be formed).
    """
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step}")

    n_rows = data.shape[0]
    if n_rows <= start:
        # Without this guard the loop body never runs and pd.concat([]) fails
        # with an unhelpful "No objects to concatenate".
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test "
            f"window; data has only {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    return pd.concat(all_predictions)

In [None]:
# Backtest the model over the whole frame using the raw price/volume predictors.
predictions = backtest(data=sp500, model=model, predictors=predictors)

In [None]:
# Count how many backtested rows received each predicted class.
predictions["Predictions"].value_counts()

In [None]:
# Precision of the backtested predictions against the actual labels.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Share of backtested rows in each actual class, for comparison with the precision above.
predictions["Target"].value_counts() / len(predictions)

# Adding additional predictors to our model

In [None]:
# Rolling-window features. For every horizon (a number of rows) two columns
# are added to sp500 and their names collected in new_predictors:
#   Close_Ratio_<h>: the row's close divided by the mean close of the last h
#                    rows (the window includes the row itself).
#   Trend_<h>:       sum of Target over the h rows *before* the row. The
#                    shift(1) keeps the row's own Target, which is derived
#                    from the next close, out of its own feature.
horizons = [2, 5, 60, 250, 1000]
new_predictors = []

for horizon in horizons:
    # Roll over the one column that is needed instead of the whole frame;
    # rolling the frame would also aggregate every feature column added by
    # earlier iterations only to discard the result.
    rolling_close_mean = sp500["Close"].rolling(horizon).mean()

    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_close_mean

    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [None]:
# Drop rows with a missing value in any column except "Tomorrow"; a row whose
# only gap is "Tomorrow" is kept.
sp500 = sp500.dropna(subset=[col for col in sp500.columns if col != "Tomorrow"])

In [None]:
# Display the frame with the rolling-window feature columns after incomplete rows were dropped.
sp500

# Improving our model

In [None]:
# Model for the rolling-feature backtest; random_state is fixed so repeated
# runs build the same forest.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [None]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and label ``test`` rows by thresholded probability.

    A row is predicted as 1 only when the model's probability for the second
    class column of ``predict_proba`` is at least ``threshold``; otherwise it
    is predicted as 0.

    Parameters
    ----------
    train, test : DataFrame
        Frames that contain the ``predictors`` columns and a ``"Target"`` column.
    predictors : list of str
        Names of the feature columns passed to the model.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    threshold : float, default 0.6
        Minimum probability required to predict 1. The default reproduces the
        previously hard-coded cutoff.

    Returns
    -------
    DataFrame
        Indexed like ``test`` with two columns: ``"Target"`` (the actual
        labels) and ``"Predictions"`` (1.0 or 0.0 as floats).
    """
    model.fit(train[predictors], train["Target"])

    # Column 1 of predict_proba holds the probability of the second class.
    positive_proba = model.predict_proba(test[predictors])[:, 1]

    # Single vectorised comparison; cast to float so the column keeps the
    # same dtype as before (1.0 / 0.0).
    labels = (positive_proba >= threshold).astype(float)

    preds = pd.Series(labels, index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)

In [None]:
# Backtest again, this time using only the rolling-window predictors.
predictions = backtest(data=sp500, model=model, predictors=new_predictors)

In [None]:
# Count how many backtested rows received each predicted class.
predictions["Predictions"].value_counts()

In [None]:
# Precision of the backtested predictions against the actual labels.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

In [None]:
# Share of backtested rows in each actual class, for comparison with the precision above.
predictions["Target"].value_counts() / len(predictions)

In [None]:
# Display the backtest output: the actual Target next to the model's Predictions for each row.
predictions

Recommendation

- 