In [None]:
# Price prediction model
# Goal: predict whether the S&P 500 will close higher on the next trading day (i.e. be bullish)
# Backtesting timeframe: 20 years
# **libs**
# yfinance (stock data), scikit-learn (RandomForestClassifier, precision_score), pandas


In [None]:
pip install yfinance


In [None]:
import yfinance as yf



In [None]:
# Retrieve the S&P 500 index (ticker "^GSPC") price history from yfinance as a DataFrame.
# period="max" asks for the longest history available.

sp500 = yf.Ticker("^GSPC").history(period="max")


In [None]:
# Show the downloaded frame for a first look at its columns and rows.
sp500

In [None]:
# Inspect the index: these row labels are used later for plotting and for slicing by date.
sp500.index


In [None]:
# Plot the closing price over the whole downloaded history, with the index on the x-axis.
# A title and y-label let the figure stand on its own; the trailing ';' hides the Axes repr.
sp500.plot.line(y="Close", use_index=True, title="S&P 500 close", ylabel="Close");

In [None]:
# Remove the 'Dividends' and 'Stock Splits' columns; they are not used as predictors.
# drop(..., errors="ignore") keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op, whereas `del sp500[...]` would raise KeyError.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [None]:
# Next row's close aligned to the current row; the final row has no next row, so it gets NaN.
sp500 = sp500.assign(Tomorrow=sp500["Close"].shift(-1))

In [None]:
# Display the frame to check the new 'Tomorrow' column.
sp500

In [None]:
# Label each day: 1 if tomorrow's close is greater than today's close, otherwise 0.
# The most recent row has no 'Tomorrow' value yet (NaN from shift(-1)). NaN > Close evaluates
# to False, which would silently label that day as 0, so rows without a known next close are
# dropped before labelling.
sp500 = sp500.dropna(subset=["Tomorrow"])
# The boolean comparison is cast to int so it can be used as the ML classification target.
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)

In [None]:
# Display the frame to check the new 'Target' column.
sp500

In [None]:
# Keep only rows from 1990-01-01 onward; .copy() yields an independent frame so the
# column assignments made later do not write into a slice of the original.
sp500 = sp500.loc['1990-01-01':].copy()

In [None]:
# Display the trimmed frame.
sp500

In [None]:
pip install scikit-learn

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# The rows form a time series, so the split is chronological instead of cross-validated:
# cross-validation on time-series data can lead to inaccurate results (good numbers in
# training, poor results live).
# The most recent 100 rows are held out for testing; everything before them is training data.
test = sp500.iloc[-100:]
train = sp500.iloc[:-100]

# Raw price/volume columns used as model inputs.
predictors = ["Close", "Volume", "Open", "High", "Low"]

model = RandomForestClassifier(
    random_state=1,         # fixed seed so repeated runs give the same forest
    min_samples_split=100,
    n_estimators=100,
)
model.fit(train[predictors], train["Target"])



In [None]:
from sklearn.metrics import precision_score
# Predict the Target for the held-out test rows with the fitted model.
preds = model.predict(test[predictors])


In [None]:
import pandas as pd

In [None]:
# Wrap the predictions in a Series aligned to the test index.
# Naming it "Predictions" (as predict() does) gives the column a readable label instead of 0
# when it is concatenated with the targets and plotted.
preds = pd.Series(preds, index=test.index, name="Predictions")

In [None]:
# Show the predictions for the test rows.
preds

In [None]:
# Precision on the held-out test rows: of the days predicted as 1, the share that really were 1.
precision_score(test["Target"],preds)

In [None]:
# Put the actual targets and the predictions side by side, aligned on the shared index.
combined = pd.concat(objs=[test["Target"], preds], axis="columns")

In [None]:
# Plot the actual targets against the predictions over the test rows.
combined.plot()

In [None]:
def predict(train,test,predictors,model):
    """Fit ``model`` on ``train`` and predict the target for ``test``.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of column names fed to the model.
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        DataFrame indexed like ``test`` with two columns: "Target" (actual values)
        and "Predictions" (the model's output).
    """
    model.fit(train[predictors], train["Target"])
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    actual = test["Target"]
    return pd.concat([actual, predicted], axis=1)

In [None]:
def backtest(data,model,predictors,start=2500,step=250):
    """Walk-forward backtest over ``data`` with an expanding training window.

    Starting at row ``start``, the model is trained on every row before position ``i``
    and evaluated on the next ``step`` rows; ``i`` then advances by ``step`` until the
    data is exhausted (the last window may be shorter than ``step``).

    The module-level ``predict`` function is looked up at call time, so whichever
    ``predict`` is currently defined in the notebook is the one used.

    Args:
        data: DataFrame with the ``predictors`` columns and a "Target" column,
            ordered chronologically.
        model: estimator passed through to ``predict``.
        predictors: list of column names passed through to ``predict``.
        start: number of initial rows reserved for the first training window.
        step: number of rows predicted per iteration.

    Returns:
        The per-window results of ``predict`` concatenated in order.

    Raises:
        ValueError: if ``step`` is not positive or ``data`` has no rows beyond
            ``start`` (otherwise nothing would be predicted and ``pd.concat`` would
            fail with an opaque "No objects to concatenate").
    """
    n_rows = data.shape[0]
    if step <= 0:
        raise ValueError(f"step must be a positive number of rows, got step={step}")
    if n_rows <= start:
        raise ValueError(
            f"backtest needs more than start={start} rows to predict anything, got {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        # Train on everything before row i, test on the following `step` rows.
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))
    return pd.concat(all_predictions)
        

In [None]:
# Backtest the model on the raw price/volume predictors with the default start and step.
predictions = backtest(sp500,model,predictors)

In [None]:
# How many times each class was predicted across the backtest.
predictions["Predictions"].value_counts()

In [None]:
# Precision over all backtested predictions.
precision_score(predictions["Target"],predictions["Predictions"])

In [None]:
# Share of each actual Target class among the backtested rows: a baseline for the precision.
predictions["Target"].value_counts()/predictions.shape[0]

In [None]:
# Add additional predictors to improve the model.
# Rolling averages of past closing prices are used so today's price can be compared with them.
# For each horizon (in trading days) two features are built:
#   Close_Ratio_<h>: today's close divided by the mean close of the last <h> rows
#                    (the window includes today);
#   Trend_<h>:       sum of Target over the <h> rows before today (shift(1) excludes today),
#                    i.e. how many of those days had Target == 1.
# Rows that do not yet have <h> observations behind them get NaN for that horizon.
horizons = [2, 5, 60, 250, 1000]
new_predictors = []

for horizon in horizons:
    # Roll over just the column that is needed. Rolling/shifting the whole frame would also
    # process every feature column added by earlier iterations, only to read one column.
    rolling_close_mean = sp500["Close"].rolling(horizon).mean()

    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_close_mean

    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [None]:
# Display the frame with the new ratio and trend columns.
sp500

In [None]:
# Discard every row that still has a NaN in any column (e.g. rows too early for a full window).
sp500 = sp500.dropna(axis="index", how="any")

In [34]:
# Second model configuration: more trees and a smaller minimum split size than the first one.
model = RandomForestClassifier(
    random_state=1,        # fixed seed so repeated runs give the same forest
    min_samples_split=50,
    n_estimators=200,
)

In [35]:
def predict(train,test,predictors,model,threshold=0.6):
    """Fit ``model`` on ``train`` and make thresholded predictions for ``test``.

    Instead of the model's default class decision, a row is predicted as 1 only when
    the probability in the second column of ``predict_proba`` reaches ``threshold``.
    NOTE(review): that column is the probability of class 1 when ``model.classes_``
    is ``[0, 1]``; confirm the training data always contains both classes.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of column names fed to the model.
        model: estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        threshold: minimum probability for predicting 1. Defaults to 0.6, the value
            that was previously hard-coded.

    Returns:
        DataFrame indexed like ``test`` with columns "Target" (actual values) and
        "Predictions" (1.0 where the probability is >= ``threshold``, else 0.0).
    """
    model.fit(train[predictors], train["Target"])
    proba_up = model.predict_proba(test[predictors])[:, 1]
    # One vectorised comparison; kept as float so the output dtype (0.0 / 1.0) is unchanged.
    preds = (proba_up >= threshold).astype(float)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [36]:
# Re-run the backtest using only the ratio/trend predictors and the reconfigured model.
predictions = backtest(sp500,model,new_predictors)

In [37]:
# How many times each class was predicted once the probability threshold is applied.
predictions["Predictions"].value_counts()

Predictions
0.0    4449
1.0     851
Name: count, dtype: int64

In [38]:
# Precision of the thresholded predictions across the backtest.
precision_score(predictions["Target"],predictions["Predictions"])

np.float64(0.5757931844888367)

In [39]:
# To extend model

"""
1) Check the correlation between the sp500 and other stocks from around the world, to find out
whether the movement of stock <x> would be a valuable predictor for the sp500

"""

'\n'