In [19]:
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

# Load the S&P 500 data
sp500 = yf.download('^GSPC', start='1990-01-01', end='2023-01-01')

# Add moving averages and volatility for better results
sp500['MA50'] = sp500['Close'].rolling(window=50).mean()
sp500['MA200'] = sp500['Close'].rolling(window=200).mean()
sp500['Volatility'] = sp500['Close'].rolling(window=50).std()

# Drop NaN values (not relevant to our inputs)
sp500 = sp500.dropna().copy()

# Create the 'Tomorrow' column for the target variable
sp500['Tomorrow'] = sp500['Close'].shift(-1)

# Remove any NaN values
sp500 = sp500.dropna().copy()

# Create the target variable
sp500['Target'] = (sp500['Tomorrow'] > sp500['Close']).astype(int)

# Create the predictors
predictors = ['Close', 'Volume', 'Open', 'High', 'Low', 'MA50', 'MA200', 'Volatility']

# Ensure there are no NaN values in predictors
sp500 = sp500.dropna(subset=predictors).copy()

# Standardize the data
scaler = StandardScaler()
sp500[predictors] = scaler.fit_transform(sp500[predictors]).astype(float)

# Save the preprocessed data and scaler
sp500.to_csv('sp500_preprocessed.csv')
joblib.dump(scaler, 'scaler.pkl')

[*********************100%%**********************]  1 of 1 completed


['scaler.pkl']

In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Load the preprocessed data
sp500 = pd.read_csv('sp500_preprocessed.csv', index_col=0)

# Define the model
model = RandomForestClassifier(n_estimators=100, min_samples_split=10, random_state=1)

# Train the model
model.fit(sp500[predictors], sp500['Target'])

# Evaluate the model using cross-validation
scores = cross_val_score(model, sp500[predictors], sp500['Target'], cv=5, scoring='precision')
print("Cross-validated precision scores:", scores)
print("Mean precision score:", scores.mean())

# Save the trained model
joblib.dump(model, 'model.pkl')

Cross-validated precision scores: [0.53419593 0.51811594 0.49524941 0.50949914 0.53589109]
Mean precision score: 0.5185903014424744


['model.pkl']

In [21]:
import yfinance as yf
import pandas as pd
import joblib

# Define the predictors
predictors = ['Close', 'Volume', 'Open', 'High', 'Low', 'MA50', 'MA200', 'Volatility']

# Load the most recent data up to today (Year-Month-Day)
recent_data = yf.download('^GSPC', start='2023-01-02', end='2024-08-06')

# Create relevant inputs for recent data
recent_data['MA50'] = recent_data['Close'].rolling(window=50).mean()
recent_data['MA200'] = recent_data['Close'].rolling(window=200).mean()
recent_data['Volatility'] = recent_data['Close'].rolling(window=50).std()
recent_data = recent_data.dropna().copy()  # Ensure we work with a copy

# Standardize the recent data (consistent)
scaler = joblib.load('scaler.pkl')
scaled_values = scaler.transform(recent_data[predictors]).astype('float64')
# Explicitly cast recent_data predictors columns to float64
recent_data[predictors] = recent_data[predictors].astype('float64')
recent_data.loc[:, predictors] = scaled_values

# Use the latest available data for prediction
latest_data = recent_data.iloc[-1]

# Create a DataFrame for the latest data
latest_data_df = pd.DataFrame(latest_data).T[predictors]

# Load the trained model and make the prediction for the next day
model = joblib.load('model.pkl')
prediction = model.predict(latest_data_df)

# Print the prediction
print("Prediction for the next day (1: Up, 0: Down):", prediction[0])

# Buy or do not buy
if prediction[0] == 0:
    print("Do not buy.")
elif prediction[0] == 1:
    print("Buy.")

# Debugging statements
print(recent_data.head())
print(latest_data_df.head())
print("Scaled values:\n", scaled_values)

[*********************100%%**********************]  1 of 1 completed

Prediction for the next day (1: Up, 0: Down): 0
Do not buy.
                Open      High       Low     Close    Adj Close    Volume  \
Date                                                                        
2023-10-18  2.847829  2.829976  2.821066  2.804588  4314.600098  0.662202   
2023-10-19  2.811569  2.805267  2.786449  2.767713  4278.000000  0.816377   
2023-10-20  2.763703  2.742162  2.739150  2.713467  4224.160156  0.835018   
2023-10-23  2.699777  2.721400  2.704878  2.706294  4217.040039  0.711150   
2023-10-24  2.725358  2.724947  2.735501  2.737165  4247.680176  0.735996   

                MA50     MA200  Volatility  
Date                                        
2023-10-18  2.920527  2.858740    1.038053  
2023-10-19  2.916673  2.861106    1.063558  
2023-10-20  2.911702  2.863041    1.129984  
2023-10-23  2.906685  2.865173    1.196546  
2023-10-24  2.901768  2.867011    1.212174  
               Close    Volume      Open      High       Low      MA50  \
2024-08-05 


