In [None]:
!pip install yfinance



First Load required libraries

In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

**Step1:** First, let’s download the data for a security, say ‘AAPL’, from the beginning of 2015 until the end of last year:

In [None]:
# Download historical data for desired ticker symbol
aapl_data = yf.download('AAPL', start='2015-01-01', end='2023-12-31')

[*********************100%%**********************]  1 of 1 completed


**Step2:** **Strategy 1:** If the next trading day’s close price is greater than today’s close price then the signal is ‘buy’ (+1), otherwise ‘sell’ (-1).

In [None]:
# Create a new column 'Signal' such that if the next day's closing price is greater than today's,
# it will be 1 (buy), else -1 (sell)
aapl_data['S1_signal'] = np.where(aapl_data['Close'].shift(-1) > aapl_data['Close'], 1, -1)


**Step3:** **Strategy 2:** Utilize the 50-day moving average vs the 200-day moving average. A golden cross (or golden crossover) is a chart pattern that involves a short-term moving average crossing above a long-term moving average. Typically, the 50-day MA is used as the short-term average, and the 200-day MA is used as the long-term average. This is an indicator of bullish (buying) signal. A death cross is basically the opposite of a golden cross. It’s a chart pattern where a short-term MA crosses below a long-term MA. For example, the 50-day MA crosses below the 200-day MA. As such, a death cross is typically considered to be a bearish (selling) signal.

In [None]:
# Calculate the short-term simple moving average (SMA)
aapl_data['50_MA'] = aapl_data['Close'].rolling(window=50).mean()

# Calculate the long-term SMA
aapl_data['200_MA'] = aapl_data['Close'].rolling(window=200).mean()

# Create a new column 'golden_cross' that is True when the short-term SMA is greater than the long-term SMA
aapl_data['golden_cross'] = aapl_data['50_MA'] > aapl_data['200_MA']

# Create a new column 'death_cross' that is True when the short-term SMA is less than the long-term SMA
aapl_data['death_cross'] = aapl_data['50_MA'] < aapl_data['200_MA']

# Create a new column 'S2_signal' that is the difference between 'golden_cross' and 'death_cross'
aapl_data['S2_signal'] = aapl_data['golden_cross'].astype(int) - aapl_data['death_cross'].astype(int)



**Step4:** pre-process the data and split it into training and test datasets

In [None]:
aapl_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,S1_signal,50_MA,200_MA,golden_cross,death_cross,S2_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-02,27.8475,27.860001,26.8375,27.3325,24.435266,212818400,-1,,,False,False,0
2015-01-05,27.0725,27.1625,26.352501,26.5625,23.746895,257142000,1,,,False,False,0
2015-01-06,26.635,26.8575,26.157499,26.565001,23.749126,263188400,1,,,False,False,0
2015-01-07,26.799999,27.049999,26.674999,26.9375,24.082138,160423600,1,,,False,False,0
2015-01-08,27.307501,28.0375,27.174999,27.9725,25.007431,237458000,1,,,False,False,0
2015-01-09,28.1675,28.3125,27.5525,28.002501,25.03425,214798000,-1,,,False,False,0
2015-01-12,28.15,28.157499,27.200001,27.3125,24.417393,198603200,1,,,False,False,0
2015-01-13,27.8575,28.200001,27.227501,27.555,24.634188,268367600,-1,,,False,False,0
2015-01-14,27.26,27.622499,27.125,27.450001,24.540314,195826400,-1,,,False,False,0
2015-01-15,27.5,27.514999,26.665001,26.705,23.874287,240056000,-1,,,False,False,0


In [None]:
# Drop the rows with NaN values
aapl_data = aapl_data.dropna()

In [None]:
aapl_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,S1_signal,50_MA,200_MA,golden_cross,death_cross,S2_signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-10-16,27.945,28.0,27.6325,27.76,25.132725,156930400,1,28.0869,30.332337,False,True,-1
2015-10-19,27.700001,27.9375,27.5275,27.932501,25.288898,119036800,1,28.06795,30.335337,False,True,-1
2015-10-20,27.834999,28.5425,27.705,28.442499,25.750631,195871200,-1,28.0382,30.344737,False,True,-1
2015-10-21,28.5,28.895,28.424999,28.440001,25.748367,167180800,1,28.03955,30.354112,False,True,-1
2015-10-22,28.5825,28.875,28.525,28.875,26.142197,166616400,1,28.04085,30.3638,False,True,-1


For Strategy 1 spliting data into train and test

Feature are ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Target variable ['S1_signal']



In [None]:
# Define the Feature Variable 'X', and the Label/Target variable 'y'
X_strategy1 = aapl_data.drop(['S1_signal', 'golden_cross', 'death_cross', '50_MA', '200_MA', 'S2_signal'], axis=1)
y = aapl_data[['S1_signal']]

# Split the data into training and test datasets (80/20 percent ratio)
X_train, X_test, y_train, y_test = train_test_split(X_strategy1, y, test_size=0.2, random_state=42)

In [None]:
X_strategy1

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-16,27.945000,28.000000,27.632500,27.760000,25.132725,156930400
2015-10-19,27.700001,27.937500,27.527500,27.932501,25.288898,119036800
2015-10-20,27.834999,28.542500,27.705000,28.442499,25.750631,195871200
2015-10-21,28.500000,28.895000,28.424999,28.440001,25.748367,167180800
2015-10-22,28.582500,28.875000,28.525000,28.875000,26.142197,166616400
...,...,...,...,...,...,...
2023-12-22,195.179993,195.410004,192.970001,193.600006,193.353287,37122800
2023-12-26,193.610001,193.889999,192.830002,193.050003,192.803986,28919300
2023-12-27,192.490005,193.500000,191.089996,193.149994,192.903839,48087700
2023-12-28,194.139999,194.660004,193.169998,193.580002,193.333298,34049900


In [None]:
y_train

Unnamed: 0_level_0,S1_signal
Date,Unnamed: 1_level_1
2016-07-26,1
2022-06-06,1
2023-08-22,1
2022-11-11,-1
2017-01-27,-1
...,...
2022-04-20,-1
2020-02-25,1
2020-04-15,1
2020-12-07,1


In [None]:
#Define classifiers
classifiers = [KNeighborsClassifier(),
               RandomForestClassifier(),
               GradientBoostingClassifier(),
               SVC(),
               XGBClassifier()
               ]

### Fit the classifiers on the training dataset and evaluate them on the test dataset#

In [None]:
# Fit the classifiers
for clf in classifiers:
    if isinstance(clf, XGBClassifier):
        # Map the target variable from [-1, 1] to [0, 1] for XGBoost because
        #XGBoost, by default, treats binary classification problems as if they have classes [0, 1].
        y_train_mapped = y_train["S1_signal"].map({-1: 0, 1: 1})
        y_test_mapped = y_test["S1_signal"].map({-1: 0, 1: 1})
        clf.fit(X_train, y_train_mapped)
        y_pred = clf.predict(X_test)
        print(f'{clf.__class__.__name__} Accuracy: {accuracy_score(y_test_mapped, y_pred)}')
    else:
        clf.fit(X_train, y_train["S1_signal"])
        y_pred = clf.predict(X_test)
        print(f'{clf.__class__.__name__} Accuracy: {accuracy_score(y_test["S1_signal"], y_pred)}')


KNeighborsClassifier Accuracy: 0.5060532687651331
RandomForestClassifier Accuracy: 0.5205811138014528
GradientBoostingClassifier Accuracy: 0.5036319612590799
SVC Accuracy: 0.549636803874092
XGBClassifier Accuracy: 0.5036319612590799


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Instantiate the grid search for XGBoost
xgb_grid_search = GridSearchCV(XGBClassifier(), xgb_param_grid, cv=5)

# Fit the grid search
xgb_grid_search.fit(X_train, y_train_mapped)

# Print the best parameters and the best score for XGBoost
print(f'XGBoost Best parameters: {xgb_grid_search.best_params_}')
print(f'XGBoost Best score: {xgb_grid_search.best_score_}')

# Define the parameter grid for RandomForest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Instantiate the grid search for RandomForest
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5)

# Fit the grid search
rf_grid_search.fit(X_train, y_train["S1_signal"])

# Print the best parameters and the best score for RandomForest
print(f'RandomForest Best parameters: {rf_grid_search.best_params_}')
print(f'RandomForest Best score: {rf_grid_search.best_score_}')


XGBoost Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
XGBoost Best score: 0.5302645793280234
RandomForest Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest Best score: 0.4975739265769478


# For Strategy 2 spliting data into train and test

Feature are ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'golden_cross', 'death_cross', '50_MA', '200_MA']
Target variable ['S2_signal']


In [None]:
# Define the Feature Variable 'X', and the Label/Target variable 'y'
X_strategy2 = aapl_data.drop(['S1_signal', 'S2_signal'], axis=1)
y = aapl_data[['S2_signal']]

# Split the data into training and test datasets (80/20 percent ratio)
X_train, X_test, y_train, y_test = train_test_split(X_strategy2, y, test_size=0.2, random_state=42)

In [None]:
X_strategy2

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,50_MA,200_MA,golden_cross,death_cross
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-10-16,27.945000,28.000000,27.632500,27.760000,25.132725,156930400,28.086900,30.332337,False,True
2015-10-19,27.700001,27.937500,27.527500,27.932501,25.288898,119036800,28.067950,30.335337,False,True
2015-10-20,27.834999,28.542500,27.705000,28.442499,25.750631,195871200,28.038200,30.344737,False,True
2015-10-21,28.500000,28.895000,28.424999,28.440001,25.748367,167180800,28.039550,30.354112,False,True
2015-10-22,28.582500,28.875000,28.525000,28.875000,26.142197,166616400,28.040850,30.363800,False,True
...,...,...,...,...,...,...,...,...,...,...
2023-12-22,195.179993,195.410004,192.970001,193.600006,193.353287,37122800,185.398801,178.649100,True,False
2023-12-26,193.610001,193.889999,192.830002,193.050003,192.803986,28919300,185.682801,178.871851,True,False
2023-12-27,192.490005,193.500000,191.089996,193.149994,192.903839,48087700,185.971400,179.085250,True,False
2023-12-28,194.139999,194.660004,193.169998,193.580002,193.333298,34049900,186.300001,179.290201,True,False


In [None]:
y

Unnamed: 0_level_0,S2_signal
Date,Unnamed: 1_level_1
2015-10-16,-1
2015-10-19,-1
2015-10-20,-1
2015-10-21,-1
2015-10-22,-1
...,...
2023-12-22,1
2023-12-26,1
2023-12-27,1
2023-12-28,1


In [None]:
# Fit the classifiers
for clf in classifiers:
    if isinstance(clf, XGBClassifier):
        # Map the target variable from [-1, 1] to [0, 1] for XGBoost because
        #XGBoost, by default, treats binary classification problems as if they have classes [0, 1].
        y_train_mapped = y_train["S2_signal"].map({-1: 0, 1: 1})
        y_test_mapped = y_test["S2_signal"].map({-1: 0, 1: 1})
        clf.fit(X_train, y_train_mapped)
        y_pred = clf.predict(X_test)
        print(f'{clf.__class__.__name__} Accuracy: {accuracy_score(y_test_mapped, y_pred)}')
    else:
        clf.fit(X_train, y_train["S2_signal"])
        y_pred = clf.predict(X_test)
        print(f'{clf.__class__.__name__} Accuracy: {accuracy_score(y_test["S2_signal"], y_pred)}')


KNeighborsClassifier Accuracy: 0.711864406779661
RandomForestClassifier Accuracy: 1.0
GradientBoostingClassifier Accuracy: 1.0
SVC Accuracy: 0.7820823244552058
XGBClassifier Accuracy: 1.0


**Based on the performance results of both strategies:**

*   **Strategy 1** showed a mixed range of accuracy across different classifiers, with the highest accuracy being around 55% (SVC). This suggests that predicting the next day's price movements based solely on whether today's close is higher or lower than tomorrow's close may not provide a consistently reliable signal for trading. The strategy's effectiveness seems to be near or slightly above random chance, indicating potential challenges in capturing profitable trades due to market noise and the inherent unpredictability of short-term price movements.

* **Strategy 2**, utilizing the golden cross and death cross signals derived from the 50-day and 200-day moving averages, demonstrated significantly higher accuracy, with several classifiers achieving perfect or near-perfect accuracy scores. This suggests that for the historical data analyzed, Strategy 2 provided a much clearer and more reliable indication of bullish or bearish market trends, enabling a more accurate prediction of trading signals.

**Summary:** Strategy 2 appears to be the superior approach based on the provided results, offering more robust and reliable signals for trading decisions compared to Strategy 1. The use of moving averages to capture longer-term market trends seems to provide a stronger foundation for making profitable trades, reducing the impact of daily market volatility and noise. However, it's essential to consider the risk of overfitting given the perfect scores with some classifiers and to validate the strategy further with out-of-sample testing and cross-validation to ensure its effectiveness in different market conditions.


# **Fine Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Instantiate the grid search for XGBoost
xgb_grid_search = GridSearchCV(XGBClassifier(), xgb_param_grid, cv=5)

# Fit the grid search
xgb_grid_search.fit(X_train, y_train_mapped)

# Print the best parameters and the best score for XGBoost
print(f'XGBoost Best parameters: {xgb_grid_search.best_params_}')
print(f'XGBoost Best score: {xgb_grid_search.best_score_}')

# Define the parameter grid for RandomForest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Instantiate the grid search for RandomForest
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5)

# Fit the grid search
rf_grid_search.fit(X_train, y_train["S2_signal"])

# Print the best parameters and the best score for RandomForest
print(f'RandomForest Best parameters: {rf_grid_search.best_params_}')
print(f'RandomForest Best score: {rf_grid_search.best_score_}')


XGBoost Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
XGBoost Best score: 1.0
RandomForest Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RandomForest Best score: 1.0
