In [111]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from data import yfinance_data # we can use methods from here to interact with yfinance

# Get historical data
`get_data()`, defined in `src/data/yfinance_data.py`, pulls the stock's complete price history (every trading day since inception) from the yfinance API.

In [113]:
# Download the price history for one ticker via the project's yfinance
# wrapper, then show the resulting frame as the cell output.
ticker_symbol = "AMZN"
df = yfinance_data.get_data(ticker_symbol)
df

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AMZN,AMZN,AMZN,AMZN,AMZN
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1997-05-15,0.097917,0.125000,0.096354,0.121875,1443120000
1997-05-16,0.086458,0.098958,0.085417,0.098438,294000000
1997-05-19,0.085417,0.088542,0.081250,0.088021,122136000
1997-05-20,0.081771,0.087500,0.081771,0.086458,109344000
1997-05-21,0.071354,0.082292,0.068750,0.081771,377064000
...,...,...,...,...,...
2025-05-02,189.979996,192.880005,186.399994,191.440002,77903500
2025-05-05,186.350006,188.179993,185.529999,186.509995,35217500
2025-05-06,185.009995,187.929993,183.850006,184.570007,29314100
2025-05-07,188.710007,190.990005,185.009995,185.559998,44002900


# Add technical indicators
Since `get_data()` returns OHLCV data, we can derive some technical indicators from this data alone and store them in new columns. This is done in `process_data()`, which uses `NumPy` to add the new columns. Any rows containing null values are dropped in the same step.

In [76]:
# Run the OHLCV frame through process_data(), which (per the notes above)
# appends the indicator columns and drops rows containing nulls.
# NOTE(review): `df` is rebound to the processed frame, so re-running this
# cell alone processes already-processed data -- re-run the download cell first.
df = df.pipe(yfinance_data.process_data)
df

Price,Close,High,Low,Open,Volume,SMA(20),SMA(50),SMA(200)
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1981-09-28,0.049363,0.049792,0.049363,0.049363,91728000,0.063614,0.072834,0.090862
1981-09-29,0.051938,0.052367,0.051938,0.051938,94684800,0.062755,0.072216,0.090628
1981-09-30,0.052367,0.052797,0.052367,0.052367,49996800,0.061704,0.071615,0.090422
1981-10-01,0.052367,0.052797,0.052367,0.052367,61129600,0.060588,0.071109,0.090251
1981-10-02,0.056660,0.057090,0.056660,0.056660,45046400,0.059880,0.070645,0.090090
...,...,...,...,...,...,...,...,...
2025-05-02,205.350006,206.990005,202.160004,206.089996,101010600,199.663002,215.589601,226.786334
2025-05-05,198.889999,204.100006,198.210007,203.100006,69018500,200.188502,214.656401,226.640217
2025-05-06,198.509995,200.649994,197.020004,198.210007,51216500,201.041001,213.684601,226.515622
2025-05-07,196.250000,199.440002,193.250000,199.169998,68616900,202.232501,212.668801,226.379079


# Add Signals
We need to add signals that tell the bot when to buy and when to sell, and we store them in a new `Signal` column. For simplicity we use moving-average crossovers in the style of the golden cross for "buy" (`1`) and the death cross for "sell" (`-1`); every other row is `0` (no action). This column will serve as the label.

In [108]:
def crossover_signal(fast, slow):
    """Label the rows on which one moving average crosses another.

    Parameters
    ----------
    fast, slow : pandas.Series
        Two index-aligned series, e.g. a short-window and a long-window SMA.

    Returns
    -------
    numpy.ndarray
        -1 where `fast` was above `slow` on the previous row and is below it
        on the current row (bearish cross), 1 for the opposite move (bullish
        cross), and 0 everywhere else. The first row has no previous value,
        so its comparisons are False and it is labelled 0.
    """
    prev_fast, prev_slow = fast.shift(1), slow.shift(1)
    crossed_down = (prev_fast > prev_slow) & (fast < slow)
    crossed_up = (prev_fast < prev_slow) & (fast > slow)
    return np.where(crossed_down, -1, np.where(crossed_up, 1, 0))


# The label is the SMA(20)/SMA(50) crossover. This cell used to write the
# SMA(50)/SMA(200) golden/death cross into "Signal" first and then overwrite it
# straight away, so only the SMA(20)/SMA(50) result ever reached later cells.
# To train on the classic golden/death cross instead, switch the pair below to
# "SMA(50)", "SMA(200)".
FAST_SMA, SLOW_SMA = "SMA(20)", "SMA(50)"
df["Signal"] = crossover_signal(df[FAST_SMA], df[SLOW_SMA])

# Report how many rows carry a buy/sell label; previously this count was
# computed mid-cell and silently discarded.
print(f"Rows with a buy/sell signal: {len(df[df['Signal'] != 0])}")
df

Price,Close,High,Low,Open,Volume,SMA(20),SMA(50),SMA(200),Signal
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1981-09-28,0.049363,0.049792,0.049363,0.049363,91728000,0.063614,0.072834,0.090862,0
1981-09-29,0.051938,0.052367,0.051938,0.051938,94684800,0.062755,0.072216,0.090628,0
1981-09-30,0.052367,0.052797,0.052367,0.052367,49996800,0.061704,0.071615,0.090422,0
1981-10-01,0.052367,0.052797,0.052367,0.052367,61129600,0.060588,0.071109,0.090251,0
1981-10-02,0.056660,0.057090,0.056660,0.056660,45046400,0.059880,0.070645,0.090090,0
...,...,...,...,...,...,...,...,...,...
2025-05-02,205.350006,206.990005,202.160004,206.089996,101010600,199.663002,215.589601,226.786334,0
2025-05-05,198.889999,204.100006,198.210007,203.100006,69018500,200.188502,214.656401,226.640217,0
2025-05-06,198.509995,200.649994,197.020004,198.210007,51216500,201.041001,213.684601,226.515622,0
2025-05-07,196.250000,199.440002,193.250000,199.169998,68616900,202.232501,212.668801,226.379079,0


# Construct ML model
Once we have a dataframe with the technical indicators we would like to use, we can construct the ML model. We will use scikit-learn to simplify the process. Here we will scale the data, set up the pipeline, and split the data up. 

In [79]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression # Various ML models 
from sklearn.preprocessing import StandardScaler, QuantileTransformer # Import for scaling the data
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV

# Two-stage estimator: standardise the features, then fit a linear regression.
# The step names ("scale", "model") are the prefixes used when addressing step
# parameters, e.g. in a GridSearchCV param grid.
scaling_step = ("scale", StandardScaler())
regression_step = ("model", LinearRegression())
pipe = Pipeline(steps=[scaling_step, regression_step])






# Train ML Model
We will fit the model on the training data, which is one chunk of the dataframe we got from yfinance. Once it is trained, we can evaluate it on another chunk held out as validation data and adjust parameters as needed. Finally, we can run the model on the test data for the final analysis.

# Export Model 
Once the model is trained we can export it using the `pickle` library or another equivalent. We can then use this in a driver where 