In [28]:
# useful article about OHLC data aggregation:
# applicable to websocket streams of tick by tick data
# https://blog.quantinsti.com/tick-tick-ohlc-data-pandas-tutorial/


In [29]:
# jupyter did not want to load already installed talib library 
# so had to reinstall it via anaconda as well 
#!conda install -c conda-forge ta-lib
import talib as ta
import yfinance as yf

In [30]:
import pandas as pd
import sqlite3

# custom function imports
from functions_ml import *
from functions_gen import *

# custom indicators moved to modules
from functions_superjump import *
from functions_HHLL import *
from functions_HHLL_conf import *

In [None]:
# ml model specific imports
from functions_forest import *       # Random Forest classifier
#from functions_nn import *          # Neural Net classifier, not needed when using forest

In [31]:
# sqlite database structure is following:
#
#sqlite> .header on
#sqlite> .mode column
#sqlite> select * from alpaca_websocket_stream_data LIMIT 10;
#timestamp                            symbol  price   size  exchange  conditions  tape  id   
#-----------------------------------  ------  ------  ----  --------  ----------  ----  -----
#2022-07-19 15:49:25.477387108-04:00  AAPL    150.8   100   V         ['@']       C     10807
#32022-07-19 15:49:27.252579851-04:00  AAPL    150.81  3     V         ['@', 'I']  C     10808
#2022-07-19 15:49:27.252579851-04:00  AAPL    150.81  100   V         ['@']       C     10809
#2022-07-19 15:49:27.666163652-04:00  AAPL    150.81  100   V         ['@']       C     10810
#2022-07-19 15:49:27.666164795-04:00  AAPL    150.81  200   V         ['@']       C     10811
#2022-07-19 15:49:29.248316808-04:00  AAPL    150.79  100   V         ['@']       C     10812
#2022-07-19 15:49:32.963910211-04:00  AAPL    150.78  35    V         ['@', 'I']  C     10813
#2022-07-19 15:49:36.611092454-04:00  AAPL    150.77  2     V         ['@', 'I']  C     10814
#2022-07-19 15:49:36.612940345-04:00  AAPL    150.77  100   V         ['@']       C     10815
#2022-07-19 15:49:37.083678369-04:00  AAPL    150.76  100   V         ['@']       C     10816
#sqlite> 





# Exploratory data wrangling
optionally uncomment the code to get insights to individual steps

In [32]:
# connect to sqlite database and get all data where symbol is AAPL
# symbol is external variable
# pandas to onnect to database and aggregate price data to 1 minute granularity in pandas

#symbol='AAPL'
#
#conn = sqlite3.connect("alpaca_websocket_stream_data.db")
#c = conn.cursor()
#c.execute("SELECT * FROM alpaca_websocket_stream_data WHERE symbol = ?", (symbol,))
#data = c.fetchall()
#conn.close()

In [33]:
#data

In [34]:
# LOAD ONE DAY BACK

# connect to sqlite database and get all data where symbol is AAPL
# timestamp is from 24 hours ago to now
#conn = sqlite3.connect("alpaca_websocket_stream_data.db")
#c = conn.cursor()
#c.execute("SELECT * FROM alpaca_websocket_stream_data WHERE symbol = 'AAPL' AND timestamp BETWEEN datetime('now', '-1 month') AND datetime('now')")
#data = c.fetchall()
#conn.close()

In [35]:
#data

In [36]:
# load data to dataframe
#df = pd.DataFrame(data, columns=["timestamp", "symbol", "price", "size", "exchange", "conditions", "tape", "id"])

In [37]:
#df

In [38]:
#df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ns")

In [39]:
#df["date"] = df["timestamp"].dt.date

In [40]:
#df

In [41]:
#df = df.set_index("timestamp")


In [42]:
#df_resampled = df['price'].resample("1Min").ohlc(_method='ohlc')

In [43]:
#df_resampled

In [44]:
#df = df.reset_index()
#df = df.sort_values(by=["symbol"])

# Making function flow
one function for getting the data for specific ticker from the database
another function to process the df into resampled df with 1 and 5 min granularity

In [45]:
symbol = 'AAPL'
db_name = 'alpaca_websocket_stream_data.db'
table_name= 'alpaca_websocket_stream_data'
granularity = '1Min'

In [46]:
def get_ticker_data_from_db_days_back(symbol, db_name, table_name):
    # load data n days back from db
    # connect to sqlite database and get all data where symbol is AAPL for example
    # timestamp is from 24 hours ago to now
    # symbol, database name, table name are external variables
    
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    c.execute(f"SELECT * FROM  {table_name} WHERE symbol = ? AND timestamp BETWEEN datetime('now', '-7 days') AND datetime('now')", (symbol,))
    data = c.fetchall()
    conn.close()

    return data

In [47]:
data =  get_ticker_data_from_db_days_back(symbol, db_name, table_name)

In [48]:
data

[]

In [49]:
def get_ticker_data_from_db(symbol, db_name, table_name):
    # connect to sqlite database and get all data where symbol is AAPL for example
    # symbol, database name, table name are external variables
    
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    #c.execute("SELECT * FROM ? WHERE symbol = ?", (table_name, symbol,))
    c.execute(f"SELECT * FROM {table_name} WHERE symbol = ?", (symbol,))
    data = c.fetchall()
    conn.close()

    return data

In [50]:
#data =  get_ticker_data_from_db(symbol, db_name, table_name)

In [51]:
#data

In [52]:
# so now we have data

In [53]:
def resample_data(data, granularity='1Min'):
    # takes incoming data, converst it to dataframe
    # and resamples tick by tick 'price' column into new dataframe, returns resampled dataframe
    # granularity can be 1Min, 5Min, other granularities are also possible
    
    # load data to dataframe
    df = pd.DataFrame(data, columns=["timestamp", "symbol", "price", "size", "exchange", "conditions", "tape", "id"])
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ns")
    df = df.set_index("timestamp")
    df_res = df['price'].resample(granularity).ohlc(_method='ohlc')

    # the neural net and Random Forest models expect df with columns called
    # 'Date','Open', 'High', 'Low', 'Close', 'Adj Close'
    # so we need to slightly mod our resampled df
    df_res.reset_index(inplace=True)
    df_res.rename(columns={"timestamp": "Date", "open": "Open", "high": "High", "low": "Low", "close": "Close"}, inplace=True)
    df_res["Adj Close"] = df_res["Close"]
    
    # data stream is not continuous, there are gaps between days, we need to remove the gaps
    # by taking only rows wit values that are not NaN
    # very important, otherwise indicators will compute wrong values
    df_res = df_res[df_res['Close'].notna()]
    
    #df_res.head()    

    return df_res    

In [55]:
data

[]

In [54]:
df_res =  resample_data(data, granularity=granularity)

TypeError: Cannot reset_index inplace on a Series to create a DataFrame

In [None]:
df_res

## Model training (Random Forest)

In [None]:
# not needed so far, we can import pretrained model

In [None]:
# but best to allow for training here as well

In [None]:
# training stock data
tickers = ['SPY', 'F', 'IBM', 'GE', 'AAPL', 'ADM'] 

In [None]:
training_sequence(tickers, interval="1m", model_name="./random_forest.joblib")

## Feature importance visualization 
(Random Forest only)

In [None]:
# load classifier, no need to initialize the loaded_rf
clf = joblib.load("./random_forest.joblib")

In [None]:
predictors_list = ['aboveSAR','aboveUpperBB','belowLowerBB','RSI','oversoldRSI','overboughtRSI',
                   'aboveEMA5','aboveEMA10','aboveEMA15','aboveEMA20','aboveEMA30','aboveEMA40','aboveEMA50',
                   'aboveEMA60','aboveEMA70','aboveEMA80','aboveEMA90',
                   'aboveEMA100','aboveEMA200',
                   'LongSig','ShortSig','WLongSig','WShortSig',
                   'HH','LL','HL','LH',
                   'trend_conf'
                  ]

In [None]:
plot_forest_feature_importances(clf, predictors_list)

# Predictions
- now we have dataframe and can compute whatever indicators we want
- and then connect it to our predictive model and even visualize in streamlit

In [None]:
# load classifier, no need to initialize the loaded_rf
clf = joblib.load("./random_forest.joblib")

In [None]:
df_res = compute_technical_indicators(df_res)
df_res = compute_features(df_res)
df_res =define_target_condition(df_res)

In [None]:
df_res.tail()

In [None]:
# --- plot whole dataframe ---
#df_res[['Open','High','Low','Close', 'EMA20']].plot()
df_res[['Open','High','Low','Close']].plot()

In [None]:
# --- plot tail of the dataframe ---
df_res[['Open','High','Low','Close']].iloc[-20:].plot()

In [None]:
#df_res

In [None]:
# we need only smaller slice
df_res_cut = df_res.iloc[-202:].copy()

In [None]:
#new_df = predict_timeseries(new_df, clf)
predict_timeseries(df_res_cut, clf)

In [None]:
# when there is short history, the starts of EMAs will be anchored to zero, that will skew the graph
# best to run when you have more than 1000 datapoints (in this case minutes) 
plot_stock_prediction(df_res_cut, symbol)

In [None]:
## add new column to better visualize Long only trades
#df_res_cut['Long'] = df_res_cut['Buy'] * df_res_cut['Adj Close'] 
#df_res_cut['Long'].replace(0, np.nan, inplace=True) 



In [None]:
#df_res_cut

In [None]:
# takes longer to compute, but predicts 200 datapoints to validate visually

def predict_timeseries(df, clf):

    # making sure we have good dimensions
    # column will be rewritten later
    df["Buy"] = np.nan

    print("df length: ", len(df))

    # for i in range(len(df)):
    #    print('above sar: ', df["aboveSAR"][i])

    # iterate over last 20 rows in a dataframe
    # use df.iterrows() to iterate over rows
    #for i, row in df.tail(
    #    20
    #).iterrows():  # predict for small subset of data, otherwise it takes too long

    for i, row in df.iterrows():    # predict for each row

        X_cls_valid = [
            [
                df["aboveSAR"][i],
                df["aboveUpperBB"][i],
                df["belowLowerBB"][i],
                df["RSI"][i],
                df["oversoldRSI"][i],
                df["overboughtRSI"][i],
                df["aboveEMA5"][i],
                df["aboveEMA10"][i],
                df["aboveEMA15"][i],
                df["aboveEMA20"][i],
                df["aboveEMA30"][i],
                df["aboveEMA40"][i],
                df["aboveEMA50"][i],
                df["aboveEMA60"][i],
                df["aboveEMA70"][i],
                df["aboveEMA80"][i],
                df["aboveEMA90"][i],
                df["aboveEMA100"][i],
            ]
        ]

        y_cls_pred_valid = clf.predict(X_cls_valid)
        df["Buy"][i] = y_cls_pred_valid[0].copy()

        print("step: ", i, "predicted class: ", df["Buy"][i])


    # add new column to better visualize Long only trades
    # graphs will look better, since no anchoring to zero for short trades
    df['Long'] = df['Buy'] * df['Adj Close'] 
    df['Long'].replace(0, np.nan, inplace=True) 

    print(df.tail())

    return df

In [None]:
df_res_cut

In [None]:
# inter trading day gaps in matplotlib issue proposed solution:
# https://stackoverflow.com/questions/39231410/ignoring-time-gaps-larger-than-x-mins-matplotlib-in-python

In [None]:
# reset index for better plotting without gaps with zooming function
df_res_cut.reset_index(inplace=True)    

In [None]:
def plot_stock_prediction_zoom(df, ticker):
    # --- plot only Long trades and zoom in on last data ---
    
    # plot  values and significant levels
    #df.reset_index(inplace=True)    
    
    # zoom in
    df = df.iloc[-200:]   # use eg. 50 for zooming in
    
    plt.figure(figsize=(20, 7))
    plt.title("Predictive model " + str(ticker))
    plt.plot(df.index, df["Adj Close"], label="High", alpha=0.4)

    plt.plot(df.index, df["EMA10"], label="EMA10", alpha=0.2)
    plt.plot(df.index, df["EMA20"], label="EMA20", alpha=0.2)
    plt.plot(df.index, df["EMA30"], label="EMA30", alpha=0.2)
    plt.plot(df.index, df["EMA40"], label="EMA40", alpha=0.2)
    plt.plot(df.index, df["EMA50"], label="EMA50", alpha=0.2)
    #plt.plot(df.index, df["EMA100"], label="EMA100", alpha=0.2)
    #plt.plot(df.index, df["EMA150"], label="EMA150", alpha=0.79)
    #plt.plot(df.index, df["EMA200"], label="EMA200", alpha=0.99)

    # this dataobject plotting gives intraday gaps since data from non trading time is not there
    #plt.scatter(
    #    df["Date"],
    #    #df["Buy"] * df["Adj Close"],
    #    df['Long'],
    #    label="Buy",
    #    marker="^",
    #    color="magenta",
    #    alpha=0.55,
    #)
    
    # workaround with plotting over index

    plt.scatter(
        df.index,
        #df["Buy"] * df["Adj Close"],
        df['Long'],
        label="Buy",
        marker="^",
        color="magenta",
        alpha=0.55,
    )    
    
    # avoid intraday gaps by overlaying timestamp values over index ticks
    plt.xticks(df.index, df['Date'], rotation='vertical')
    
    # make sure the x date ticks are not overlapping
    plt.locator_params(axis='x', nbins=15)
    
    #plt.xticks(x, labels, rotation='vertical')
    # Pad margins so that markers don't get clipped by the axes 
    #plt.margins(0.2)
    # Tweak spacing to prevent clipping of tick-labels
    #plt.subplots_adjust(bottom=0.15)

    plt.legend()
    plt.show()

    return None

In [None]:
plot_stock_prediction_zoom(df_res_cut, symbol)

# Daemonize the script to run every minute
but running script every minute by cron should be more reliable, also prevents time drift

In [None]:
# --- UNCOMMENT TO RUN CONTINUOUSLY ---

#while True:
#    
#    #data =  get_ticker_data_from_db(symbol, db_name, table_name)
#    data =  get_ticker_data_from_db_days_back(symbol, db_name, table_name)
#    df_res =  resample_data(data, granularity=granularity)
#    
#    df_res = compute_technical_indicators(df_res)
#    df_res = compute_features(df_res)
#    df_res =define_target_condition(df_res)
#
#    # streamline for pred and viz
#    df_res_cut = df_res.iloc[-202:].copy()
#    predict_timeseries(df_res_cut, clf)
#    plot_stock_prediction(df_res_cut, symbol)
#
#    time.sleep(60)
