In [None]:
! pip install attention



In [None]:
# installation
! pip install yfinance --upgrade --no-cache-dir



In [None]:
import yfinance as yf

# Handle for the Microsoft ticker; the calls below go through this object.
msft = yf.Ticker("MSFT")

# get stock info
# NOTE(review): this expression is neither assigned nor the last statement of
# the cell, so its value is not displayed.
msft.info

# get historical market data
# period="max" asks for the full available history; in the saved output the
# frame runs from 1986-03-13 to 2021-11-05 (8988 rows x 7 columns).
hist = msft.history(period="max")
print(hist)

                  Open        High  ...  Dividends  Stock Splits
Date                                ...                         
1986-03-13    0.056001    0.064236  ...        0.0           0.0
1986-03-14    0.061491    0.064785  ...        0.0           0.0
1986-03-17    0.063687    0.065334  ...        0.0           0.0
1986-03-18    0.064785    0.065334  ...        0.0           0.0
1986-03-19    0.063138    0.063687  ...        0.0           0.0
...                ...         ...  ...        ...           ...
2021-11-01  331.359985  331.489990  ...        0.0           0.0
2021-11-02  330.309998  333.450012  ...        0.0           0.0
2021-11-03  333.899994  334.899994  ...        0.0           0.0
2021-11-04  332.890015  336.540009  ...        0.0           0.0
2021-11-05  338.510010  338.790009  ...        0.0           0.0

[8988 rows x 7 columns]


In [None]:
# Check which container type history() returned.
type(hist)

pandas.core.frame.DataFrame

In [None]:
# List the column labels of the price-history frame.
hist.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')

In [None]:
# Download the NASDAQ screener export (a manually extracted CSV snapshot hosted
# on GitLab) into the current working directory.
import urllib.request
url = 'https://gitlab.com/brainekt_ai/us-stock-ticker/-/raw/main/nasdaq_screener_1635374155108.csv'
filename = 'nasdaq_screener_1635374155108.csv'
# urlretrieve returns a (local_filename, headers) tuple, which the cell displays.
urllib.request.urlretrieve(url, filename)

('nasdaq_screener_1635374155108.csv',
 <http.client.HTTPMessage at 0x7fa3981b3650>)

In [None]:
# List the working directory to verify that the CSV was downloaded.
! ls

nasdaq_screener_1635374155108.csv  sample_data


In [None]:
import pandas as pd

# The screener CSV stores the row index as an unnamed first column. Use it as
# the index so it does not surface as a spurious "Unnamed: 0" data column.
tickers = pd.read_csv("nasdaq_screener_1635374155108.csv", index_col=0)

In [None]:
# Preview the first rows of the ticker table.
tickers.head()

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,AACG,ATA Creativity Global American Depositary Shares,$2.28,-0.06,-2.564%,71539380.0,China,,75873,Miscellaneous,Service to the Health Industry
1,AACIU,Armada Acquisition Corp. I Unit,$9.95,0.0,0.00%,0.0,United States,2021.0,3,,
2,AADI,Aadi Bioscience Inc. Common Stock,$27.03,0.12,0.446%,564098300.0,United States,,33626,Health Care,Biotechnology: Pharmaceutical Preparations
3,AAL,American Airlines Group Inc. Common Stock,$19.03,-0.36,-1.857%,12322200000.0,United States,,22076823,Transportation,Air Freight/Delivery Services
4,AAME,Atlantic American Corporation Common Stock,$3.98,-0.1,-2.451%,81234840.0,United States,,14868,Finance,Life Insurance


In [None]:
# Plain Python list of every ticker symbol in the screener table.
tickers_ls = tickers["Symbol"].tolist()

In [None]:
# utils code
# assist

import os
import time

import tqdm
from tqdm import tqdm_notebook

## Data Processing
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# from sklearn.externals import joblib 


#### DATA CREATION FUNCTIONS ####
def create_data(file_list):
    """
    Build one DataFrame from a list of CSV files.

    Every non-empty file is read as a comma-separated table and tagged with a
    ``symbol`` column holding the path it was read from. Zero-byte files are
    skipped. Progress is printed after each file that is loaded.

    Parameters
    ----------
    file_list : list of str
        Paths of the CSV files to load.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded files (each file keeps its own
        row index), or an empty DataFrame when nothing was loaded.
    """
    frames = []
    for file in file_list:
        # Zero-byte files cannot be parsed by read_csv, so skip them.
        if os.stat(file).st_size == 0:
            continue
        df = pd.read_csv(file, sep=",")
        df['symbol'] = file
        frames.append(df)
        print (len(frames), " out of ", len(file_list))

    # DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
    # frame on every iteration; concatenating once is supported and linear.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def fetch_data():
    """
    Load the stock and ETF files from the ``data`` folder.

    Reads every file found in ``./data/Stocks`` and ``./data/ETFs`` (relative
    to the working directory at call time) through ``create_data``.

    Returns
    -------
    tuple
        ``(stocks, etf)`` -- the ``create_data`` result for each folder.
    """
    main_dir = os.getcwd()
    try:
        # create_data is handed bare file names (which it also stores in the
        # ``symbol`` column), so each folder must be the working directory
        # while its files are being read.
        # STOCKS
        os.chdir(os.path.join(main_dir, "data", "Stocks"))
        stocks = create_data(os.listdir())
        # ETFs
        os.chdir(os.path.join(main_dir, "data", "ETFs"))
        etf = create_data(os.listdir())
    finally:
        # Always go back to the starting directory so that later relative
        # paths are not silently resolved against one of the data folders.
        os.chdir(main_dir)

    return stocks, etf


#### DATA PROCESSING FUNCTIONS ####
def scale_df(data, model_name):
    """
    Min-max scale ``data`` and hand back the result with the fitted scaler.

    ``model_name`` is accepted but not used by this function.

    Returns a tuple ``(scaled, scaler)``: ``scaled`` is the output of
    ``MinMaxScaler.fit_transform(data)`` and ``scaler`` is the fitted
    ``MinMaxScaler`` instance.
    """
    minmax = MinMaxScaler()
    scaled = minmax.fit_transform(data)

    return scaled, minmax


def generate_ta(data):
    """
    Compute technical-analysis features for ``data`` and write them to CSV.

    The price/volume columns are passed by name ("Open", "High", "Low",
    "Close", "Volume") with ``fillna=True``. The resulting frame is written to
    "../data/df_ta.csv"; nothing is returned.

    NOTE(review): ``add_all_ta_features`` is not imported anywhere in the
    visible part of this file -- presumably it comes from the ``ta`` package;
    confirm and import it before calling this function.
    """
    ta_frame = add_all_ta_features(
        data, "Open", "High", "Low", "Close", "Volume", fillna=True
    )
    ta_frame.to_csv("../data/df_ta.csv")
    

def build_window(df, look_back, n_features):
    """
    Cut a 2-D array into overlapping look-back windows and next-step targets.

    For every row index ``i`` from ``look_back`` to the end of ``df``, the
    window is rows ``i - look_back .. i - 1`` restricted to the first
    ``n_features`` columns, and the target is column 0 of row ``i``. The
    window therefore advances one row at a time.

    Parameters
    ----------
    df : numpy.ndarray
        2-D array indexed as ``df[row, column]``.
    look_back : int
        Number of rows in each window.
    n_features : int
        Number of leading columns kept in each window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, y_train)``, both rounded to 5 decimals.
    """
    target_rows = range(look_back, df.shape[0])

    # One window per target row: the look_back rows that precede it.
    x_train = np.array(
        [df[row - look_back:row, 0:n_features].tolist() for row in target_rows]
    )
    # The value to predict is column 0 of the row right after the window.
    y_train = np.array([df[row, 0].tolist() for row in target_rows])

    # Both outputs are rounded to 5 decimal places.
    return np.round(x_train, 5), np.round(y_train, 5)


def trim_dataset(mat, batch_size):
    """
    Drop trailing rows so the row count is a multiple of ``batch_size``.

    Returns ``mat`` itself when its first dimension already divides evenly,
    otherwise a slice of ``mat`` without the surplus rows at the end.
    """
    surplus = mat.shape[0] % batch_size
    # A zero surplus must be special-cased: mat[:-0] would be an empty slice.
    return mat[:-surplus] if surplus > 0 else mat

#### FINAL PIPELINE FUNCTION ####
def preproc_pipeline(data, name):
    """
    Scale ``data`` and cut it, in order, into train / validation / test sets.

    ``data`` is first passed through ``scale_df`` (``name`` is forwarded to
    it). The scaled rows are then split without shuffling: the first 60% form
    the training set, and the remaining 40% is divided 70/30 into validation
    and test.

    NOTE(review): scaling happens before the split, so the scaler is fitted on
    rows that end up in the validation and test sets as well.

    Returns ``(train_set, validation_set, test_set, scaler)``.
    """
    scaled, scaler = scale_df(data, name)

    # First cut: 60% train, 40% held out (row order preserved).
    train_set, holdout = train_test_split(
        scaled, train_size=0.6, test_size=0.4, shuffle=False
    )
    # Second cut: held-out rows -> 70% validation, 30% test (order preserved).
    validation_set, test_set = train_test_split(
        holdout, train_size=0.7, test_size=0.3, shuffle=False
    )

    return train_set, validation_set, test_set, scaler


def model_preproc_pipeline(data, look_back, batch_size, n_features):
    """
    Turn a scaled array into windowed inputs and targets for the LSTM.

    Builds sliding windows with ``build_window``, trims both outputs with
    ``trim_dataset`` so their length is a multiple of ``batch_size``, and
    reshapes the windows to ``(samples, steps, n_features)``.

    Returns ``(x_train, y_train)``.
    """
    windows, targets = build_window(data, look_back, n_features)

    windows = trim_dataset(windows, batch_size)
    targets = trim_dataset(targets, batch_size)

    n_samples, n_steps = windows.shape[0], windows.shape[1]
    windows = np.reshape(windows, (n_samples, n_steps, n_features))
    return windows, targets
    

def generate_dataset():
    """
    Build the technical-analysis dataset from the raw stock and ETF files.

    Loads both groups of files with ``fetch_data``, concatenates them, runs
    ``generate_ta`` (which writes its result to disk) and reads that file
    back.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV written by ``generate_ta``.
    """
    # fetch_data() is the helper that returns the (stocks, etf) pair;
    # create_data(".") returns a single frame and would try to parse "." as a
    # CSV file.
    stocks, etf = fetch_data()
    data = pd.concat([stocks, etf])
    generate_ta(data)
    # generate_ta only writes to disk, so read the result back from the same
    # path it writes to.
    data = pd.read_csv("../data/df_ta.csv")
    return data

In [None]:
from tensorflow import keras as keras
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import CSVLogger

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, Input, Flatten, Add, Concatenate, Dot, Multiply, Bidirectional, GaussianNoise
from tensorflow.keras.layers import Maximum, Average, Activation

from attention import Attention
import tensorflow
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# attention: https://github.com/philipperemy/keras-attention-mechanism

def best_lstm_model(n_features, batch_size, look_back):
    """
    Build and compile the two-branch bidirectional LSTM regressor.

    One branch reads the input window as is, the other reads a copy passed
    through a ``GaussianNoise`` layer; each branch is a stack of two
    bidirectional LSTM layers that return full sequences. The two branch
    outputs are added, fed to the attention layer and reduced to a single
    value by a dense layer. The model is compiled with the Adam optimizer and
    mean-squared-error loss.

    Parameters
    ----------
    n_features : int
        Number of features per time step of the input.
    batch_size : int
        Not used by this function; kept so existing callers keep working.
    look_back : int
        Number of time steps in each input window.

    Returns
    -------
    keras Model
        The compiled model.
    """
    # Take the input shape from the arguments rather than from the TIME_STEPS
    # global and a hard-coded single feature.
    x_i1 = Input((look_back, n_features), name='ip1')
    g = GaussianNoise(0.05, name='g')(x_i1)

    # Branch 1: two stacked bidirectional LSTMs on the raw input.
    b1 = Bidirectional(LSTM(units = 64, return_sequences = True, name='l1'), name='b1')(x_i1)
    b1 = Bidirectional(LSTM(units = 64, return_sequences = True, name='l3'), name='b3')(b1)

    # Branch 2: the same stack on the noise-perturbed input. The second layer
    # must consume the first layer's output (b2); feeding it `g` again would
    # discard the first layer of this branch.
    b2 = Bidirectional(LSTM(units = 64, return_sequences = True, name='l2'), name='b2')(g)
    b2 = Bidirectional(LSTM(units = 64, return_sequences = True, name='l4'), name='b4')(b2)

    # Merge the branches, attend over the sequence, regress one value.
    d = Add(name='add')([b1, b2])
    a = Attention(64, name='attn')(d)

    y = Dense(1)(a)

    model = Model(x_i1, y)

    model.compile(optimizer = 'adam', loss = 'mean_squared_error')

    return model

def scheduler(epoch, lr):
    """
    Learning-rate schedule: hold for the first five epochs, then decay.

    Parameters
    ----------
    epoch : int
        Epoch index supplied by the caller.
    lr : float
        Current learning rate.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch < 5``; afterwards ``lr * exp(-0.1)``,
        i.e. roughly a 9.5% reduction on every call.
    """
    import math  # local import: math is not imported elsewhere in this notebook

    if epoch < 5:
        return lr
    # Use math.exp so the result is a plain Python float rather than a
    # TensorFlow tensor; LearningRateScheduler expects the schedule to return
    # a float.
    return float(lr * math.exp(-0.1))
     


def train_model(model, x_train, y_train, epochs, batch_size, lr):
    """
    Fit ``model`` on the training data and return it.

    Parameters
    ----------
    model : compiled keras model
        The model to train; it is fitted in place.
    x_train, y_train : array-like
        Training inputs and targets, passed straight to ``model.fit``.
    epochs : int
        Number of training epochs.
    batch_size : int
        Batch size passed to ``model.fit``.
    lr : float or None
        Initial learning rate. ``None`` leaves the optimizer's current
        learning rate untouched.

    Returns
    -------
    The same ``model`` object after training.
    """
    # Apply the requested learning rate to the compiled optimizer; without
    # this the `lr` argument would have no effect on training.
    if lr is not None and getattr(model, "optimizer", None) is not None:
        model.optimizer.learning_rate = lr

    # `scheduler` adjusts the learning rate at the start of every epoch.
    callback = tensorflow.keras.callbacks.LearningRateScheduler(scheduler)
    model.fit(x_train, y_train, epochs = epochs, batch_size = batch_size, callbacks=[callback])

    return model

In [None]:
# Defining hyper parameters
# NOTE(review): the same names are assigned again, with a different learning
# rate and epoch count, at the top of the training cell further down.
TIME_STEPS = 100  # number of time steps in each input window
BATCH_SIZE = 128  # batch size used for trimming the data and for fitting
N_FEATURES = 1  # number of features per time step
lr = 0.0001 # learning rate
EPOCHS = 5  # number of training epochs
# result generation

In [None]:
# Build the model from the hyper parameters defined above.
regressor = best_lstm_model(N_FEATURES, BATCH_SIZE, TIME_STEPS)

In [None]:
# Print the layer-by-layer summary of the model that was just built.
regressor.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ip1 (InputLayer)                [(None, 100, 1)]     0                                            
__________________________________________________________________________________________________
b1 (Bidirectional)              (None, 100, 128)     33792       ip1[0][0]                        
__________________________________________________________________________________________________
g (GaussianNoise)               (None, 100, 1)       0           ip1[0][0]                        
__________________________________________________________________________________________________
b3 (Bidirectional)              (None, 100, 128)     98816       b1[0][0]                         
___________________________________________________________________________________________

In [None]:
# Prediction model 1 based on George V Jose
from sklearn.metrics import mean_squared_error
def predict(data, num_prediction, model=None):
    """
    Forecast future values by repeatedly feeding predictions back in.

    ``data`` is min-max scaled, the last ``TIME_STEPS`` scaled values seed a
    rolling history, and on each step the model predicts one value from the
    most recent ``TIME_STEPS`` entries; that value is appended to the history.

    Parameters
    ----------
    data : array-like
        Observed series. The reshape to ``(1, TIME_STEPS, 1)`` requires a
        single column and at least ``TIME_STEPS`` rows.
    num_prediction : int
        Number of steps to forecast.
    model : object with a ``predict`` method, optional
        Model used for forecasting. When omitted, the module-level name
        ``model`` is used, which is what this function relied on before the
        parameter existed.

    Returns
    -------
    numpy.ndarray of shape ``(num_prediction + 1, 1)``
        The last observed value followed by the forecasts, transformed back
        to the original scale.

    Raises
    ------
    NameError
        If no model is passed and no module-level ``model`` exists.
    """
    if model is None:
        # Backward compatibility with callers that rely on a global `model`.
        model = globals().get("model")
        if model is None:
            raise NameError(
                "predict() needs a model: pass `model=` or define a global `model`"
            )

    # Scaling data
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)

    # Seed the rolling history with the last TIME_STEPS observed values.
    prediction_list = scaled[-TIME_STEPS:]

    for _ in range(num_prediction):
        # One batch holding the latest TIME_STEPS values of a single feature.
        x = np.array(prediction_list[-TIME_STEPS:]).reshape((1, TIME_STEPS, 1))

        output = model.predict(x)[0][0]

        # np.append flattens, so the history is 1-D from the first step on.
        prediction_list = np.append(prediction_list, output)

    # Keep the last observed value plus every forecast.
    prediction_list = prediction_list[TIME_STEPS - 1:]

    # Reverse the scaling
    prediction_list = prediction_list.reshape(num_prediction+1, 1)
    prediction_list = scaler.inverse_transform(prediction_list)

    return prediction_list

# Prediction model 2 based on Ravindra Compella
def moving_test_window_preds(data, num_predictions, TIME_STEPS, model):
    """
    Forecast ``num_predictions`` steps with a sliding one-step-ahead window.

    ``data`` is min-max scaled and its last ``TIME_STEPS`` rows form the
    initial window. On every step the model predicts from the window, the
    first output value is recorded, and the window drops its oldest entry and
    takes the new prediction as its newest one.

    Returns an array of shape ``(num_predictions, 1)`` holding the recorded
    predictions after inverting the scaling that was fitted on ``data``.
    """
    # Scaling data
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(data)

    # Initial window: one batch of TIME_STEPS steps with a single feature.
    window = np.array(scaled[-TIME_STEPS:]).reshape((1, TIME_STEPS, 1))

    step_preds = []
    for _ in range(num_predictions):
        step = model.predict(window)
        step_preds.append(step[0,0])
        # Slide forward: drop the oldest step, append the new prediction.
        window = np.concatenate((window[:,1:,:], step.reshape(1,1,1)), axis=1)

    scaled_preds = np.array(step_preds).reshape(num_predictions, 1)
    return scaler.inverse_transform(scaled_preds)

In [None]:
# Ticker symbols that the loops below download and train a model for.
top_companies = ['AMZN', 'AAPL', 'NFLX', 'GOOG', 'MSFT', 'GOOGL', 'TSLA', 'CSCO', 'COST', 'FB']

In [None]:
# For each ticker, estimate how many rows each half of the test split holds:
# 40% of the rows are held out, 30% of those form the test set, and the
# training cell splits that test set into two halves.
for comp in top_companies:
    comp_tick = yf.Ticker(comp)
    # get historical market data
    hist = comp_tick.history(period="max")
    print(f"{comp}: {len(hist) * 0.4 * 0.3 // 2}")

AMZN: 369.0
AAPL: 618.0
NFLX: 294.0
GOOG: 260.0
MSFT: 539.0
GOOGL: 260.0
TSLA: 171.0
CSCO: 479.0
COST: 534.0
FB: 143.0


In [None]:
# Defining hyper parameters
TIME_STEPS = 100  # number of time steps in each input window
BATCH_SIZE = 128
N_FEATURES = 1
lr = 0.001 # learning rate
EPOCHS = 10
# result generation


for comp in top_companies:
    comp_tick = yf.Ticker(comp)
    # get historical market data
    hist = comp_tick.history(period="max")
    # train data
    # We only use the closing price. Select it by name: the positional slice
    # iloc[:, 1:2] picks the "High" column of the history frame instead.
    data = hist[["Close"]]
    train, valid, test, scalar = preproc_pipeline(data, False)
    # Create windows, trim windows, and reshape for LSTM input
    x_train, y_train = model_preproc_pipeline(train, TIME_STEPS, BATCH_SIZE, N_FEATURES)

    # A fresh model is built for every ticker.
    regressor = best_lstm_model(N_FEATURES, BATCH_SIZE, TIME_STEPS)

    # Training the model
    regressor = train_model(regressor, x_train, y_train, EPOCHS, BATCH_SIZE, lr)

    # Preparing test and validation sets: trim the test rows to a multiple of
    # BATCH_SIZE (always even, so np.split can halve it), then use the first
    # half to seed the forecast and the second half as ground truth.
    df_test = trim_dataset(test, BATCH_SIZE)
    df_val, df_testing = np.split(df_test, 2)

    # The forecast needs TIME_STEPS rows to build its first window.
    if len(df_val) < TIME_STEPS:
        print(f"comp: {comp} skipped: only {len(df_val)} rows available to seed a {TIME_STEPS}-step window")
        continue

    n_samples = len(df_testing)

    unseen_predictions = moving_test_window_preds(df_val, n_samples, TIME_STEPS, regressor)

    # Evaluating model for unseen data
    mse = mean_squared_error(df_testing[:n_samples], unseen_predictions[:n_samples])
    # MSE divided by the mean of the ground-truth values.
    n_mse = mse / (df_testing[:n_samples]).mean()
    print(f"comp: {comp} mse: {mse} norm_mse: {n_mse}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: AMZN mse: 0.175170014927676 norm_mse: 0.2188735580096118
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: AAPL mse: 0.1509098582634488 norm_mse: 0.2652021022604571
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: NFLX mse: 0.08133008431080932 norm_mse: 0.10807957674840443
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: GOOG mse: 0.08171771480503971 norm_mse: 0.10767618324347268
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: MSFT mse: 0.13179036711091002 norm_mse: 0.22125475883187737
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
comp: GOO