In [8]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

# Shorthand used by later cells to slice the (ticker, date) MultiIndex, e.g. idx[:, '1995':'2023'].
idx = pd.IndexSlice

In [6]:
def preprocess_data(store_path='/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5',
                    weekly_store_path=None,
                    output_path='lstm_data.h5'):
    """Build the daily and weekly rolling-return datasets and write both to HDF5.

    Daily part: reads 1995-2023 rows of 'stocks/prices/daily', keeps the (up to)
    500 tickers with the best average dollar-volume rank, and stacks every
    22-row window of daily returns into one row per (ticker, date). Column 0 is
    renamed to 'label'; lag columns 1..21 are clipped to their 1st/99th
    percentiles. The result is stored under the key 'returns_daily'.

    Weekly part: resamples adjusted closes to weekly returns (2006-2023),
    stacks 53-row windows, renames column 0 to 'fwd_returns', clips lag columns
    1..52, adds a binary 'label' (fwd_returns > 0) and stores the result under
    the key 'returns_weekly'.

    Parameters
    ----------
    store_path : str or path-like
        HDF5 store holding the 'stocks/prices/daily' table.
    weekly_store_path : str or path-like, optional
        Store used for the weekly part. The original code read
        ``DATA_DIR / 'assets_v1.h5'`` but ``DATA_DIR`` is not defined anywhere
        in this notebook, so the default now falls back to ``store_path``.
    output_path : str or path-like
        HDF5 file that receives both datasets.

    Returns
    -------
    (prices, returns) : tuple of DataFrame
        The wide weekly-part price frame (dates x tickers) and the weekly
        returns frame sorted newest-first.

    Raises
    ------
    ValueError
        If a returns frame has too few rows to form a single window.
    """
    # Local alias so the function does not depend on a notebook-global `idx`.
    idx = pd.IndexSlice
    price_key = 'stocks/prices/daily'
    if weekly_store_path is None:
        weekly_store_path = store_path

    def _stack_windows(returns, T):
        """Stack each (T + 1)-row window of `returns` as one row per (ticker, date)."""
        n_windows = len(returns) - T - 1
        if n_windows <= 0:
            raise ValueError(f'need more than {T + 1} return rows, got {len(returns)}')
        tickers = returns.columns
        frames = []
        for i in range(n_windows):
            window = returns.iloc[i:i + T + 1]
            frames.append(window.reset_index(drop=True).T
                          .assign(date=window.index.max(), ticker=tickers)
                          .set_index(['ticker', 'date']))
        # One concat at the end: growing a DataFrame inside the loop re-copies
        # everything on each iteration (quadratic in the number of windows).
        return pd.concat(frames)

    def _clip_lags(frame, T):
        """Winsorize every lag column 1..T at its 1st/99th percentile."""
        lag_cols = list(range(1, T + 1))
        frame.loc[:, lag_cols] = frame.loc[:, lag_cols].apply(
            lambda col: col.clip(lower=col.quantile(.01), upper=col.quantile(.99)))
        return frame

    # ---- daily returns -------------------------------------------------
    prices = pd.read_hdf(store_path, price_key).loc[
        idx[:, '1995':'2023'],
        ['adjusted_close', 'volume', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist',
         'BB_upper', 'BB_middle', 'BB_lower']]
    prices.index.names = ['ticker', 'date']
    n_dates = len(prices.index.unique('date'))

    # Rank tickers by dollar volume on each date; tickers observed on fewer
    # than 95% of the dates are dropped before ranking.
    dollar_vol = (prices.adjusted_close.mul(prices.volume)
                  .unstack('ticker')
                  .dropna(thresh=int(.95 * n_dates), axis=1)
                  .rank(ascending=False, axis=1)
                  .stack('ticker'))
    most_traded = dollar_vol.groupby(level='ticker').mean().nsmallest(500).index

    # Newest-first ordering makes column 0 of every window the most recent return.
    returns = (prices.loc[idx[most_traded, :], 'adjusted_close']
               .unstack('ticker')
               .pct_change()
               .sort_index(ascending=False))
    returns.info()

    T = 21  # days
    data = _stack_windows(returns, T).rename(columns={0: 'label'}).sort_index().dropna()
    # Clip all T lag columns; the previous `tcols[1:]` stopped at column T - 1.
    data = _clip_lags(data, T)
    data.info()

    data.to_hdf(output_path, key='returns_daily')
    with pd.HDFStore(output_path) as store:
        print(store.info())

    # ---- weekly returns ------------------------------------------------
    prices = (pd.read_hdf(weekly_store_path, price_key)
              .adjusted_close.swaplevel()
              .unstack().loc['2005':])
    prices.info()
    prices.index = pd.to_datetime(prices.index)

    returns = (prices
               .resample('W')
               .last()
               .pct_change()
               .loc['2006': '2023']
               .dropna(axis=1)
               .sort_index(ascending=False))
    returns.info()

    T = 52  # weeks
    data = _stack_windows(returns, T)
    data.info()

    data = data.rename(columns={0: 'fwd_returns'})
    # Clip the lag features only, so the regression target keeps its raw values.
    data = _clip_lags(data, T)
    data['label'] = (data['fwd_returns'] > 0).astype(int)
    data.sort_index().to_hdf(output_path, key='returns_weekly')
    return prices, returns
        
    #data.sort_index().to_hdf('lstm_data.h5', 'returns_weekly')

# if __name__ == "__main__":
#     preprocess_data()

In [3]:
# Run the daily + weekly preprocessing; the function returns the frames built in its weekly section.
prices, returns = preprocess_data()

<class 'pandas.core.frame.DataFrame'>
Index: 7050 entries, 2022-12-30 to 1995-01-03
Columns: 335 entries, MSFT to ROL
dtypes: float64(335)
memory usage: 18.1+ MB


KeyboardInterrupt: 

In [9]:
# Load the 1995-2023 rows of the daily table with the price, volume and indicator columns.
# Relies on `idx` (pd.IndexSlice) defined in the first cell.
prices = pd.read_hdf('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5',
                    'stocks/prices/daily').loc[idx[:, '1995':'2023'], ['adjusted_close', 'volume', 'RSI',
       'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower']]

In [10]:
# Summarize index range, column dtypes and memory use of the loaded frame.
prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3045934 entries, ('AAPL', '1995-01-03') to ('NWS', '2022-12-30')
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   adjusted_close  float64
 1   volume          int64  
 2   RSI             float64
 3   MACD            float64
 4   MACD_signal     float64
 5   MACD_hist       float64
 6   BB_upper        float64
 7   BB_middle       float64
 8   BB_lower        float64
dtypes: float64(8), int64(1)
memory usage: 221.4+ MB


In [11]:
# Reload only adjusted_close and volume for 2013-2023; this rebinds `prices` from the previous cell.
prices = (pd.read_hdf('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5', 'stocks/prices/daily')
          .loc[idx[:, '2013':'2023'], ['adjusted_close', 'volume']])
prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1234633 entries, ('AAPL', '2013-01-02') to ('NWS', '2022-12-30')
Data columns (total 2 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   adjusted_close  1234633 non-null  float64
 1   volume          1234633 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 24.2+ MB


In [15]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

# Blanket warning suppression for the kernel (same filter as the first cell).
warnings.filterwarnings('ignore')

def preprocess_data_v2(store_path='/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5',
                       output_path='lstm_data_JDKv1.h5',
                       T=21):
    """Build daily rolling-return windows joined with technical indicators.

    Reads 1995-2023 rows of 'stocks/prices/daily', keeps the (up to) 500
    tickers with the best average dollar-volume rank, stacks every
    (T + 1)-row window of daily returns into one row per (ticker, date),
    joins the indicator columns for the same (ticker, date) and writes the
    result to ``output_path`` under the key 'returns_daily'.

    Parameters
    ----------
    store_path : str or path-like
        HDF5 store holding the 'stocks/prices/daily' table.
    output_path : str or path-like
        HDF5 file that receives the preprocessed frame.
    T : int
        Number of lagged returns per row (window length in days).

    Returns
    -------
    DataFrame
        Indexed by (ticker, date) with columns 'label', 1..T and the
        indicator columns.

    Raises
    ------
    ValueError
        If there are too few return rows to form a single window.
    """
    idx = pd.IndexSlice
    additional_features = ['RSI', 'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower']

    # Load the dataset provided
    dataset = pd.read_hdf(store_path, 'stocks/prices/daily').loc[
        idx[:, '1995':'2023'], ['adjusted_close', 'volume'] + additional_features]
    dataset.index.names = ['ticker', 'date']
    n_dates = len(dataset.index.unique('date'))

    # Select the most traded stocks based on dollar volume; tickers observed
    # on fewer than 95% of the dates are dropped before ranking.
    dollar_vol = (dataset.adjusted_close.mul(dataset.volume)
                  .unstack('ticker')
                  .dropna(thresh=int(.95 * n_dates), axis=1)
                  .rank(ascending=False, axis=1)
                  .stack('ticker'))
    most_traded = dollar_vol.groupby(level='ticker').mean().nsmallest(500).index

    # Daily percentage returns, newest first so column 0 of a window is the latest return
    returns = (dataset.loc[idx[most_traded, :], 'adjusted_close']
               .unstack('ticker')
               .pct_change()
               .sort_index(ascending=False))

    n_windows = len(returns) - T - 1
    if n_windows <= 0:
        raise ValueError(f'need more than {T + 1} return rows, got {len(returns)}')
    tickers = returns.columns

    # Collect the windows in a list and concatenate once; concatenating inside
    # the loop re-copies the accumulated frame on every iteration.
    windows = []
    for i in range(n_windows):
        window = returns.iloc[i:i + T + 1]
        windows.append(window.reset_index(drop=True).T
                       .assign(date=window.index.max(), ticker=tickers)
                       .set_index(['ticker', 'date']))
    data = pd.concat(windows).rename(columns={0: 'label'}).sort_index().dropna()

    # Add the indicator columns for the same (ticker, date)
    data = data.join(dataset.loc[idx[most_traded, :], additional_features])

    # Clip every lag column 1..T (the previous `tcols[1:]` left column T unclipped)
    lag_cols = list(range(1, T + 1))
    data.loc[:, lag_cols] = data.loc[:, lag_cols].apply(
        lambda col: col.clip(lower=col.quantile(.01), upper=col.quantile(.99)))

    # Save the preprocessed data to an HDF5 file
    data.to_hdf(output_path, key='returns_daily')

    return data

In [16]:
# Build the daily window + indicator frame; the function also writes it to HDF5.
data = preprocess_data_v2()

KeyboardInterrupt: 

# Good

In [36]:
# Load the dataset provided (relies on `idx = pd.IndexSlice` from the first cell)
dataset = pd.read_hdf('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5',
                'stocks/prices/daily').loc[idx[:, '2013-02-15':'2023-02-15'], ['adjusted_close', 'volume', 'RSI',
                'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower']]
dataset.index.names = ['ticker', 'date']
n_dates = len(dataset.index.unique('date'))

# Select the most traded stocks based on dollar volume; tickers observed on
# fewer than 95% of the dates are dropped before ranking.
dollar_vol = (dataset.adjusted_close.mul(dataset.volume)
              .unstack('ticker')
              .dropna(thresh=int(.95 * n_dates), axis=1)
              .rank(ascending=False, axis=1)
              .stack('ticker'))

most_traded = dollar_vol.groupby(level='ticker').mean().nsmallest(500).index

# Calculate daily percentage returns for the most traded stocks (newest first)
returns = (dataset.loc[idx[most_traded, :], 'adjusted_close']
           .unstack('ticker')
           .pct_change()
           .sort_index(ascending=False))

# Create a sliding window of T + 1 rows: column 0 becomes the label, 1..T are the lags
n = len(returns)
T = 21 # days
tcols = list(range(1, T + 1))  # lag columns 1..T
tickers = returns.columns

# Use list comprehension and concatenate the results
data_list = [returns.iloc[i:i+T+1].reset_index(drop=True).T
            .assign(date=returns.index[i], ticker=tickers)
            .set_index(['ticker', 'date'])
            for i in range(n-T-1)]

data = pd.concat(data_list)
data = data.rename(columns={0: 'label'}).sort_index().dropna()
# Clip every lag column, including column T which the previous `range(T)[1:]` skipped
data.loc[:, tcols] = (data.loc[:, tcols].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                    upper=x.quantile(.99))))

In [39]:
import pandas as pd
import numpy as np

class Preprocessing:
    """Turn the daily price store into rolling windows of daily returns.

    Parameters
    ----------
    dataset_path : str or path-like
        HDF5 store holding the 'stocks/prices/daily' table.
    start, end : str
        Inclusive date bounds applied to the 'date' index level on load.
    n_stocks : int
        Maximum number of tickers kept by ``select_most_traded_stocks``.
    """

    def __init__(self, dataset_path, start='2013-02-15', end='2023-02-15', n_stocks=500):
        self.dataset_path = dataset_path
        self.start = start
        self.end = end
        self.n_stocks = n_stocks

    def load_dataset(self):
        """
        Loads the dataset provided, selects the date range and a subset of columns,
        and names the index levels ('ticker', 'date').
        """
        # built locally so the class does not depend on a notebook-global `idx`
        idx = pd.IndexSlice
        columns = ['adjusted_close', 'volume', 'RSI', 'MACD', 'MACD_signal',
                   'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower']

        # load the dataset
        dataset = pd.read_hdf(self.dataset_path, 'stocks/prices/daily')

        # select the date range and a subset of columns
        dataset = dataset.loc[idx[:, self.start:self.end], columns]

        # assign names to the index
        dataset.index.names = ['ticker', 'date']

        return dataset

    def select_most_traded_stocks(self, dataset):
        """
        Selects the most traded stocks based on dollar volume and returns a DataFrame
        of their daily percentage returns (dates x tickers, newest date first).
        """
        idx = pd.IndexSlice

        # get the number of unique dates in the dataset
        n_dates = len(dataset.index.unique('date'))

        # rank tickers by dollar volume on each date; tickers observed on fewer
        # than 95% of the dates are dropped before ranking
        dollar_vol = (dataset.adjusted_close.mul(dataset.volume)
                      .unstack('ticker')
                      .dropna(thresh=int(.95 * n_dates), axis=1)
                      .rank(ascending=False, axis=1)
                      .stack('ticker'))

        # keep the tickers with the lowest (best) mean rank
        most_traded = dollar_vol.groupby(level='ticker').mean().nsmallest(self.n_stocks).index

        # calculate daily percentage returns for the most traded stocks
        returns = (dataset.loc[idx[most_traded, :], 'adjusted_close']
                   .unstack('ticker')
                   .pct_change()
                   .sort_index(ascending=False))

        return returns

    def create_sliding_window(self, returns, T):
        """
        Stacks every window of T + 1 consecutive rows of `returns` into one row per
        (ticker, date). Column 0 (the return on `date`) is renamed to 'label';
        columns 1..T hold the T following rows and are clipped to their 1st/99th
        percentiles. `returns` is expected newest-first, as produced by
        `select_most_traded_stocks`.

        Raises ValueError when `returns` has too few rows for a single window.
        """
        # number of complete windows available
        n_windows = len(returns) - T - 1
        if n_windows <= 0:
            raise ValueError(f'need more than {T + 1} return rows, got {len(returns)}')

        # lag columns 1..T (column 0 is the label and stays unclipped)
        lag_cols = list(range(1, T + 1))

        # get the tickers from the returns DataFrame
        tickers = returns.columns

        # create a list of DataFrames for each window
        data_list = [returns.iloc[i:i+T+1].reset_index(drop=True).T
                     .assign(date=returns.index[i], ticker=tickers)
                     .set_index(['ticker', 'date'])
                     for i in range(n_windows)]

        # concatenate the list of DataFrames into a single DataFrame
        data = pd.concat(data_list)

        # rename the first column to 'label' and drop any rows with missing values
        data = data.rename(columns={0: 'label'}).sort_index().dropna()

        # clip all remaining columns; the previous `range(T)[1:]` left column T unclipped
        data.loc[:, lag_cols] = (data.loc[:, lag_cols].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                                              upper=x.quantile(.99))))

        return data

In [41]:
# Run the daily pipeline end to end: load -> pick most traded tickers -> build windows.
preprocessing = Preprocessing('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5')

dataset = preprocessing.load_dataset()
returns = preprocessing.select_most_traded_stocks(dataset)

# Number of lagged returns per row.
T = 21 # days

data = preprocessing.create_sliding_window(returns, T)

In [42]:
# Display the windowed frame: one row per (ticker, date) with 'label' and the lag columns.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,label,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2013-03-20,0.016668,-0.012926,-0.010694,-0.007157,0.007441,0.008915,-0.004437,-0.005111,-0.005085,0.000230,...,0.002386,0.010848,-0.005993,0.018551,-0.007750,-0.012200,0.004082,-0.014441,-0.017902,0.017985
A,2013-03-21,-0.024591,0.016668,-0.012926,-0.010694,-0.007157,0.007441,0.008915,-0.004437,-0.005111,-0.005085,...,0.014990,0.002386,0.010848,-0.005993,0.018551,-0.007750,-0.012200,0.004082,-0.014441,-0.017902
A,2013-03-22,-0.009603,-0.024591,0.016668,-0.012926,-0.010694,-0.007157,0.007441,0.008915,-0.004437,-0.005111,...,0.013596,0.014990,0.002386,0.010848,-0.005993,0.018551,-0.007750,-0.012200,0.004082,-0.014441
A,2013-03-25,-0.002911,-0.009603,-0.024591,0.016668,-0.012926,-0.010694,-0.007157,0.007441,0.008915,-0.004437,...,0.000230,0.013596,0.014990,0.002386,0.010848,-0.005993,0.018551,-0.007750,-0.012200,0.004082
A,2013-03-26,0.016775,-0.002911,-0.009603,-0.024591,0.016668,-0.012926,-0.010694,-0.007157,0.007441,0.008915,...,-0.005085,0.000230,0.013596,0.014990,0.002386,0.010848,-0.005993,0.018551,-0.007750,-0.012200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZTS,2023-02-09,-0.007774,-0.021139,0.003880,-0.017104,-0.018713,0.019010,0.014019,0.004797,-0.002906,-0.018188,...,-0.005157,0.017948,0.011485,0.014756,0.002507,-0.008390,0.006316,0.012729,0.022933,0.049640
ZTS,2023-02-10,-0.015422,-0.007774,-0.021139,0.003880,-0.017104,-0.018713,0.019010,0.014019,0.004797,-0.002906,...,-0.002291,-0.005157,0.017948,0.011485,0.014756,0.002507,-0.008390,0.006316,0.012729,0.022933
ZTS,2023-02-13,0.030380,-0.015422,-0.007774,-0.021139,0.003880,-0.017104,-0.018713,0.019010,0.014019,0.004797,...,0.016494,-0.002291,-0.005157,0.017948,0.011485,0.014756,0.002507,-0.008390,0.006316,0.012729
ZTS,2023-02-14,0.053696,0.030380,-0.015422,-0.007774,-0.021139,0.003880,-0.017104,-0.018713,0.019010,0.014019,...,-0.018188,0.016494,-0.002291,-0.005157,0.017948,0.011485,0.014756,0.002507,-0.008390,0.006316


In [None]:
# data.to_hdf('lstm_data.h5', 'returns_daily')

# with pd.HDFStore('lstm_data.h5') as store:
#     print(store.info())

In [8]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

# Blanket warning suppression for the kernel session.
warnings.filterwarnings('ignore')

# MultiIndex slicing shorthand; the Preprocessing class below reads this module-level name.
idx = pd.IndexSlice

class Preprocessing:
    """Weekly-return preprocessing for the stock price store.

    Loads daily prices, keeps the most traded tickers, converts their adjusted
    closes to weekly returns and stacks them into fixed-length windows with a
    forward-return target and a binary label.
    """

    def __init__(self, dataset_path):
        # location of the HDF5 store that holds 'stocks/prices/daily'
        self.dataset_path = dataset_path

    def load_dataset(self):
        """
        Read 'stocks/prices/daily', keep rows dated 2005-02-15 through 2023-02-15 and
        the price, volume and indicator columns, and name the index levels.
        """
        wanted_columns = ['adjusted_close', 'volume', 'RSI', 'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower']
        date_window = idx[:, '2005-02-15':'2023-02-15']

        frame = pd.read_hdf(self.dataset_path, 'stocks/prices/daily')
        frame = frame.loc[date_window, wanted_columns]

        # index levels are (ticker, date)
        frame.index.names = ['ticker', 'date']
        return frame

    def select_most_traded_stocks(self, dataset):
        """
        Keep the (up to) 500 tickers with the best mean dollar-volume rank and return
        their weekly percentage returns, newest week first.
        """
        date_count = len(dataset.index.unique('date'))
        min_observations = int(.95 * date_count)

        # per-date dollar volume, one column per ticker
        dollar_volume = dataset.adjusted_close.mul(dataset.volume).unstack('ticker')

        # drop sparsely observed tickers, then rank within each date (1 = highest)
        ranked = (dollar_volume
                  .dropna(thresh=min_observations, axis=1)
                  .rank(ascending=False, axis=1)
                  .stack('ticker'))

        top_tickers = ranked.groupby(level='ticker').mean().nsmallest(500).index

        closes = dataset.loc[idx[top_tickers, :], 'adjusted_close'].unstack('ticker')
        closes.index = pd.to_datetime(closes.index)

        # last close of each week -> week-over-week change; weeks with any missing value are dropped
        weekly = closes.resample('W').last()
        weekly = weekly.pct_change().dropna(axis=0)
        weekly = weekly.sort_index(ascending=False)

        # kept from the original: make sure the index is a datetime index
        weekly.index = pd.to_datetime(weekly.index)
        return weekly

    def create_sliding_window(self, returns, T):
        """
        Stack every window of T + 1 consecutive rows of `returns` into one row per
        (ticker, date). Column 0 becomes 'fwd_returns', columns 1..T are clipped to
        their 1st/99th percentiles, and 'label' flags a positive forward return.
        """
        row_count = len(returns)
        lag_columns = list(range(1, T + 1))
        symbols = returns.columns

        pieces = []
        for start in range(row_count - T - 1):
            block = returns.iloc[start:start + T + 1].reset_index(drop=True).T
            block = block.assign(date=returns.index[start], ticker=symbols)
            pieces.append(block.set_index(['ticker', 'date']))

        stacked = pd.concat(pieces)
        stacked = stacked.rename(columns={0: 'fwd_returns'}).sort_index().dropna()

        def winsorize(column):
            # limit each lag column to its own 1st..99th percentile range
            return column.clip(lower=column.quantile(.01), upper=column.quantile(.99))

        stacked.loc[:, lag_columns] = stacked.loc[:, lag_columns].apply(winsorize)

        # 1 when the forward return is strictly positive, else 0
        stacked['label'] = (stacked['fwd_returns'] > 0).astype(int)
        return stacked
    

In [137]:
# Run the weekly pipeline end to end: load -> weekly returns of the most traded tickers -> windows.
preprocessing = Preprocessing('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5')

dataset = preprocessing.load_dataset()

returns = preprocessing.select_most_traded_stocks(dataset)

# Number of lagged weekly returns per row.
T = 52 # weeks

data = preprocessing.create_sliding_window(returns, T)

In [138]:
# Summarize the weekly window frame (index range, columns, non-null counts).
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 354903 entries, ('A', Timestamp('2006-12-31 00:00:00')) to ('ZION', Timestamp('2023-02-19 00:00:00'))
Data columns (total 54 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   fwd_returns  354903 non-null  float64
 1   1            354903 non-null  float64
 2   2            354903 non-null  float64
 3   3            354903 non-null  float64
 4   4            354903 non-null  float64
 5   5            354903 non-null  float64
 6   6            354903 non-null  float64
 7   7            354903 non-null  float64
 8   8            354903 non-null  float64
 9   9            354903 non-null  float64
 10  10           354903 non-null  float64
 11  11           354903 non-null  float64
 12  12           354903 non-null  float64
 13  13           354903 non-null  float64
 14  14           354903 non-null  float64
 15  15           354903 non-null  float64
 16  16           354903 non-null  float6

In [146]:
# Number of distinct values in the first index level (the tickers).
data.index.get_level_values(0).nunique()

421

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Reshape, BatchNormalization, concatenate
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

class LSTM_model:
    """Stacked-LSTM regression model on rolling return windows.

    The network combines three inputs: the sequence of lagged returns
    (columns 1..window_size of `data`), an integer ticker code fed through an
    embedding, and twelve month-of-year dummies. The target is the
    'fwd_returns' column.

    Parameters
    ----------
    data : DataFrame
        Indexed by ('ticker', 'date') with columns 'fwd_returns' and the
        integer lag columns 1..window_size.
    window_size : int
        Number of lagged returns fed to the LSTM.
    lstm1_units, lstm2_units : int
        Units of the first and second LSTM layer.
    embedding_dim : int
        Size of the ticker embedding.
    """

    def __init__(self, data, window_size=52, lstm1_units=25, lstm2_units=10, embedding_dim=5):
        self.data = data
        self.window_size = window_size
        self.lstm1_units = lstm1_units
        self.lstm2_units = lstm2_units
        self.embedding_dim = embedding_dim
        self.model = None
        self.results_path = 'results/lstm_model'

    @property
    def checkpoint_path(self):
        """File the best model is saved to during training and reloaded from for evaluation."""
        return f'{self.results_path}/lstm.regression.h5'

    def prepare_data(self, train_end='2016', test_period='2017'):
        """
        Prepare input data for the LSTM model and train-test split.

        Rows dated up to and including `train_end` form the training set; rows
        matching `test_period` form the test set. Each X is a list of
        [returns array of shape (rows, window_size, 1), ticker codes, month dummies].

        Returns (X_train, y_train, X_test, y_test); the y values are the
        'fwd_returns' Series of the respective split.
        """
        # built locally so the class does not depend on a notebook-global `idx`
        idx = pd.IndexSlice
        window_size = self.window_size
        sequence = list(range(1, window_size + 1))

        data = self.data.copy()
        data['ticker'] = pd.factorize(data.index.get_level_values('ticker'))[0]
        # A categorical with all 12 months guarantees 12 dummy columns even when
        # some month never occurs, matching the model's Input(shape=(12,)).
        data['month'] = pd.Categorical(data.index.get_level_values('date').month,
                                       categories=list(range(1, 13)))
        # float dummies: recent pandas defaults to bool, which does not mix with float inputs
        data = pd.get_dummies(data, columns=['month'], prefix='month', dtype='float32')

        def as_inputs(frame):
            """Assemble the three model inputs for one split."""
            return [
                frame.loc[:, sequence].values.reshape(-1, window_size, 1),
                frame.ticker,
                frame.filter(like='month')
            ]

        train_data = data.loc[idx[:, :train_end], :]
        test_data = data.loc[idx[:, test_period], :]

        X_train = as_inputs(train_data)
        y_train = train_data.fwd_returns

        X_test = as_inputs(test_data)
        y_test = test_data.fwd_returns

        return X_train, y_train, X_test, y_test

    def create_model(self):
        """
        Create the LSTM model architecture and compile it with MSE loss and Adam.
        """
        K.clear_session()
        n_features = 1
        window_size = self.window_size
        # embedding vocabulary size: one entry per distinct ticker in the data
        n_tickers = self.data.index.get_level_values(0).nunique()

        # Input Layers
        returns = Input(shape=(window_size, n_features), name='Returns')
        tickers = Input(shape=(1,), name='Tickers')
        months = Input(shape=(12,), name='Months')

        # LSTM Layers
        lstm1 = LSTM(units=self.lstm1_units,
                     input_shape=(window_size, n_features),
                     name='LSTM1',
                     dropout=.2,
                     return_sequences=True)(returns)
        lstm_model = LSTM(units=self.lstm2_units,
                          dropout=.2,
                          name='LSTM2')(lstm1)

        # Embedding Layer
        ticker_embedding = Embedding(input_dim=n_tickers,
                                     output_dim=self.embedding_dim,
                                     input_length=1)(tickers)
        ticker_embedding = Reshape(target_shape=(self.embedding_dim,))(ticker_embedding)

        # Concatenate Model components
        merged = concatenate([lstm_model, ticker_embedding, months], name='Merged')
        bn = BatchNormalization()(merged)
        hidden_dense = Dense(10, name='FC1')(bn)
        output = Dense(1, name='Output')(hidden_dense)

        # Create and compile the model
        self.model = Model(inputs=[returns, tickers, months], outputs=output)
        # summary() prints by itself; wrapping it in print() added a stray "None"
        self.model.summary()
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())

    def train_model(self, X_train, y_train, X_test, y_test, epochs=50, batch_size=64, save_best_only=True, early_stopping_patience=5):
        """
        Train the LSTM model using the prepared input data.

        The test split is used as validation data; the best model by validation
        loss is checkpointed to `checkpoint_path`. Returns the Keras history
        object produced by `fit`.
        """
        # imported here because the defining cell does not import pathlib
        from pathlib import Path

        # Create directory to save model if it does not exist
        Path(self.results_path).mkdir(parents=True, exist_ok=True)

        # Model checkpoint callback to save the best model based on validation loss
        checkpointer = ModelCheckpoint(filepath=self.checkpoint_path,
                                    verbose=1,
                                    monitor='val_loss',
                                    mode='min',
                                    save_best_only=save_best_only)

        # Early stopping callback to stop training if validation loss does not improve after certain number of epochs
        early_stopping = EarlyStopping(monitor='val_loss',
                                    patience=early_stopping_patience,
                                    restore_best_weights=True)

        # Train the model
        training = self.model.fit(X_train,
                                y_train,
                                epochs=epochs,
                                batch_size=batch_size,
                                validation_data=(X_test, y_test),
                                callbacks=[early_stopping, checkpointer],
                                verbose=1)

        return training

    def evaluate_model(self, X_test, y_test):
        """
        Evaluate the model on the test dataset and return predictions
        as a Series aligned to `y_test.index`.
        """
        # Load the best model saved during training
        self.model.load_weights(self.checkpoint_path)

        # Get model predictions on the test dataset
        test_predict = pd.Series(self.model.predict(X_test).squeeze(), index=y_test.index)

        return test_predict


In [10]:
# Create an instance of the Preprocessing class
preprocessing = Preprocessing('/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5')

# Load and preprocess the dataset
dataset = preprocessing.load_dataset()
returns = preprocessing.select_most_traded_stocks(dataset)
data = preprocessing.create_sliding_window(returns, T=52)

# Create an instance of the LSTM_model class
lstm_model = LSTM_model(data)

# Prepare the input data for training
X_train, y_train, X_test, y_test = lstm_model.prepare_data()

# Create the LSTM model architecture
lstm_model.create_model()

# Train the model (the test split doubles as the validation set inside train_model)
training = lstm_model.train_model(X_train, y_train, X_test, y_test)

# Evaluate the model and get predictions on the test dataset
test_predict = lstm_model.evaluate_model(X_test, y_test)

2023-04-06 20:22:49.272462: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-06 20:22:49.273532: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-06 20:22:49.275626: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Returns (InputLayer)           [(None, 52, 1)]      0           []                               
                                                                                                  
 Tickers (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 LSTM1 (LSTM)                   (None, 52, 25)       2700        ['Returns[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 1, 5)         2105        ['Tickers[0][0]']                
                                                                                              

2023-04-06 20:22:49.832717: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-06 20:22:49.834017: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-06 20:22:49.835308: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



KeyboardInterrupt: 

In [None]:
from azureml.core import Workspace, Experiment, Environment, ScriptRunConfig, Dataset
from azureml.core.compute import ComputeTarget

# Load the workspace
workspace = Workspace.from_config()

# Get the compute target
# NOTE(review): 'your-compute-cluster' looks like a placeholder - replace before running.
compute_target = ComputeTarget(workspace=workspace, name='your-compute-cluster')

# Get the dataset
# NOTE(review): 'your-dataset-name' looks like a placeholder - replace before running.
dataset = Dataset.get_by_name(workspace, 'your-dataset-name')

# Define the training environment
# NOTE(review): the pip packages are added without version pins.
env = Environment('lstm-env')
env.python.conda_dependencies.add_pip_package('tensorflow')
env.python.conda_dependencies.add_pip_package('pandas')
env.python.conda_dependencies.add_pip_package('numpy')
env.python.conda_dependencies.add_pip_package('h5py')

# Define the script run configuration
# NOTE(review): 'path-to-training-script' looks like a placeholder; train.py is not part of this notebook.
# The dataset is mounted and its location is passed to the script as --data-path.
src = ScriptRunConfig(source_directory='path-to-training-script',
                      script='train.py',
                      compute_target=compute_target,
                      environment=env,
                      arguments=['--data-path', dataset.as_named_input('input').as_mount()])

# Submit the experiment and block until the run finishes, streaming its output
experiment = Experiment(workspace=workspace, name='lstm-training')
run = experiment.submit(src)
run.wait_for_completion(show_output=True)

In [14]:

import os

from azure.storage.blob import BlobServiceClient

storage_account_name = "atsmain"
# SECURITY: the account key must never be stored in the notebook. It is read
# from the environment instead. The key that was previously committed here
# should be treated as compromised and rotated in the storage account.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    raise RuntimeError("Set the AZURE_STORAGE_ACCOUNT_KEY environment variable before running this cell")
container_name = "test"
local_file_path = '/home/groovyjac/projects/autonomous-portfolio-management/main_data_store_JDKv1.h5'
blob_file_name = "main_data_store_JDKv1.h5"

# Create a BlobServiceClient
connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Get the container client
container_client = blob_service_client.get_container_client(container_name)

# Upload the dataset file to the Blob Storage.
# The handle is deliberately not called `data`, which would overwrite the
# notebook's `data` DataFrame built by the preprocessing cells.
with open(local_file_path, "rb") as blob_stream:
    container_client.upload_blob(blob_file_name, blob_stream, overwrite=True)

KeyboardInterrupt: 

In [16]:
from azureml.core import Workspace, Datastore, Dataset

# Replace these variables with your own values
subscription_id = "3bc70aff-a4bc-47bc-9782-28269ac52dfa"
resource_group = "autonomous-portfolio-management"
workspace_name = "ats-lstm"

# Load your workspace
workspace = Workspace(subscription_id, resource_group, workspace_name)

# Get the datastore
# `storage_account_name` comes from the blob-upload cell above.
# NOTE(review): this passes the storage account name as the datastore name - confirm a datastore with that name is registered.
datastore = Datastore.get(workspace, storage_account_name)

# Register the dataset
# `blob_file_name` also comes from the blob-upload cell; it is used as the path inside the datastore.
datastore_path = [(datastore, blob_file_name)]
dataset = Dataset.File.from_files(datastore_path)
dataset = dataset.register(workspace, 'main_data_store_JDKv1', create_new_version=True)

NotImplementedError: Linux distribution ubuntu 22.04 does not have automatic support. 
Missing packages: {'liblttng-ust.so.0'}
.NET Core 3.1 can still be used via `dotnetcore2` if the required dependencies are installed.
Visit https://aka.ms/dotnet-install-linux for Linux distro specific .NET Core install instructions.
Follow your distro specific instructions to install `dotnet-runtime-*` and replace `*` with `3.1.23`.
