In [3]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Using cached numba-0.59.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting pooch>=1.1 (from librosa)
  Using cached pooch-1.8.1-py3-none-any.whl.metadata (9.5 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.0.8-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.1 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba>=0.51.0->librosa)
  Using cached llvmlite-0.42.0-cp312-cp312-mac

In [1]:
import pandas as pd
import numpy as np
import librosa

# Read the Feather file
df = pd.read_feather('CRSP_daily_data_for_project(Technical_Analysis).feather')
df = df[(df['PERMNO']==10000) |( df['PERMNO']==10001)|(df['PERMNO']==10002)| (df['PERMNO']==93434)|(df['PERMNO']==93435)|(df['PERMNO']==93436)]


In [2]:
df[df['PERMNO']==10000]

Unnamed: 0,PERMNO,date,Open,High,Low,Close,Volume,SHROUT,vwretx,ewretx,sprtrn
1,10000,1986-01-07,,2.750,2.3750,2.56250,1000.0,3680.0,0.013800,0.011046,0.014954
2,10000,1986-01-08,,2.625,2.3750,2.50000,12800.0,3680.0,-0.020750,-0.005135,-0.027268
3,10000,1986-01-09,,2.625,2.3750,2.50000,1400.0,3680.0,-0.011315,-0.011659,-0.008944
4,10000,1986-01-10,,2.625,2.3750,2.50000,8500.0,3680.0,0.000047,0.003632,-0.000728
5,10000,1986-01-13,,2.750,2.5000,2.62500,5450.0,3680.0,0.002680,0.002369,0.003690
...,...,...,...,...,...,...,...,...,...,...,...
359,10000,1987-06-08,,0.250,0.1875,0.21875,0.0,3893.0,0.008563,0.001508,0.011143
360,10000,1987-06-09,,0.250,0.1875,0.21875,0.0,3893.0,0.001918,0.002217,0.001887
361,10000,1987-06-10,,0.250,0.1875,0.21875,0.0,3893.0,0.001492,0.001049,0.000639
362,10000,1987-06-11,,0.250,0.1875,0.21875,500.0,3893.0,0.003427,0.002650,0.004236


In [3]:

# Get the list of all stock codes
stock_codes = df['PERMNO'].unique()

cqt_window_size = 30
n_bins = 20  # Number of CQT bins

# Get all unique dates
unique_dates = df['date'].unique()

# Create the 4D array, initialized with NaN
num_dates = len(unique_dates)
max_shape = (len(stock_codes), num_dates, cqt_window_size, n_bins)
final_4d_array = np.full(max_shape, np.nan)

# Create the date-to-index mapping
date_to_index = {date: idx for idx, date in enumerate(unique_dates)}

# Create the stock code-to-index mapping
stock_code_to_index = {stock_code: idx for idx, stock_code in enumerate(stock_codes)}



In [4]:
# Function: Generate CQT data for each stock
def create_cqt_df(stock_df):
    stock_returns = stock_df['vwretx'].values

    # Data preprocessing: Ensure all values are finite
    stock_returns = np.nan_to_num(stock_returns, nan=0.0, posinf=0.0, neginf=0.0)

    sr = 1
    fmin = 0.01  # Minimum frequency
    hop_length = 1
    cqt_result = librosa.cqt(stock_returns, n_bins=n_bins, sr=sr, hop_length=hop_length, fmin=fmin)
    cqt_result_db = librosa.amplitude_to_db(np.abs(cqt_result), ref=np.mean)

    cqt_df = pd.DataFrame(cqt_result_db.T, columns=[f'CQT_{i+1}' for i in range(cqt_result_db.shape[0])])
    cqt_df = cqt_df.loc[:len(stock_df)-1, :]  # Ensure the length matches the original data
    cqt = pd.concat([stock_df['date'], cqt_df], axis=1)
    return cqt

# Function: Create rolling windows and retain date indices
def create_rolling_windows_with_dates(data, cqt_window_size):
    windows = []
    date_indices = []
    for i in range(len(data) - cqt_window_size + 1):
        window = data.iloc[i:i + cqt_window_size, 1:].values  # Exclude the date column, keep only CQT data
        windows.append(window)
        start_date = data.iloc[i]['date']
        if start_date in date_to_index:
            date_indices.append(date_to_index[start_date])
    return np.array(windows), date_indices


In [5]:

# Iterate over each stock and fill the 4D array
for stock_code in stock_codes:
    stock_idx = stock_code_to_index[stock_code]
    stock_data = df[df['PERMNO'] == stock_code].reset_index(drop=True)
    
    # Generate CQT data
    cqt_data = create_cqt_df(stock_data)

    # Create rolling windows and retain date indices
    rolling_windows_3d, date_indices = create_rolling_windows_with_dates(cqt_data, cqt_window_size)
    
    # Print rolling windows shape and date indices

    
    # Fill the 4D array
    for window_idx, (window_data, date_idx) in enumerate(zip(rolling_windows_3d, date_indices)):
        if window_data.shape == (cqt_window_size, n_bins):
            final_4d_array[stock_idx, date_idx, :, :] = window_data



final_4d_array.shape



(6, 9573, 30, 20)

In [6]:
stock_codes

array([10000, 10001, 10002, 93434, 93435, 93436])

In [7]:
df[(df['PERMNO']==10000)].shape

(363, 11)

In [8]:
ret = df[(df['PERMNO']==10000)]['Close'].pct_change()

  ret = df[(df['PERMNO']==10000)]['Close'].pct_change()


In [9]:
# Extract the CQT data for the first stock without NaN values
stock_data = final_4d_array[0,:334]
stock_data

array([[[ -7.23281952, -25.83381724,  -5.11713754, ...,  -5.64655261,
          -9.02919624,  -5.23870194],
        [ -7.22033517, -25.78717007,  -5.10347597, ...,  -5.53454498,
          -8.86783491,  -5.13329579],
        [ -7.20787396, -25.73651811,  -5.0898552 , ...,  -5.42318056,
          -8.70785397,  -5.02865711],
        ...,
        [ -6.90428842, -23.59520902,  -4.76288093, ...,  -2.86606545,
          -5.14175586,  -2.6593852 ],
        [ -6.89247959, -23.49095546,  -4.75035481, ...,  -2.77318641,
          -5.01535421,  -2.57426256],
        [ -6.88069777, -23.38629951,  -4.73787204, ...,  -2.68103438,
          -4.89010123,  -2.48985862]],

       [[ -7.22033517, -25.78717007,  -5.10347597, ...,  -5.53454498,
          -8.86783491,  -5.13329579],
        [ -7.20787396, -25.73651811,  -5.0898552 , ...,  -5.42318056,
          -8.70785397,  -5.02865711],
        [ -7.19543603, -25.68198863,  -5.07627533, ...,  -5.31246923,
          -8.54925428,  -4.92478595],
        ...,


In [38]:
!pip install keras tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes (from keras)
  Downloading ml_dtypes-0.3.2-cp312-cp312-macosx_10_9_universal2.whl.metadata (20 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21

In [10]:
# Apply LSTM model to the CQT data
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()
stock_data_normalized = scaler.fit_transform(stock_data.reshape(stock_data.shape[0], stock_data.shape[1] * stock_data.shape[2]))

X, y = np.array(stock_data_normalized), np.array(ret[-334:])
# X = X.reshape(X.shape[0], 1, X.shape[1])


In [11]:
stock_data.shape

(334, 30, 20)

In [12]:
stock_data.reshape(stock_data.shape[0], 1, stock_data.shape[1] * stock_data.shape[2]).shape

(334, 1, 600)

In [13]:
stock_data_normalized.shape

(334, 600)

In [14]:
X.shape

(334, 600)

In [15]:
y.shape

(334,)

In [18]:
split_index = int(len(X)*0.95)
split_index

317

In [19]:
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((317, 600), (17, 600), (317,), (17,))

In [20]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')


In [21]:
# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1)

# Evaluate the model
model.evaluate(X_test, y_test)


Epoch 1/100


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 132ms/step - loss: 0.0033
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - loss: 0.0019
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - loss: 0.0028
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - loss: 0.0030
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - loss: 0.0021
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - loss: 0.0027
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step - loss: 0.0025
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - loss: 0.0020
Epoch 9/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - loss: 0.0020
Epoch 10/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - loss

0.00011780263594118878