In [1]:
import pandas as pd
import numpy as np
import librosa

# Load the Feather file, then keep only the rows whose PERMNO is one of the
# securities listed below.
selected_permnos = [10000, 10001, 10002, 93434, 93435, 93436]
df = pd.read_feather('CRSP_daily_data_for_project(Technical_Analysis).feather')
df = df[df['PERMNO'].isin(selected_permnos)]


In [2]:

# Get the list of all stock codes (in order of first appearance in df)
stock_codes = df['PERMNO'].unique()

cqt_window_size = 30  # Number of consecutive rows in each rolling window
n_bins = 20  # Number of CQT bins

# Get all unique dates, sorted ascending.
# Series.unique() alone returns values in order of first appearance, which is
# not guaranteed to be chronological once several stocks with different date
# ranges are stacked in one frame; sorting makes the date axis of the 4D array
# ordered.
date_values = df['date'].drop_duplicates().sort_values()
unique_dates = date_values.to_numpy()

# Create the 4D array (stock, date, window row, CQT bin), initialized with NaN
num_dates = len(unique_dates)
max_shape = (len(stock_codes), num_dates, cqt_window_size, n_bins)
final_4d_array = np.full(max_shape, np.nan)

# Create the date-to-index mapping.
# Keys come from iterating the pandas Series (not the NumPy array) so they are
# the same scalar objects pandas hands back when a 'date' cell is read from a
# row later on (e.g. Timestamp for datetime columns). numpy.datetime64 keys are
# not guaranteed to hash like Timestamp on every NumPy version, which would make
# membership tests against this dict silently fail.
date_to_index = {date: idx for idx, date in enumerate(date_values)}

# Create the stock code-to-index mapping
stock_code_to_index = {stock_code: idx for idx, stock_code in enumerate(stock_codes)}



In [3]:
# Function: Generate CQT data for each stock
def create_cqt_df(stock_df):
    """Compute a constant-Q transform of one stock's return series.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Rows for a single stock. Must contain the columns 'vwretx' (the series
        that is transformed) and 'date'. Any index is accepted; alignment is
        positional.

    Returns
    -------
    pandas.DataFrame
        A 'date' column followed by one column per CQT bin, named
        'CQT_1' ... 'CQT_k'. Values are the CQT magnitudes converted to dB
        relative to their mean. The CQT rows are trimmed to at most
        ``len(stock_df)`` rows so they line up with the dates.

    Notes
    -----
    Reads the module-level ``n_bins`` for the number of CQT bins.
    """
    # NOTE(review): in CRSP extracts 'vwretx' is usually the value-weighted
    # market index return rather than a per-security return - confirm this is
    # the intended input column.
    stock_returns = stock_df['vwretx'].values

    # Data preprocessing: replace NaN and +/-inf with 0.0 so every sample is finite
    stock_returns = np.nan_to_num(stock_returns, nan=0.0, posinf=0.0, neginf=0.0)

    sr = 1  # Sampling rate passed to librosa (one sample per row)
    fmin = 0.01  # Minimum frequency
    hop_length = 1  # Advance one sample per CQT frame
    cqt_result = librosa.cqt(stock_returns, n_bins=n_bins, sr=sr, hop_length=hop_length, fmin=fmin)
    cqt_result_db = librosa.amplitude_to_db(np.abs(cqt_result), ref=np.mean)

    # Transpose so that rows are frames and columns are bins
    cqt_df = pd.DataFrame(cqt_result_db.T, columns=[f'CQT_{i+1}' for i in range(cqt_result_db.shape[0])])
    cqt_df = cqt_df.iloc[:len(stock_df)]  # Ensure the length matches the original data

    # pd.concat(axis=1) aligns on index labels, so give the dates the same
    # 0..n-1 index as cqt_df instead of relying on the caller to have reset it.
    dates = stock_df['date'].reset_index(drop=True)
    cqt = pd.concat([dates, cqt_df], axis=1)
    return cqt

# Function: Create rolling windows and retain date indices
def create_rolling_windows_with_dates(data, cqt_window_size, date_index_map=None):
    """Cut ``data`` into overlapping windows, each tagged with its start-date index.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'date' column. The first column is excluded from the
        windows; every remaining column is treated as a feature.
    cqt_window_size : int
        Number of consecutive rows in each window.
    date_index_map : mapping, optional
        Maps a date value to its integer position. Defaults to the
        module-level ``date_to_index``.

    Returns
    -------
    windows : numpy.ndarray
        Stacked windows with shape (n_windows, cqt_window_size, n_features).
        ``n_windows`` is 0 when no window qualifies.
    date_indices : list
        ``date_indices[k]`` is the mapped index of the first row's date of
        ``windows[k]``.

    Notes
    -----
    A window whose start date is missing from the mapping is dropped entirely,
    so ``windows`` and ``date_indices`` always have the same length and stay
    paired when zipped together.
    """
    if date_index_map is None:
        date_index_map = date_to_index

    # Pull the feature block and the dates out once instead of building a
    # row Series on every iteration.
    features = data.iloc[:, 1:].values  # Exclude the first (date) column
    start_dates = data['date'].tolist()

    windows = []
    date_indices = []
    for i in range(len(data) - cqt_window_size + 1):
        start_date = start_dates[i]
        if start_date not in date_index_map:
            # Skip the window as well as the index; appending only one of the
            # two would shift every later window onto the wrong date.
            continue
        windows.append(features[i:i + cqt_window_size])
        date_indices.append(date_index_map[start_date])

    if not windows:
        # Keep the result 3-D even when there is nothing to return
        return np.empty((0, cqt_window_size, features.shape[1])), date_indices
    return np.array(windows), date_indices


In [6]:

# Populate the 4D array one stock at a time
expected_window_shape = (cqt_window_size, n_bins)
for stock_code in stock_codes:
    stock_idx = stock_code_to_index[stock_code]

    # Rows of this stock only, re-indexed from 0
    stock_data = df.loc[df['PERMNO'] == stock_code].reset_index(drop=True)

    # CQT features for the stock, then rolling windows paired with the
    # date-axis index of each window's first row
    cqt_data = create_cqt_df(stock_data)
    rolling_windows_3d, date_indices = create_rolling_windows_with_dates(cqt_data, cqt_window_size)

    # Write each window into its (stock, date) slot; anything that does not
    # have the expected (window, bins) shape is left as NaN
    for window_data, date_idx in zip(rolling_windows_3d, date_indices):
        if window_data.shape != expected_window_shape:
            continue
        final_4d_array[stock_idx, date_idx, :, :] = window_data



final_4d_array.shape



(6, 9573, 30, 20)