In [1]:
import numpy as np
import pandas as pd

In [2]:
def create_lookback_windows(df, lookback=12, feature_cols=None):
    """
    Attach a trailing window of feature rows to every row of every stock (grouped by 'permno').

    For row i of a stock (rows ordered by date), the window holds the feature values of rows
    i - lookback + 1 .. i (inclusive), i.e. it ends with the current row. If the current row's
    calendar year is earlier than its 'entry_year', the window is replaced by zeros.

    Only rows with a full window are emitted: the first lookback - 1 rows of each stock are
    dropped, and stocks with fewer than 'lookback' rows are skipped entirely.

    The input DataFrame is not modified; the date conversion and sorting happen on a copy.

    Parameters:
      df (pd.DataFrame): DataFrame that must include at least the columns: 'permno', 'date', and 'entry_year'.
                         'entry_year' is compared against the year of 'date', so it is expected to be numeric.
      lookback (int): The number of rows (months) to include in each window. Must be >= 1.
      feature_cols (list): List of columns to include in the window. If None, defaults to all columns
                           except ['permno', 'date', 'entry_year'].

    Returns:
      pd.DataFrame: A new DataFrame (ordered by permno and date) that includes only rows
                    with a complete lookback window and an additional column "lookback"
                    containing a 2D numpy array of shape (lookback, len(feature_cols)).
                    The index of each row is its position within its own stock's history,
                    so index values repeat across stocks. If no row qualifies, an empty
                    DataFrame with the input columns plus "lookback" is returned.

    Raises:
      ValueError: If 'lookback' is smaller than 1.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be a positive integer, got {lookback!r}.")

    if feature_cols is None:
        feature_cols = [col for col in df.columns if col not in ['permno', 'date', 'entry_year']]

    # Work on a copy: converting 'date' in place would silently change the caller's DataFrame.
    df_sorted = df.copy()
    df_sorted['date'] = pd.to_datetime(df_sorted['date'])
    df_sorted = df_sorted.sort_values(by=['permno', 'date'])

    n_features = len(feature_cols)
    results = []

    # groupby keeps the row order inside each group, so every group is already date-sorted.
    for _, group in df_sorted.groupby('permno'):
        # Positional index so that window slicing and iloc agree.
        group = group.reset_index(drop=True)

        # Not enough history for even one full window.
        if len(group) < lookback:
            continue

        # Feature matrix (rows: months, columns: features).
        features_matrix = group[feature_cols].values
        n_rows = features_matrix.shape[0]

        # Start at the first index where a full lookback window exists.
        for i in range(lookback - 1, n_rows):
            row_data = group.iloc[i].copy()

            if row_data['date'].year < row_data['entry_year']:
                # Row precedes the company's entry year: pad with zeros.
                window = np.zeros((lookback, n_features))
            else:
                # Rows (i - lookback + 1) .. i inclusive; copy so windows do not share memory.
                window = features_matrix[i - lookback + 1 : i + 1, :].copy()

            row_data['lookback'] = window
            results.append(row_data)

    if not results:
        # Keep the expected schema so downstream column lookups still work.
        return pd.DataFrame(columns=[*df_sorted.columns, 'lookback'])

    return pd.DataFrame(results)

def mask_entries_before_entry_year(df, lookback_col='lookback', current_feature_cols=None):
    """
    Zero out rows that are dated before January 1 of their 'entry_year'.

    For every such row, the lookback window is swapped for an all-zero array of the same shape
    (only when the stored value is a NumPy array; anything else is left as is) and each of the
    current feature columns is set to 0. Rows dated on or after that day pass through unchanged.

    Parameters:
      df (pd.DataFrame): A DataFrame that must include at least the columns 'date', 'entry_year',
                         and a column (by default named 'lookback') containing the lookback window (a 2D NumPy array).
      lookback_col (str): The name of the column that contains the lookback window.
      current_feature_cols (list or None): Columns to set to 0 on masked rows. If None, every column
                                           other than 'permno', 'date', 'entry_year' and lookback_col is used.

    Returns:
      pd.DataFrame: A new DataFrame built from a copy of the input, with the masking applied.
    """
    if current_feature_cols is None:
        # Everything that is neither an identifier nor the window column counts as a feature.
        reserved = ('permno', 'date', 'entry_year', lookback_col)
        current_feature_cols = [name for name in df.columns if name not in reserved]

    # The caller's frame stays untouched; all work happens on this copy.
    masked = df.copy()
    masked['date'] = pd.to_datetime(masked['date'])

    def _zero_out_if_pre_entry(record):
        # First day of the entry year is the cut-off for masking.
        first_day_of_entry_year = pd.Timestamp(year=int(record['entry_year']), month=1, day=1)

        # Guard clause: rows at or after the cut-off are returned untouched.
        if not (record['date'] < first_day_of_entry_year):
            return record

        # Only an ndarray window can be zeroed; other values are kept.
        existing_window = record[lookback_col]
        if isinstance(existing_window, np.ndarray):
            record[lookback_col] = np.zeros_like(existing_window)

        for name in current_feature_cols:
            record[name] = 0

        return record

    # Row-wise application of the masking rule.
    return masked.apply(_zero_out_if_pre_entry, axis=1)

def df_to_3d_array(df, lookback_col='lookback'):
    """
    Stack the per-row lookback windows of a DataFrame into one 3-dimensional NumPy array.

    Parameters:
      df (pd.DataFrame): DataFrame with a column (by default 'lookback') whose entries are
                         2D NumPy arrays (with shape (lookback, n_features)).
      lookback_col (str): Name of the column that contains the lookback window arrays.

    Returns:
      np.ndarray: A 3D NumPy array with shape (n_rows, lookback, n_features)

    Raises:
      ValueError: If an entry of the column is not a NumPy array or is not 2-dimensional.
    """
    windows = df[lookback_col].tolist()

    # Validate rows in order so the first offending row is the one reported.
    for position, candidate in enumerate(windows):
        is_array = isinstance(candidate, np.ndarray)
        if is_array and candidate.ndim == 2:
            continue
        if not is_array:
            raise ValueError(f"Row {position} in column '{lookback_col}' is not a numpy array.")
        raise ValueError(f"Row {position} in column '{lookback_col}' does not have 2 dimensions.")

    # New leading axis indexes the rows of the DataFrame.
    return np.stack(windows)


In [3]:
# End-to-end preprocessing: CSV -> per-row lookback windows -> masked rows -> 3D array on disk.

# Read the raw data from the CSV file.
df = pd.read_csv('monthly_top50_same_range_carried_forward.csv')

# Ensure 'date' is in datetime format.
df['date'] = pd.to_datetime(df['date'])

# Every column other than the identifiers ('permno', 'date', 'entry_year') is treated as a feature.
feature_cols = [col for col in df.columns if col not in ['permno', 'date', 'entry_year']]

# Create the lookback windows (each "lookback" entry will be an array of shape (12, n_features)).
df_with_lookback = create_lookback_windows(df, lookback=12, feature_cols=feature_cols)

# Pad the DataFrame: rows dated before their entry year get a zero window and zeroed feature columns.
df_padded = mask_entries_before_entry_year(df_with_lookback, lookback_col='lookback', current_feature_cols=feature_cols)

# Stack the windows into a 3D NumPy array and save it to 'final_data.npy'.
data = df_to_3d_array(df_padded, lookback_col='lookback')
np.save('final_data.npy', data)

In [4]:
# Display the dimensions of the stacked array: (n_rows, lookback, n_features).
data.shape

(3381, 12, 22)