In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import logging
import wrds
import math
import gym
from gym import spaces
from torch.optim.optimizer import Optimizer

# Optimizer namespace (imported in addition to the `Optimizer` base class above)
import torch.optim as optim

# Configure root logging: INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Module-level WRDS connection; AlphaPortfolioData._load_wrds_data issues all
# of its SQL through this object.
# NOTE(review): the connection is opened as soon as this cell runs — presumably
# it prompts for credentials unless a .pgpass file exists; confirm.
db = wrds.Connection()

class AlphaPortfolioData(Dataset):
    """Rolling-window dataset built from CRSP prices and Compustat sales.

    Each item is one window made of ``lookback`` consecutive dates of
    per-asset features, the returns over the following ``lookback`` dates,
    and a boolean mask telling which assets are usable in that window.

    Args:
        start_year (int): First calendar year pulled from WRDS.
        end_year (int): Last calendar year pulled from WRDS (inclusive).
        final_year (int): Only rows dated in this year or later are turned
            into windows; earlier rows are still kept in ``self.merged``.
        lookback (int): Number of dates in both the history half and the
            future half of every window.
        G (int): Stored on the instance; not used anywhere in this class.

    Attributes:
        merged (pd.DataFrame): Full price/fundamentals panel.
        final_data (pd.DataFrame): ``merged`` restricted to ``year >= final_year``.
        unique_permnos (list): Sorted permnos present in ``final_data``.
        global_max_assets (int): ``len(unique_permnos)``; size of the asset axis.
        permno_to_idx (dict): permno -> position on the asset axis.
        sequences (Tensor): ``(windows, assets, lookback, features)`` float32.
        future_returns (Tensor): ``(windows, assets, lookback)`` float32.
        masks (Tensor): ``(windows, assets)`` bool, True = asset is usable.
    """

    # Columns copied, in this order, into the last axis of ``sequences``.
    # NOTE(review): 'permno' is an identifier yet is fed to the model as a
    # numeric feature — confirm this is intended.
    _FEATURE_COLUMNS = ('permno', 'ret', 'prc', 'vol', 'mktcap', 'saleq')

    def __init__(self, start_year=2014, end_year=2020, final_year=2016, lookback=12, G=2):
        super().__init__()
        self.lookback = lookback
        self.G = G
        self.merged, self.final_data = self._load_wrds_data(start_year, end_year, final_year)
        self.unique_permnos = sorted(self.final_data['permno'].unique())
        self.global_max_assets = len(self.unique_permnos)
        self.permno_to_idx = {permno: idx for idx, permno in enumerate(self.unique_permnos)}
        self.sequences, self.future_returns, self.masks = self._create_sequences()

    def _load_wrds_data(self, start_year, end_year, final_year):
        """Query WRDS and assemble the price/fundamentals panel.

        Uses the module-level ``db`` connection. For every year the 50
        permnos with the largest peak market cap are added to a cumulative
        list, and that year's CRSP rows are kept for every permno on the list
        so far. Compustat quarterly sales are then attached with a backward
        as-of join on the release date (``rdq``), so a row only sees sales
        released on or before its own date.

        Args:
            start_year (int): First year to query.
            end_year (int): Last year to query (inclusive).
            final_year (int): Lower bound on ``year`` for the second return value.

        Returns:
            tuple: ``(merged, final_data)`` where ``final_data`` is the subset
            of ``merged`` with ``year >= final_year``.

        Raises:
            ValueError: If ``start_year`` is greater than ``end_year``.
        """
        if start_year > end_year:
            raise ValueError(
                f"start_year ({start_year}) must not exceed end_year ({end_year})"
            )

        # The name table does not depend on the year, so it is fetched once
        # instead of once per loop iteration.
        query_ticker = """
                SELECT permno, namedt, nameenddt, ticker
                FROM crsp.stocknames
            """
        stocknames = db.raw_sql(query_ticker)
        # NOTE(review): keeps the first name row per permno and ignores
        # namedt/nameenddt, so the ticker is not date-accurate.
        tickers = stocknames.drop_duplicates(subset=['permno'])

        permno_list = []
        yearly_frames = []

        for year in range(start_year, end_year + 1):
            start_date = f'{year}-01-01'
            end_date = f'{year}-12-31'

            crsp_query = f"""
                SELECT a.permno, a.date, a.ret, a.prc, a.shrout, 
                    a.vol, a.cfacshr, a.altprc, a.retx
                FROM crsp.msf AS a
                WHERE a.date BETWEEN '{start_date}' AND '{end_date}'
                AND a.permno IN (
                    SELECT permno FROM crsp.msenames 
                    WHERE exchcd BETWEEN 1 AND 3  
                        AND shrcd IN (10, 11)       
                    )
                """
            crsp_data = db.raw_sql(crsp_query)

            crsp_data = crsp_data.merge(tickers, on='permno', how='left')
            crsp_data = crsp_data.dropna(subset=['ticker'])

            # prc is wrapped in abs() because it may be stored negative.
            crsp_data['mktcap'] = (crsp_data['prc'].abs() * crsp_data['shrout'] * 1000) / 1e6  # In millions
            crsp_data['year'] = pd.to_datetime(crsp_data['date']).dt.year
            crsp_data = crsp_data.dropna(subset=['mktcap'])

            top_50_permnos_by_year = (
                crsp_data.groupby('permno')['mktcap']
                .agg(['max'])
                .reset_index()
                .sort_values(by='max', ascending=False)
                .head(50)['permno']
                .unique()
            )
            # The list is cumulative: a permno that first enters the top 50 in
            # a later year contributes rows only from that year onward.
            permno_list.extend(top_50_permnos_by_year)

            yearly_frames.append(crsp_data[crsp_data['permno'].isin(permno_list)])

        # One concat at the end instead of growing a frame inside the loop.
        combined_data = pd.concat(yearly_frames, axis=0)
        combined_data = combined_data[
            ['permno', 'ticker', 'date', 'ret', 'prc', 'shrout', 'vol', 'mktcap', 'year']
        ].copy()
        combined_data['date'] = pd.to_datetime(combined_data['date'])

        start_date = f'{start_year}-01-01'
        end_date = f'{end_year}-12-31'

        # Query Compustat quarterly data with release dates (rdq)
        fund_query = f"""
            SELECT gvkey, datadate, rdq, saleq
            FROM comp.fundq
            WHERE indfmt = 'INDL' AND datafmt = 'STD' AND popsrc = 'D' AND consol = 'C'
            AND datadate BETWEEN '{start_date}' AND '{end_date}'
            AND rdq IS NOT NULL
        """
        fund = db.raw_sql(fund_query)
        fund['rdq'] = pd.to_datetime(fund['rdq'])
        fund['datadate'] = pd.to_datetime(fund['datadate'])

        # Link Compustat GVKEY to CRSP PERMNO
        # NOTE(review): linkdt/linkenddt are selected but never applied, so a
        # gvkey with several link rows yields one fundamentals row per link.
        link_query = """
            SELECT lpermno AS permno, gvkey, linkdt, linkenddt
            FROM crsp.ccmxpf_linktable
            WHERE linktype IN ('LU', 'LC') AND linkprim IN ('P', 'C')
        """
        link = db.raw_sql(link_query)
        fund = pd.merge(fund, link, on='gvkey', how='left')
        fund = fund.dropna(subset=['permno'])

        # merge_asof requires both sides sorted on their join keys.
        combined_data_sorted = combined_data.sort_values('date')
        fund_sorted = fund.sort_values('rdq')
        fund_sorted['permno'] = fund_sorted['permno'].astype(int)

        merged = pd.merge_asof(
            combined_data_sorted,
            fund_sorted,
            left_on='date',
            right_on='rdq',
            by='permno',
            direction='backward'
        )
        merged = merged.sort_values(by='date')
        merged = merged[
            ['permno', 'ticker', 'date', 'ret', 'prc', 'vol', 'mktcap', 'gvkey', 'rdq', 'saleq']
        ].copy()

        # Forward-fill inside each permno only. Rows of different assets are
        # interleaved after the date sort, so a frame-wide ffill would copy one
        # firm's values into another firm's gaps.
        fill_cols = [col for col in merged.columns if col != 'permno']
        merged[fill_cols] = merged.groupby('permno', sort=False)[fill_cols].ffill()

        unique_dates = merged['date'].unique()
        date_mapping = {date: i for i, date in enumerate(sorted(unique_dates))}
        merged['date_mapped'] = merged['date'].map(date_mapping)

        merged['year'] = pd.to_datetime(merged['date']).dt.year
        final_data = merged[merged['year'] >= final_year]

        return merged, final_data

    def _create_sequences(self):
        """Slice ``final_data`` into overlapping history/future windows.

        Window ``w`` uses sorted unique dates ``[w, w + lookback)`` as history
        and ``[w + lookback, w + 2 * lookback)`` as the future. An asset is
        marked valid only when it has exactly ``lookback`` rows in each half
        and every feature and future return is finite; otherwise its slot
        stays all-zero with mask False.

        Returns:
            tuple: ``(sequences, future_returns, masks)`` tensors of shape
            ``(W, N, lookback, F)``, ``(W, N, lookback)`` and ``(W, N)``, where
            ``W`` is the number of windows (0 if there are too few dates),
            ``N`` is ``global_max_assets`` and ``F`` the number of features.
        """
        data = self.final_data
        lookback = self.lookback
        feature_cols = list(self._FEATURE_COLUMNS)
        num_assets = self.global_max_assets

        unique_dates_sorted = np.sort(pd.to_datetime(data['date'].unique()))
        # Clamp at zero so a short panel yields correctly-shaped empty tensors.
        num_windows = max(0, len(unique_dates_sorted) - 2 * lookback + 1)

        sequences = np.zeros((num_windows, num_assets, lookback, len(feature_cols)))
        future_returns = np.zeros((num_windows, num_assets, lookback))
        masks = np.zeros((num_windows, num_assets), dtype=bool)

        # Split once per asset (date-ordered) so each window filters a small
        # per-asset frame instead of re-scanning the whole panel.
        per_asset = {
            permno: rows.sort_values('date')
            for permno, rows in data.groupby('permno', sort=False)
        }
        log = logging.getLogger(__name__)

        for start_idx in tqdm(range(num_windows)):
            hist_start = unique_dates_sorted[start_idx]
            hist_end = unique_dates_sorted[start_idx + lookback - 1]
            future_start = unique_dates_sorted[start_idx + lookback]
            future_end = unique_dates_sorted[start_idx + 2 * lookback - 1]

            # Debug-level so the progress bar is not flooded by default.
            log.debug('Hist start: %s, Hist end: %s, Future start: %s, Future end: %s',
                      hist_start, hist_end, future_start, future_end)

            for permno in self.unique_permnos:
                rows = per_asset.get(permno)
                if rows is None:
                    continue
                dates = rows['date']

                hist = rows.loc[(dates >= hist_start) & (dates <= hist_end), feature_cols]
                future = rows.loc[(dates >= future_start) & (dates <= future_end), 'ret']

                # Both halves must be complete.
                if len(hist) != lookback or len(future) != lookback:
                    continue

                features = hist.to_numpy(dtype=np.float64, na_value=np.nan)
                targets = future.to_numpy(dtype=np.float64, na_value=np.nan)

                # A single NaN/inf would propagate through the model, so such
                # assets are treated as unavailable for this window.
                if not (np.isfinite(features).all() and np.isfinite(targets).all()):
                    continue

                idx = self.permno_to_idx[permno]
                sequences[start_idx, idx] = features
                future_returns[start_idx, idx] = targets
                masks[start_idx, idx] = True

        # Convert to tensors
        sequences_tensor = torch.tensor(sequences, dtype=torch.float32)
        future_returns_tensor = torch.tensor(future_returns, dtype=torch.float32)
        masks_tensor = torch.tensor(masks, dtype=torch.bool)

        return sequences_tensor, future_returns_tensor, masks_tensor

    def __len__(self):
        """Return the number of windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequences, future_returns, mask)`` for window ``idx``."""
        return self.sequences[idx], self.future_returns[idx], self.masks[idx]

WRDS recommends setting up a .pgpass file.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [40]:
# ---------------------------
# Positional Encoding Module
# ---------------------------
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence, then dropout.

    Even embedding channels carry ``sin(pos * w_k)`` and odd channels carry
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``. The table is
    stored as a non-trainable buffer named ``pe`` of shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        """
        Args:
            d_model (int): The dimensionality of the embeddings (odd values
                are supported).
            dropout (float): Dropout probability.
            max_len (int): Maximum expected sequence length.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd channel than even channel,
        # so the cosine block is trimmed to the number of odd slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: Tensor of shape (seq_len, batch_size, d_model).

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` given at
                construction.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

# -----------------------------------------
# Transformer Encoder–Based AlphaPortfolio Model
# -----------------------------------------
class TransformerAlphaPortfolio(nn.Module):
    """Scores each asset from its own feature history with a Transformer encoder.

    Every asset's ``(lookback, num_features)`` history is embedded, positionally
    encoded, passed through the encoder as an independent sequence, flattened
    over time and projected to ``output_dim`` values.
    """

    def __init__(self, num_features, lookback = 12, d_model=256, nhead=4, num_layers=2, output_dim=1, dropout=0.1):
        """
        Args:
            num_features (int): Number of input features per time step (e.g., 6).
            lookback (int): Number of time steps per asset; fixes the input
                width of the final linear layer, so inputs must match it.
            d_model (int): The dimension of the embedding (model dimension).
            nhead (int): Number of heads in the multihead attention mechanism.
            num_layers (int): Number of Transformer encoder layers.
            output_dim (int): Dimension of the model's final output per asset.
            dropout (float): Dropout probability.
        """
        super().__init__()
        self.lookback = lookback
        self.d_model = d_model

        # 1. Embed the raw features into d_model dimensions.
        self.embedding = nn.Linear(num_features, d_model)

        # 2. Positional encoding is applied along the time (lookback) dimension.
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)

        # 3. Transformer encoder layers (sequence-first layout).
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=4 * d_model,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # 4. Projects the concatenated time-step embeddings of one asset.
        self.fc = nn.Linear(lookback * d_model, output_dim)

    def forward(self, x, asset_mask=None, time_mask=None):
        """
        Args:
            x: Tensor of shape (batch_size, global_max_assets, lookback, num_features).
            asset_mask (optional): Bool tensor of shape (batch_size, global_max_assets)
                                indicating valid assets. (True = valid)
            time_mask (optional): Bool tensor of shape (batch_size * global_max_assets, lookback)
                                to mask time steps (True = "should be ignored/padded").

        Returns:
            out: Tensor of shape (batch_size, global_max_assets, output_dim).
                Entries of assets flagged invalid by ``asset_mask`` are exactly 0.

        Raises:
            ValueError: If the lookback dimension of ``x`` differs from the
                ``lookback`` the model was built with.
        """
        batch_size, num_assets, lookback, _ = x.size()
        if lookback != self.lookback:
            raise ValueError(
                f"expected lookback {self.lookback} on dim 2 of x, got {lookback}"
            )

        # (B, N, L, F) -> (B, N, L, d_model) -> (B*N, L, d_model):
        # each asset becomes its own sequence in the encoder's batch.
        x = self.embedding(x)
        x = x.reshape(batch_size * num_assets, lookback, self.d_model)

        # The encoder is sequence-first: (L, B*N, d_model).
        x = x.transpose(0, 1)
        x = self.pos_encoder(x)

        # True entries of time_mask are ignored by attention.
        encoded_seq = self.transformer_encoder(x, src_key_padding_mask=time_mask)

        # (L, B*N, d_model) -> (B*N, L, d_model) -> (B*N, L*d_model)
        encoded_seq = encoded_seq.transpose(0, 1)
        asset_repr = encoded_seq.reshape(batch_size * num_assets, lookback * self.d_model)

        # (B*N, output_dim) -> (B, N, output_dim)
        out = self.fc(asset_repr).view(batch_size, num_assets, -1)

        if asset_mask is not None:
            # Overwrite instead of multiplying by the mask: NaN * 0 is still
            # NaN, so multiplication would let a bad padded row leak through.
            out = out.masked_fill(~asset_mask.bool().unsqueeze(-1), 0.0)

        return out

In [None]:
# Build the dataset. The constructor runs the WRDS queries through the
# module-level `db` connection and materialises every rolling window up front,
# so this cell does all of the data loading work.
data = AlphaPortfolioData(start_year=2014, end_year=2020, final_year=2016, lookback=12, G=2)

In [11]:
# Pull the three pre-built tensors off the dataset for direct use below.
sequences, future_returns, masks = (
    data.sequences,
    data.future_returns,
    data.masks,
)

In [None]:
# Shapes are (windows, assets, lookback, features), (windows, assets, lookback)
# and (windows, assets) respectively.
sequences.shape, future_returns.shape, masks.shape

(torch.Size([37, 69, 12, 6]), torch.Size([37, 69, 12]), torch.Size([37, 69]))

In [41]:
# Dimensions taken from the prepared tensors / dataset.
batch_size = sequences.shape[0]
num_features = sequences.shape[-1]
global_max_assets = data.global_max_assets
lookback = data.lookback

# Model hyperparameters.
d_model = 256
nhead = 4
num_layers = 1
dropout = 0.2
output_dim = 1

# Build the encoder and run every window through it in one forward pass.
model = TransformerAlphaPortfolio(
    num_features=num_features,
    lookback=lookback,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    output_dim=output_dim,
    dropout=dropout,
)
te_output = model(sequences, asset_mask=masks, time_mask=None)



In [None]:
# Shape of the model output: (windows, assets, output_dim).
# NOTE(review): the output stored under this cell is the full tensor rather
# than a shape, so it appears to come from an earlier version of the cell —
# re-run to refresh.
te_output.shape

tensor([[[-0.6860],
         [-1.1090],
         [-0.6875],
         ...,
         [-0.4040],
         [-0.7860],
         [-0.0000]],

        [[-1.1120],
         [-1.1664],
         [-1.2521],
         ...,
         [-0.3522],
         [-0.6852],
         [-0.0000]],

        [[-1.1199],
         [-0.5493],
         [-1.3024],
         ...,
         [-0.4405],
         [-0.5125],
         [-0.0000]],

        ...,

        [[-0.7586],
         [-0.4089],
         [-0.0000],
         ...,
         [-1.1593],
         [-0.4148],
         [-0.0000]],

        [[-0.6823],
         [-0.4833],
         [ 0.0000],
         ...,
         [-0.8301],
         [-0.7472],
         [-0.0000]],

        [[-1.4638],
         [-0.8607],
         [ 0.0000],
         ...,
         [-0.3784],
         [-0.7661],
         [ 0.0000]]], grad_fn=<MulBackward0>)