In [16]:
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

In [17]:
def transform_probas(probas_set: list):
    """Stack the outputs of several classifiers side by side.

    Each element of ``probas_set`` holds one classifier's output with one row
    per document. Row ``i`` of the result is the concatenation of row ``i`` of
    every classifier, in the order the classifiers appear in ``probas_set``.

    Args:
        probas_set: Non-empty list of array-likes sharing the same first
            dimension (number of documents). 2-D inputs contribute all of
            their columns; 1-D inputs contribute a single column each.

    Returns:
        A 2-D ``np.ndarray`` of shape ``(n_docs, total_columns)``.

    Raises:
        ValueError: If ``probas_set`` is empty, or if the inputs disagree on
            the number of documents (previously extra rows were silently
            dropped and missing rows surfaced as an ``IndexError``).
    """
    if len(probas_set) == 0:
        raise ValueError("probas_set must contain at least one array of probabilities.")

    arrays = [np.asarray(probas) for probas in probas_set]
    n_docs = arrays[0].shape[0]
    for clf_idx, probas in enumerate(arrays):
        if probas.shape[0] != n_docs:
            raise ValueError(
                f"Classifier {clf_idx} has {probas.shape[0]} rows, expected {n_docs}."
            )

    # One column-wise concatenation replaces the per-document Python loop and
    # also yields a well-formed (0, total_columns) result for zero documents.
    return np.column_stack(arrays)

def load_probs_fold(dataset: str, clfs: list, fold: int,
                    base_dir: str = "/home/welton/data/normal_probas/split_10"):
    """Load and stack the train/test probabilities of several classifiers.

    For every classifier in ``clfs`` the files
    ``{base_dir}/{dataset}/10_folds/{clf}/{fold}/train.npz`` (key ``X_train``)
    and ``.../test.npz`` (key ``X_test``) are read. The per-classifier arrays
    are then concatenated column-wise with ``transform_probas``.

    Args:
        dataset: Dataset directory name.
        clfs: Classifier directory names, in the column order wanted.
        fold: Fold directory name/index.
        base_dir: Root directory of the stored probabilities. Defaults to the
            location that used to be hard-coded, so existing calls are
            unaffected.

    Returns:
        Tuple ``(train_features, test_features)`` as produced by
        ``transform_probas``.
    """
    clfs_probs_train = []
    clfs_probs_test = []
    for clf in clfs:
        probs_dir = f"{base_dir}/{dataset}/10_folds/{clf}/{fold}"
        # np.load on an .npz returns a lazy archive holding an open file
        # handle; the context manager closes it once the array has been read.
        with np.load(f"{probs_dir}/train.npz") as train_archive:
            clfs_probs_train.append(train_archive["X_train"])
        with np.load(f"{probs_dir}/test.npz") as test_archive:
            clfs_probs_test.append(test_archive["X_test"])

    return transform_probas(clfs_probs_train), transform_probas(clfs_probs_test)

def load_labels_fold(dataset: str, fold: int,
                     base_dir: str = "/home/welton/data/datasets/labels/split_10"):
    """Load the train and test label arrays of one fold.

    Reads ``{base_dir}/{dataset}/{fold}/train.npy`` and
    ``{base_dir}/{dataset}/{fold}/test.npy``.

    Args:
        dataset: Dataset directory name.
        fold: Fold directory name/index.
        base_dir: Root directory of the stored labels. Defaults to the
            location that used to be hard-coded, so existing calls are
            unaffected.

    Returns:
        Tuple ``(y_train, y_test)`` of the arrays stored in the two files.
    """
    fold_dir = f"{base_dir}/{dataset}/{fold}"
    y_train = np.load(f"{fold_dir}/train.npy")
    y_test = np.load(f"{fold_dir}/test.npy")
    return y_train, y_test


In [18]:
class StackingDataset(Dataset):
    """Map-style dataset pairing stacked classifier outputs with their labels."""

    def __init__(self, X, y) -> None:
        """Keep references to the feature rows ``X`` and the targets ``y``."""
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the length of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with tensor entries ``"x"`` and ``"y"``."""
        features = torch.tensor(self.X[idx])
        target = torch.tensor(self.y[idx])
        return {"x": features, "y": target}

class MultiHeadAttentionEncoder(pl.LightningModule):
    """Self-attention over a sequence of vectors, returned as one flat vector.

    The input is projected by three separate linear layers into key, query and
    value tensors, which are fed to ``nn.MultiheadAttention``. The attended
    sequence is then flattened per sample.

    Args:
        hidden: Size of each vector in the sequence (the attention
            ``embed_dim``); must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, hidden, num_heads, dropout):
        super().__init__()

        self.key_fnn = nn.Linear(hidden, hidden)
        self.query_fnn = nn.Linear(hidden, hidden)
        self.value_fnn = nn.Linear(hidden, hidden)

        self.multihead_att = torch.nn.MultiheadAttention(
            embed_dim=hidden,
            num_heads=num_heads,
            dropout=dropout
        )

    def forward(self, x):
        """Attend over the sequence and flatten the result per sample.

        Args:
            x: Batch-first tensor of shape ``(batch, seq_len, hidden)``.

        Returns:
            Tensor of shape ``(batch, seq_len * hidden)``.
        """
        # nn.MultiheadAttention is sequence-first by default (batch_first=False),
        # so move the sequence axis to the front before attending.
        x = torch.transpose(x, 0, 1)
        key = self.key_fnn(x)
        query = self.query_fnn(x)
        value = self.value_fnn(x)
        attn_output, _ = self.multihead_att(query, key, value)

        # Restore the batch-first layout and merge (seq_len, hidden) into one
        # axis. The former `flatten(-1)` was a no-op and left the result
        # sequence-first with three dimensions.
        return attn_output.transpose(0, 1).flatten(start_dim=1)
    
class MetaLayer(pl.LightningModule):
    """Meta-classifier trained on the stacked outputs of base classifiers.

    A single linear layer maps the ``classes_number * clfs_number`` stacked
    features to ``classes_number`` logits, optimised with cross-entropy.

    Args:
        classes_number: Number of target classes (also the width of each base
            classifier's output).
        clfs_number: Number of stacked base classifiers.
        num_heads: Number of attention heads for the encoder.
        dropout: Dropout probability for the encoder's attention weights.
    """

    def __init__(self, classes_number, clfs_number, num_heads, dropout):

        super().__init__()

        self.encoder = MultiHeadAttentionEncoder(
            classes_number,
            num_heads,
            dropout)

        self.output_layer = nn.Linear(classes_number * clfs_number, classes_number)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return unnormalised class scores (logits) for a batch of features.

        ``nn.CrossEntropyLoss`` applies log-softmax internally, so the logits
        are returned as is; use ``torch.softmax(logits, dim=-1)`` when
        probabilities are needed. The previous ``nn.Softmax(x)`` built a
        module object rather than a tensor.
        """
        # NOTE(review): `self.encoder` is constructed but, as in the original
        # code, not applied here -- confirm whether it should be wired in.
        return self.output_layer(x)

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a learning rate of 1e-3."""
        return torch.optim.AdamW(self.parameters(), lr=1e-3)

    def _unpack_batch(self, batch):
        """Extract ``(features, targets)`` from a dict or sequence batch."""
        if isinstance(batch, dict):
            # Unpacking a dict directly would yield its keys, not the tensors.
            x, y = batch["x"], batch["y"]
        else:
            x, y = batch
        # Align features with the layer's parameter dtype; CrossEntropyLoss
        # expects integer (long) class indices as targets.
        return x.to(self.output_layer.weight.dtype), y.long()

    def _compute_loss(self, batch):
        """Run the forward pass on ``batch`` and return the cross-entropy loss."""
        x, y = self._unpack_batch(batch)
        y_hat = self(x)
        return self.criterion(y_hat, y)

    def training_step(self, batch, batch_idx=None):
        """Compute and log the epoch-aggregated training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx=None):
        """Compute and log the epoch-aggregated validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log_dict({'val_loss': loss}, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return loss

In [4]:
# Labels of fold 0 of the "20ng" dataset (train and test partitions).
labels_train, y_test = load_labels_fold(dataset="20ng", fold=0)

In [5]:
# Stacked outputs of the three base classifiers for the same dataset and fold.
probas_train, X_test = load_probs_fold(
    dataset="20ng",
    clfs=["bert", "xlnet", "ktmk"],
    fold=0,
)

In [6]:
# Sanity check: display the type of a single row of the stacked training matrix.
type(probas_train[0])

numpy.ndarray

In [7]:
# Dimensions of the stacked training matrix before the train/validation split.
probas_train.shape

(16954, 60)

In [8]:
# Hold out 10% of the training fold for validation, keeping the class
# proportions (stratified) and a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    probas_train,
    labels_train,
    test_size=0.1,
    random_state=42,
    stratify=labels_train,
)

In [15]:
# Persist every split in a single uncompressed archive.
np.savez(
    "data/data.npz",
    X_train=X_train,
    X_test=X_test,
    X_val=X_val,
    y_train=y_train,
    y_test=y_test,
    y_val=y_val,
)

In [9]:
# Dimensions of the training features after the validation rows were held out.
X_train.shape

(15258, 60)

In [10]:
# Number of stacked base classifiers (three names are passed to load_probs_fold).
clfs_number = 3
# Number of target classes; presumably 60 stacked columns / 3 classifiers -- confirm.
classes_number = 20
# Upper bound on training epochs handed to the Lightning trainer.
max_epochs = 30
# Mini-batch size shared by the train, validation and test loaders.
batch_size = 32

In [11]:
# Reshuffle the training samples every epoch so the mini-batches differ between
# epochs; evaluation loaders keep a fixed order.
train_loader = DataLoader(dataset=StackingDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=StackingDataset(X_val, y_val), batch_size=batch_size)
test_loader = DataLoader(dataset=StackingDataset(X_test, y_test), batch_size=batch_size)

In [12]:
# Peek at the first mini-batch only: print the shapes of its feature and label
# tensors, then stop iterating.
for batch in train_loader:
    print(batch['x'].shape, batch['y'].shape)
    break

torch.Size([32, 60]) torch.Size([32])


In [13]:
# Meta-classifier with a single attention head and 10% attention dropout.
meta_layer = MetaLayer(
    classes_number=classes_number,
    clfs_number=clfs_number,
    num_heads=1,
    dropout=0.1,
)
# "auto" selects an available accelerator (e.g. a GPU) and falls back to CPU
# otherwise, so the notebook also runs on machines without CUDA.
trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=max_epochs)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
# Train the meta-classifier, evaluating on the held-out validation split.
trainer.fit(
    model=meta_layer,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                      | Params
-----------------------------------------------------------
0 | encoder      | MultiHeadAttentionEncoder | 2.9 K 
1 | output_layer | Linear                    | 1.2 K 
2 | criterion    | CrossEntropyLoss          | 0     
-----------------------------------------------------------
4.2 K     Trainable params
0         Non-trainable params
4.2 K     Total params
0.017     Total estimated model params size (MB)


TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates