In [1]:
import sys
sys.path.append('..')

In [2]:
import sentence_embedding_evaluation_german as seeg
import torch
import numpy as np
from torch_emb2vec import ConvToVec
from typing import List

## (1) Instantiate your Embedding model
First, you should load your pretrained embedding.

Here we will generate a random embedding for demonstration purposes.

In [3]:
# generate a random embedding
emb_dim = 256
vocab_sz = 128
emb = torch.randn((vocab_sz, emb_dim), requires_grad=False)
emb = torch.nn.Embedding.from_pretrained(emb)
# assert emb.weight.requires_grad == False

## (2a) Specify the preprocessing
The `preprocessor` function converts a sentences as string into embedding vectors of numbers.

Here we will convert the input strings with a nonsensical approach into IDs for the Embedding layer.

In [4]:
SEQLEN = 128
padid = 128

def preprocesser(batch: List[str], params: dict=None) -> List[List[List[float]]]:
    """ Specify your embedding or pretrained encoder here
    Paramters:
    ----------
    params : dict
        The params dictionary
    batch : List[str]
        A list of sentence as string
    Returns:
    --------
    List[List[List[float]]]
        A list of embedded sequences
    """
    features = []
    for sent in batch:
        # encode
        try:
            ids = torch.tensor([ord(c) % 128 for c in sent])
        except:
            print(sent)
        enc = emb(ids)
        # truncate & pad
        h = torch.ones(size=(SEQLEN, enc.shape[1]), dtype=torch.int64) * padid
        end = min(enc.shape[0], SEQLEN)
        try:
            h[:end, :] = enc[:end, :].detach()
        except Exception as e:
            raise Exception(e)
        features.append(h)
    features = torch.stack(features, dim=0)
    features = features.type(torch.float32)
    return features.detach()

## (2b) Specify a Customer Classifier

In [5]:
class CustomClassiferModel(torch.nn.Module):
    def __init__(self,
                 embdim: int,
                 output_size: int,  # nclasses
                 *args, **kwargs):
        super(CustomClassiferModel, self).__init__(*args, **kwargs)
        # Self-MHA layer (=> this will overfit bcoz tiny datasets!)
        #self.mha_pre = torch.nn.LayerNorm(embdim, elementwise_affine=True)
        #self.mha_net = torch.nn.MultiheadAttention(
        #    embed_dim=embdim, num_heads=8, batch_first=True, bias=False)
        # 2D to 1D flattening
        self.to1d_pre = torch.nn.LayerNorm(embdim, elementwise_affine=True)
        # compute kernel size and output channel dim
        self.to1d_net = ConvToVec(
            seq_len=SEQLEN, emb_dim=embdim, num_output=256, trainable=False)
        self.to1d_act = torch.nn.GELU()
        # Final layer
        self.final_pre = torch.nn.LayerNorm(256, elementwise_affine=True)
        self.final_net = torch.nn.Linear(256, output_size, bias=False)
        self.final_act = torch.nn.Softmax(dim=1)
        # init params
        self._reset_parameters()

    def _reset_parameters(self) -> None:
        torch.manual_seed(42)
        # Self-MHA projections
        #for param in self.mha_net.parameters():
        #    torch.nn.init.xavier_normal_(param, gain=1.0)
        # Final layer
        torch.nn.init.xavier_normal_(self.final_net.weight, gain=1.0)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        # Self-MHA layer
        #print("Inputs", inputs.shape)
        h = inputs
        #h = self.mha_pre(h)
        #h, _ = self.mha_net(query=h, value=h, key=h)
        #h = h + inputs  # skip-conn
        # print("MHA", h.shape)
        # 2D to 1D flattening
        h = self.to1d_pre(h)
        h = self.to1d_net(h)
        h = self.to1d_act(h)
        #print("Flatten", h.shape)
        # Final layer
        h = self.final_pre(h)
        h = self.final_net(h)
        #print("Final", h.shape)
        return self.final_act(h)


def mymodel(**kwargs):
    return CustomClassiferModel(
        embdim=kwargs['n_features'],
        output_size=kwargs['n_classes'])


In [6]:
model = mymodel(n_features=emb_dim, n_classes=3)
n = sum([m.numel() for m in model.parameters()])
print(f"Num params: {n}")

Num params: 68352


## (3) Training settings

In [7]:
params = {
    'datafolder': '../datasets',
    'bias': True,
    'balanced': True,
    'batch_size': 128, 
    'num_epochs': 5,  # Default: 500
    # 'early_stopping': True,
    # 'split_ratio': 0.2,  # if early_stopping=True
    # 'patience': 5,  # if early_stopping=True
}

## (4) Specify downstream tasks

In [8]:
# All
# downstream_tasks = [
#     'TOXIC', 'ENGAGE', 'FCLAIM', 'VMWE',
#     'OL19-A', 'OL19-B', 'OL19-C',
#     'OL18-A', 'OL18-B', 
#     'ABSD-1', 'ABSD-2', 'ABSD-3',
#     'MIO-S', 'MIO-O', 'MIO-I', 'MIO-D', 'MIO-F', 'MIO-P', 'MIO-A',
#     'SBCH-L', 'SBCH-S', 'ARCHI', 'LSDC'
# ]

# Group tasks
# downstream_tasks = [
#     'ABSD-2', 'MIO-S', 'SBCH-S',  # Sentiment analysis
#     'ENGAGE', 'MIO-P',  # engaging/personal
#     'FCLAIM', 'MIO-A',  # fact-claim (potential fake news), argumentative, reasoning
#     'TOXIC', 'OL19-A', 'OL19-B', 'OL19-C', 'MIO-O', 'MIO-I',  # toxic
# ]

# Current favorites
downstream_tasks = ['FCLAIM', 'VMWE', 'OL19-C', 'ABSD-2', 'MIO-P', 'ARCHI', 'LSDC']
#downstream_tasks = ['FCLAIM', 'VMWE', 'OL19-C', 'ABSD-2', 'MIO-P', 'ARCHI']
#downstream_tasks = ['FCLAIM']

## (5) Run experiments

In [9]:
%%time
results = seeg.evaluate(
    downstream_tasks=downstream_tasks, 
    preprocesser=preprocesser, 
    modelbuilder=mymodel,
    verbose=1,
    **params
)

Load dataset: FCLAIM
train:    3244 examples,  256 features
 test:     944 examples,  256 features
epoch 1 | loss: 0.7135430368093344
epoch 2 | loss: 0.7014878277595227
epoch 3 | loss: 0.6915632234169886
epoch 4 | loss: 0.6831112091357892
epoch 5 | loss: 0.6759548943776351
Load dataset: VMWE
train:    6652 examples,  256 features
 test:    1447 examples,  256 features
epoch 1 | loss: 0.7347636016515585
epoch 2 | loss: 0.7263364998193887
epoch 3 | loss: 0.7193957028480676
epoch 4 | loss: 0.7130806021965467
epoch 5 | loss: 0.7075265336495179
Load dataset: OL19-C
train:    1921 examples,  256 features
 test:     930 examples,  256 features
epoch 1 | loss: 0.7554306797683239
epoch 2 | loss: 0.7453916557133198
epoch 3 | loss: 0.734691996127367
epoch 4 | loss: 0.7224976681172848
epoch 5 | loss: 0.7112742327153683
Load dataset: ABSD-2
train:   19432 examples,  256 features
 test:    2555 examples,  256 features
epoch 1 | loss: 1.1387023000340712
epoch 2 | loss: 1.1301173207006956
epoch 3 | lo

In [10]:
import json
dat = json.dumps(results, indent=2)
# print(dat)

## (6) Display results

In [11]:
print("Task | Epochs | N train | N test")
for res in results:
    print(f"{res['task']:>7s}: {res['epochs']:5d} {res['train']['num']:6d} {res['test']['num']:6d}")

Task | Epochs | N train | N test
 FCLAIM:     5   3244    944
   VMWE:     5   6652   1447
 OL19-C:     5   1921    930
 ABSD-2:     5  19432   2555
  MIO-P:     5   4668   4668
  ARCHI:     5  18809   4743
   LSDC:     5  74140   8602


In [12]:
metric = 'acc'  # 'f1', 'f1-balanced', 'acc', 'acc-balanced'
print('  Task | train | test')
for res in results:
    print(f"{res['task']:>7s}: {res['train'][metric]:6.3f} {res['test'][metric]:6.3f}")

  Task | train | test
 FCLAIM:  0.623  0.595
   VMWE:  0.617  0.610
 OL19-C:  0.578  0.541
 ABSD-2:  0.404  0.362
  MIO-P:  0.594  0.586
  ARCHI:  0.281  0.266
   LSDC:  0.126  0.127


In [13]:
metric = 'f1-balanced'  # 'f1', 'f1-balanced', 'acc', 'acc-balanced'
print('  Task | train | test')
for res in results:
    print(f"{res['task']:>7s}: {res['train'][metric]:6.3f} {res['test'][metric]:6.3f}")

  Task | train | test
 FCLAIM:  0.591  0.556
   VMWE:  0.500  0.493
 OL19-C:  0.470  0.453
 ABSD-2:  0.327  0.285
  MIO-P:  0.523  0.512
  ARCHI:  0.264  0.249
   LSDC:  0.079  0.091


In [14]:
# class label distributions (inference)
for res in results:
    print(res['task'], res['test']['num'])
    print(res['test']['distr-test'])
    print(res['test']['distr-pred'])

FCLAIM 944
{'0': '630', '1': '314'}
{'0': '594', '1': '350'}
VMWE 1447
{'1': '256', '0': '1191'}
{'0': '950', '1': '497'}
OL19-C 930
{'0': '796', '1': '134'}
{'1': '423', '0': '507'}
ABSD-2 2555
{'1': '1670', '0': '780', '2': '105'}
{'2': '732', '0': '807', '1': '1016'}
MIO-P 4668
{'1': '812', '0': '3856'}
{'0': '2631', '1': '2037'}
ARCHI 4743
{'2': '1177', '3': '1176', '1': '1199', '0': '1191'}
{'1': '2130', '3': '1052', '0': '1063', '2': '498'}
LSDC 8602
{'13': '280', '6': '346', '12': '940', '3': '925', '2': '944', '1': '934', '11': '931', '0': '453', '10': '511', '5': '924', '4': '65', '8': '923', '9': '83', '7': '343'}
{'11': '496', '2': '1850', '0': '749', '12': '241', '9': '362', '8': '357', '3': '285', '5': '990', '13': '288', '1': '1130', '10': '748', '4': '666', '6': '273', '7': '167'}


In [15]:
# class label distribution (training)
for res in results:
    print(res['task'], res['train']['num'])
    print(res['train']['distr-train'])
    print(res['train']['distr-pred'])

FCLAIM 3244
{'0': '2141', '1': '1103'}
{'0': '2007', '1': '1237'}
VMWE 6652
{'1': '1145', '0': '5507'}
{'1': '2291', '0': '4361'}
OL19-C 1921
{'0': '1664', '1': '257'}
{'1': '798', '0': '1123'}
ABSD-2 19432
{'2': '1179', '1': '13208', '0': '5045'}
{'2': '5311', '1': '8049', '0': '6072'}
MIO-P 4668
{'0': '3855', '1': '813'}
{'1': '2048', '0': '2620'}
ARCHI 18809
{'1': '4797', '3': '4407', '2': '4802', '0': '4803'}
{'0': '3994', '1': '9207', '3': '3633', '2': '1975'}
LSDC 74140
{'12': '15019', '8': '7829', '5': '13506', '1': '5294', '11': '13227', '3': '11002', '2': '5704', '13': '346', '10': '749', '7': '382', '9': '143', '0': '469', '6': '377', '4': '93'}
{'5': '8871', '10': '5629', '2': '19861', '9': '2998', '0': '5828', '1': '8043', '4': '5250', '11': '3864', '12': '2380', '3': '2506', '6': '2162', '8': '2988', '13': '2247', '7': '1513'}
