In [4]:
from sentence_transformers import SentenceTransformer, models, losses, InputExample
from torch import nn, Tensor
from typing import List, Union, Iterable, Dict
import torch
from torch.utils.data import DataLoader

import dataclasses
from pathlib import Path
from typing import List
import pandas as pd
from itertools import count
import typer
from enum import Enum
from typing import Optional
from collections import defaultdict
import torch
import numpy as np
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, InputExample, losses
from dos.evaluator import CorrelationEvaluator
from dos.dataset import SemEvalDataset, ArticlePair
from torch.utils.data import DataLoader

from dos.multitask_evaluator import MultitaskCorrelationEvaluator

In [18]:
from torch import Tensor
from torch import nn
from typing import Dict
import torch.nn.functional as F

class ReshapeAndNormalize(nn.Module):
    """
    Splits every sentence embedding into ``num_labels`` equally sized
    sub-embeddings and normalizes each sub-embedding to unit length.

    The 2-D tensor stored under ``features['sentence_embedding']`` with shape
    ``(batch_size, num_labels * dim)`` is replaced by a 3-D tensor of shape
    ``(batch_size, num_labels, dim)`` whose last axis has L2 norm 1.

    :param num_labels
        number of sub-embeddings every sentence embedding is split into
    """
    # File (inside the module directory) that stores the constructor arguments.
    CONFIG_NAME = 'config.json'

    def __init__(self, num_labels: int):
        super(ReshapeAndNormalize, self).__init__()
        self.num_labels = num_labels

    def forward(self, features: Dict[str, Tensor]):
        """
        Reshapes and normalizes ``features['sentence_embedding']`` in place.

        :param features
            feature dict holding a 2-D ``'sentence_embedding'`` tensor
        :return
            the same dict, with ``'sentence_embedding'`` replaced
        """
        batch_size, _ = features['sentence_embedding'].shape
        # -1 lets torch infer the per-label width; reshape raises if the
        # embedding width is not divisible by num_labels.
        features.update({'sentence_embedding': features['sentence_embedding'].reshape(batch_size, self.num_labels, -1)})
        # dim=2 is the per-label embedding axis, so each sub-embedding is normalized on its own.
        features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=2)})
        return features

    def save(self, output_path):
        """
        Writes the constructor arguments to ``<output_path>/config.json`` so
        that ``load`` can rebuild an equivalent layer.

        :param output_path
            directory the configuration file is written to (created if missing)
        """
        import json
        import os

        os.makedirs(output_path, exist_ok=True)
        with open(os.path.join(output_path, ReshapeAndNormalize.CONFIG_NAME), 'w') as config_file:
            json.dump({'num_labels': self.num_labels}, config_file, indent=2)

    @staticmethod
    def load(input_path):
        """
        Rebuilds the layer from the configuration written by ``save``.

        :param input_path
            directory that contains ``config.json``
        :return
            a ``ReshapeAndNormalize`` instance with the stored ``num_labels``
        """
        import json
        import os

        with open(os.path.join(input_path, ReshapeAndNormalize.CONFIG_NAME)) as config_file:
            config = json.load(config_file)
        return ReshapeAndNormalize(num_labels=config['num_labels'])

In [19]:
# Load the pretrained "sentence-transformers/LaBSE" model; later cells extend it in place.
model = SentenceTransformer("sentence-transformers/LaBSE")

In [20]:
# Single sample sentence used to inspect the embedding shape in the cells below.
sentences = ["Tim ist toll"]

In [21]:
# Encode the sample sentence and display the embedding shape
# (the recorded output for this cell is (1, 768)).
sentence_embeddings = model.encode(sentences=sentences)
sentence_embeddings.shape

(1, 768)

In [22]:
# Attach the multi-label head: a Dense projection from 768 to 512*7 features,
# followed by ReshapeAndNormalize, which splits that output into 7 sub-embeddings.
# NOTE(review): add_module replaces any module already registered under the same
# name; presumably the stock model had a layer at "3" that is overwritten here — confirm
# this is intended.
model.add_module("3", models.Dense(in_features=768, out_features=512*7, activation_function=nn.Tanh()))
model.add_module("4", ReshapeAndNormalize(num_labels=7))
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 768, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
  (3): Dense({'in_features': 768, 'out_features': 3584, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
  (4): ReshapeAndNormalize()
)

In [23]:
# Re-encode the sample sentence with the extended model and display the shape
# (the recorded output for this cell is (1, 7, 512)).
sentence_embeddings = model.encode(sentences=sentences)
sentence_embeddings.shape

(1, 7, 512)

In [25]:
# Show the first sub-embedding of the first sentence.
# NOTE(review): this prints the whole vector; a slice would keep the output short.
sentence_embeddings[0][0]

array([-0.05088175, -0.05160476,  0.05517431,  0.011707  , -0.00567881,
        0.03671409,  0.0588317 , -0.07588776,  0.07630678,  0.01865661,
        0.01614934,  0.06369347,  0.05437566,  0.04573026, -0.01723144,
        0.00204056,  0.0681881 , -0.05109839,  0.0934725 , -0.06399766,
        0.02728841,  0.01521847,  0.01204677, -0.01487857, -0.01405284,
       -0.05299329, -0.06663571, -0.06151884, -0.01764604,  0.02136873,
        0.00298715, -0.04767498,  0.0025035 , -0.00124061, -0.01686211,
       -0.01935672,  0.02121147,  0.03551834, -0.03067654,  0.00992155,
       -0.03848063,  0.0081587 ,  0.02219091, -0.01393551,  0.04945447,
        0.10523274,  0.04523982, -0.05995491,  0.06702071,  0.0300389 ,
        0.06169705, -0.01010427, -0.03548419,  0.00346366, -0.02165526,
        0.01710977, -0.02390138, -0.02049441,  0.0579266 ,  0.10538305,
        0.00716479,  0.0168098 , -0.07584294,  0.08076446, -0.0028929 ,
        0.06764733, -0.03247941, -0.04045565, -0.00938718, -0.04

In [30]:
class InputExampleWithMultipleLabels:
    """
    Structure for one input example with texts, a list of labels and a unique id
    """
    def __init__(self, guid: str = '', texts: List[str] = None,  label: List[Union[int, float]] = 0):
        """
        Creates one example with the given texts, guid and labels


        :param guid
            id for the example
        :param texts
            the texts for the example.
        :param label
            the labels for the example, one numeric value per label dimension
        """
        self.guid = guid
        self.texts = texts
        self.label = label

    def __str__(self):
        """
        Renders labels and texts as "; "-separated lists.

        Labels are numeric, so each one is converted with ``str`` before
        joining; a scalar label (such as the default ``0``) is rendered on its
        own and missing texts are rendered as an empty list.
        """
        try:
            label_parts = [str(value) for value in self.label]
        except TypeError:
            # The label is not iterable (e.g. a plain int or float).
            label_parts = [str(self.label)]
        text_parts = self.texts if self.texts is not None else []
        return "<InputExample> labels: {}, texts: {}".format("; ".join(label_parts), "; ".join(text_parts))

In [26]:
class CosineSimilarityLossForMultipleLabels(nn.Module):
    """
    Loss over several cosine similarities per sentence pair.

    The wrapped model is expected to return one sub-embedding per label for
    every sentence. For each label index the cosine similarity between the two
    sentences' sub-embeddings is computed, passed through
    ``cos_score_transformation`` and the resulting ``(batch, num_labels)``
    matrix is compared with ``labels`` using ``loss_fct``.

    :param model
        model that is called on every sentence feature dict
    :param loss_fct
        loss applied to the similarity matrix and the labels
    :param cos_score_transformation
        transformation applied to each per-label similarity vector
    :param num_labels
        number of label dimensions that are scored
    """

    def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity(), num_labels=7):
        super().__init__()
        self.num_labels = num_labels
        self.cos_score_transformation = cos_score_transformation
        self.loss_fct = loss_fct
        self.model = model

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """
        Computes the loss for one batch.

        :param sentence_features
            one feature dict per sentence of the pair; only the first two are scored
        :param labels
            target tensor handed to ``loss_fct`` together with the similarities
        """
        # Run the model on every feature dict first, exactly once each.
        encoded = []
        for feature in sentence_features:
            encoded.append(self.model(feature)['sentence_embedding'])

        # One similarity vector of length batch_size per label dimension.
        per_label_scores = []
        for label_idx in range(self.num_labels):
            similarity = torch.cosine_similarity(encoded[0][:, label_idx, :], encoded[1][:, label_idx, :])
            per_label_scores.append(self.cos_score_transformation(similarity))

        # Stacking yields (num_labels, batch); transpose to (batch, num_labels).
        output = torch.stack(per_label_scores).T
        return self.loss_fct(output, labels)

In [27]:
def normalize_score_01(one2four: float):
    """
    Linearly rescales a score so that 1 maps to 1.0 and 4 maps to 0.0.

    :param one2four
        score on the 1-to-4 scale
    :return
        the rescaled score; values between 1 and 4 land between 1.0 and 0.0
    """
    scaled_offset = 2 * (one2four - 1) / 3
    # 1 -> +1 and 4 -> -1 before shifting into the target range.
    signed_score = 1 - scaled_offset
    return (signed_score + 1) / 2

In [28]:
def make_training_data(data: List[ArticlePair]) -> List["InputExampleWithMultipleLabels"]:
    """
    Converts article pairs into multi-label training examples.

    Every example holds the texts of both articles and one normalized score
    per label field, in the fixed order listed in ``label_fields``.

    :param data
        article pairs to convert
    :return
        one ``InputExampleWithMultipleLabels`` per pair, in input order
    """
    # The order here defines the label column order of every example.
    label_fields = ("geography", "entities", "time", "narrative", "overall", "style", "tone")
    inputs: List["InputExampleWithMultipleLabels"] = [
        InputExampleWithMultipleLabels(
            texts=[pair.article_1.text, pair.article_2.text],
            label=[normalize_score_01(getattr(pair, field)) for field in label_fields],
        )
        for pair in data
    ]
    return inputs

In [31]:
# Load the dataset from the CSV index and the article directory, split it into
# train/dev and convert the training part into multi-label examples.
# NOTE(review): presumably 0.8 is the training fraction and the split is random;
# no seed is visible here — confirm reproducibility against SemEvalDataset.random_split.
dataset = SemEvalDataset(Path("data/train.csv"), Path("data/train_data"))
train, dev = dataset.random_split(0.8)
training_inputs = make_training_data(train)

Error loading article No articles with id 1484008954
Error loading article No articles with id 1483999027
Error loading article No articles with id 1484182933
Error loading article No articles with id 1483999027
Error loading article No articles with id 1484035175
Error loading article No articles with id 1484007787
Error loading article No articles with id 1484007543
Error loading article No articles with id 1484370272
Error loading article No articles with id 1484368406
Error loading article No articles with id 1484370272
Error loading article No articles with id 1483806257
Error loading article No articles with id 1484038154
Error loading article No articles with id 1484191176
Error loading article No articles with id 1484040035
Error loading article No articles with id 1484452087
Error loading article No articles with id 1483999027
Error loading article No articles with id 1484200356
Error loading article No articles with id 1484188519
Error loading article No articles with id 1484

In [32]:
# Shuffled batches of 8 training examples, the multi-label cosine loss over the
# 7 label dimensions, and an evaluator built from the dev split.
# NOTE(review): MultitaskCorrelationEvaluator is imported at the top but not used;
# confirm CorrelationEvaluator can handle the model's multi-label embeddings.
dataloader = DataLoader(training_inputs, shuffle=True, batch_size=8)
loss = CosineSimilarityLossForMultipleLabels(model, num_labels=7)
evaluator = CorrelationEvaluator(dev)

In [33]:
# Train for one epoch with 100 warmup steps and automatic mixed precision,
# evaluating with `evaluator` and writing results to "models".
# NOTE(review): the recorded run of this cell stopped with a CUDA OutOfMemoryError;
# a smaller DataLoader batch_size or a shorter model.max_seq_length may be needed — verify.
model.fit(
    train_objectives=[(dataloader, loss)],
    epochs=1,
    warmup_steps=100,
    evaluator=evaluator,
    use_amp=True,
    output_path="models",
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/492 [00:00<?, ?it/s]

torch.Size([8, 7])
torch.Size([8, 7])
torch.Size([8, 7])
torch.Size([8, 7])


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.44 GiB (GPU 0; 23.69 GiB total capacity; 13.53 GiB already allocated; 234.69 MiB free; 15.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [71]:
# Scratch cell: shows the repr of the library's Normalize layer; the result is not used.
models.Normalize()

Normalize()

In [None]:
# Scratch cell (never executed). NOTE(review): Dense is built with in_features and
# out_features elsewhere in this notebook, so this bare call presumably fails — remove or complete.
models.Dense()

In [None]:
# Scratch cell (never executed). NOTE(review): `features` is only defined as a parameter of
# ReshapeAndNormalize.forward in the visible cells, so this likely raises NameError on a fresh kernel.
F.normalize(features['sentence_embedding'], p=2, dim=1)