In [70]:
from typing import List, Union, Any
from operator import itemgetter
import copy
import numpy as np

from deeppavlov.core.common.registry import register
from deeppavlov.core.common.log import get_logger
from deeppavlov.core.models.component import Component
from utils import flatten_nested_list

logger = get_logger(__name__)


@register("ensemble_ranker")
class EnsembleRanker(Component):

    def __init__(self, top_n=10, active=True, *args, **kwargs):
        self.top_n = top_n
        self.active = active

    def __call__(self, tfidf: List[List[List[Union[Any]]]] = None,
                 tfhub: List[List[List[Union[Any]]]] = None,
                 rnet: List[List[List[Union[Any]]]] = None, *args, **kwargs) -> \
            List[List[List[Union[str, int, float]]]]:

        CHUNK_IDX = 3
        SCORE_IDX = 2
        FAKE_SCORE = 0.001
        NORM_THRESH = 50  # take only the first 50 results to count np.linalg.norm?

        if tfidf is not None:
            tfidf = [[list(el) for el in instance] for instance in tfidf]  # instance is a batch of results given a query
        if rnet is not None:
            rnet = [[list(el) for el in instance] for instance in rnet]
        if tfhub is not None:
            tfhub = [[list(el) for el in instance] for instance in tfhub]

        rankers = [r for r in [tfidf, tfhub, rnet] if r is not None]
        num_rankers = len(rankers)

        def update_all_predictions(predictions, ranker_instance):
            for predicted_chunk in ranker_instance:
                chunk_idx = predicted_chunk[CHUNK_IDX]
                if chunk_idx in instance_data_ids:
                    data_idx = list(map(itemgetter(CHUNK_IDX), predictions)).index(chunk_idx)
                    predictions[data_idx][SCORE_IDX] = flatten_nested_list(
                        predictions[data_idx][SCORE_IDX] + [predicted_chunk[SCORE_IDX]])
                else:
                    predicted_chunk[SCORE_IDX] = [predicted_chunk[SCORE_IDX]]
                    predictions.append(predicted_chunk)

        def normalize_scores(ranker_results):
            """
            Normalize paragraph scores with np.linalg.norm(L2) for each batch
            """
            for instance in ranker_results:
                scores = list(map(itemgetter(SCORE_IDX), instance))
                norm = np.linalg.norm(scores[:NORM_THRESH])
                for pred in instance:
                    pred[SCORE_IDX] = float(pred[SCORE_IDX] / norm)

        # Normalize scores from all tfidf and rnet:
        if tfidf is not None:
            normalize_scores(tfidf)
        if rnet is not None:
            normalize_scores(rnet)

        # Count average scores from all rankers
        all_data = []
        for instances in zip(*rankers):
            print('1:', instances, '\n')
            for item in instances[0]:
                item[SCORE_IDX] = [item[SCORE_IDX]]
            print('2:', instances, '\n')

            instance_predictions = copy.deepcopy(instances[0]); print('ins pred:', instance_predictions, '\n')
            instance_data_ids = set(map(itemgetter(CHUNK_IDX), instance_predictions))

            for i in range(1, len(instances)):
                update_all_predictions(instance_predictions, instances[i])
            print('3:', instance_predictions, '\n')

            for prediction in instance_predictions:
                len_scores = len(prediction[SCORE_IDX])
                assert len_scores <= num_rankers
                if len_scores < num_rankers:
                    prediction[SCORE_IDX] = np.mean(
                        prediction[SCORE_IDX] + (num_rankers - len_scores) * [FAKE_SCORE])
                else:
                    prediction[SCORE_IDX] = np.mean(prediction[SCORE_IDX])

            instance_predictions = sorted(instance_predictions, key=itemgetter(SCORE_IDX), reverse=True)

            if self.active:
                instance_predictions = instance_predictions[:self.top_n]

            for i in range(len(instance_predictions)):
                instance_predictions[i][0] = i

            all_data.append(instance_predictions)

        return all_data

In [71]:
ranker = EnsembleRanker(top_n=2, active=True)

In [None]:
tfidf: List[List[List[Union[Any]]]] = None,
tfhub: List[List[List[Union[Any]]]] = None,
rnet: List[List[List[Union[Any]]]] = None,

In [9]:
rank_text_score_id_tfidf = [[[0,'haha', 1.1, 3], [1, 'lala', 0.9, 5], [2, 'jiujiu', 2.3, 1]],
                            [[0, 'lala', 9.8, 5], [1, 'liu', 1.4, 6], [2, 'biubiu', 8.8, 9]]]
rank_text_score_id_use = [[[0, 'liu', 12, 6], [1, 'miumiu', 0.8, 10], [2, 'jiujiu', 3.5, 1]],
                          [[0, 'lala', 7.7, 5], [1, 'halala', 3, 2],[2, 'liu', 10, 6]]]

In [72]:
ranker(rank_text_score_id_tfidf, rank_text_score_id_use)

1: ([[0, 'haha', 0.4068496968004201, 3], [1, 'lala', 0.33287702465488916, 5], [2, 'jiujiu', 0.8506857296736056, 1]], [[0, 'liu', 12, 6], [1, 'miumiu', 0.8, 10], [2, 'jiujiu', 3.5, 1]]) 

2: ([[0, 'haha', [0.4068496968004201], 3], [1, 'lala', [0.33287702465488916], 5], [2, 'jiujiu', [0.8506857296736056], 1]], [[0, 'liu', 12, 6], [1, 'miumiu', 0.8, 10], [2, 'jiujiu', 3.5, 1]]) 

ins pred: [[0, 'haha', [0.4068496968004201], 3], [1, 'lala', [0.33287702465488916], 5], [2, 'jiujiu', [0.8506857296736056], 1]] 

3: [[0, 'haha', [0.4068496968004201], 3], [1, 'lala', [0.33287702465488916], 5], [2, 'jiujiu', [0.8506857296736056, 3.5], 1], [0, 'liu', [12], 6], [1, 'miumiu', [0.8], 10]] 

1: ([[0, 'lala', 0.7398808151391919, 5], [1, 'liu', 0.10569725930559883, 6], [2, 'biubiu', 0.6643827727780499, 9]], [[0, 'lala', 7.7, 5], [1, 'halala', 3, 2], [2, 'liu', 10, 6]]) 

2: ([[0, 'lala', [0.7398808151391919], 5], [1, 'liu', [0.10569725930559883], 6], [2, 'biubiu', [0.6643827727780499], 9]], [[0, 'lala',

[[[0, 'liu', 6.0005, 6], [1, 'jiujiu', 2.1753428648368027, 1]],
 [[0, 'liu', 5.052848629652799, 6], [1, 'lala', 4.219940407569596, 5]]]

In [35]:
rankers = [rank_text_score_id_tfidf, rank_text_score_id_use]
list(zip(*rankers))

[([[0, 'haha', 1.1, 3], [1, 'lala', 0.9, 5], [2, 'jiujiu', 2.3, 1]],
  [[0, 'liu', 12, 6], [1, 'miumiu', 0.8, 10], [2, 'jiujiu', 3.5, 1]]),
 ([[0, 'lala', 9.8, 5], [1, 'liu', 1.4, 6], [2, 'biubiu', 8.8, 9]],
  [[0, 'lala', 7.7, 5], [1, 'halala', 3, 2], [2, 'liu', 10, 6]])]

In [29]:
arr = np.array([x[2] for x in rank_text_score_id_tfidf[1]])
arr/np.sqrt((arr**2).sum())

array([0.73988082, 0.10569726, 0.66438277])

In [11]:
5.801/2

2.9005

In [13]:
(10 + 1.4)/2

5.7