In [2]:
import os
import pandas as pd
from typing import List, Tuple, Union

In [3]:
# Relative locations of the CSV inputs used by this notebook.
data_root_path = '../data'
test_data_path = os.path.join(data_root_path, 'test_data.csv')
phraze_classes_test_path = os.path.join(data_root_path, 'phraze_classes_test.csv')

# Load both tables; `test_data` is not referenced again in the cells shown here.
test_data = pd.read_csv(test_data_path)
phraze_classes_test = pd.read_csv(phraze_classes_test_path)
# Index the frame by phraze so a row can be looked up with `.loc[phraze]`;
# the 'phraze' column itself stays in the frame.
phraze_classes_test.index = phraze_classes_test['phraze']
# Second name for the same frame object (not a copy), used by the cells below.
PHRAZE_CLASSES_TEST = phraze_classes_test

In [4]:
def get_phraze_class(
    q: str,
    phraze_classes_df: pd.DataFrame
) -> set:
    """Get all phrazes of the same class as given phraze.

    Parameters
    ----------
    q : str
        Phraze (query). Must be present in the index of `phraze_classes_df`.
    phraze_classes_df : pd.DataFrame
        Data frame of (phraze, class), where phraze is index.

    Return
    ------
    class_phrazes : set
        Unique phrazes of the same class as given phraze (the query itself
        included). If the query label occurs several times in the index,
        the phrazes of every class it is listed under are returned.

    Raises
    ------
    KeyError
        If `q` is not in the index.
    """

    # Single `.loc[row, col]` lookup instead of chained `.loc[q]['class']`.
    target_class = phraze_classes_df.loc[q, 'class']

    if isinstance(target_class, pd.Series):
        # Duplicate index label: `.loc` yields one value per matching row.
        # Comparing the column to that Series with `==` would raise, so
        # match against the distinct class values instead.
        same_class = phraze_classes_df['class'].isin(target_class.unique())
    else:
        same_class = phraze_classes_df['class'] == target_class

    # Positional boolean mask, so duplicate index labels cannot affect alignment.
    class_phrazes = set(phraze_classes_df.index[same_class.to_numpy()])
    return class_phrazes

In [5]:
# 4. Truncate to the first n predictions or pad with zeros:
def complete_predict(
    n: int,
    ordered_list_of_pred: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
    """Force the prediction list to length n by truncating or zero-padding.

    Parameters
    ----------
    n : int
        Amount of phrazes in target class.
    ordered_list_of_pred : List[Tuple[str, float]]
        List of ordered predictions from model. The list is taken as given;
        no sorting is done here.

    Return
    ------
    completed : List[Tuple[str, float]]
        The first n predictions, followed by ('', 0) placeholders when
        fewer than n predictions were supplied.
    """

    head = ordered_list_of_pred[:n]
    # A non-positive shortfall produces no padding at all.
    shortfall = n - len(head)
    padding = [('', 0) for _ in range(shortfall)]
    return head + padding

In [6]:
# Sample (phraze, score) predictions for the demo cells below.
# Note that the scores are not sorted.
test_pred = [
    ('pr_1', 0.4),
    ('MFP Michelin', 0.8),
    ('pr_3', 0.55),
    ('Michelin India Technology Center Llp', 0.3),
]

# Query phraze whose class is evaluated in the following cells.
q = 'Shanghai Michelin Tire Co Ltd'

In [7]:
# All phrazes sharing the query's class; the set includes the query itself.
class_q = get_phraze_class(q, PHRAZE_CLASSES_TEST)
class_q

{'MFP Michelin',
 'Michelin Americas Research',
 'Michelin Espana Portugal Sa',
 'Michelin Group',
 'Michelin India Private Ltd',
 'Michelin India Tamilnadu Tyres Private Ltd',
 'Michelin India Technology Center',
 'Michelin India Technology Center Llp',
 'Michelin Na Asrc',
 'Michelin North America Canada Inc',
 'Michelin North America Inc',
 'Michelin Research And Development Center Shanghai Co Ltd',
 'Michelin Shenyang Tire Co Ltd',
 'Michelin Shenyang Tyre Co Ltd',
 'Michelin Siam Co Ltd',
 'Michelin Tyre Plc',
 'Shanghai Michelin Tire Co Ltd',
 'Shanghai Michelin Warrior Tyre Co Ltd',
 'Sociedade Michelin De Participacoes Ind E Comercio Ltda',
 'Sociedade Michelin De Participacoes Indust E Comercio L'}

In [8]:
# Pad the sample predictions up to the class size and preview the first 7 entries.
q_class_n = len(class_q)
t = complete_predict(q_class_n, test_pred)
t[:7]

[('pr_1', 0.4),
 ('MFP Michelin', 0.8),
 ('pr_3', 0.55),
 ('Michelin India Technology Center Llp', 0.3),
 ('', 0),
 ('', 0),
 ('', 0)]

In [9]:
# 5. Convert scores to {0, 1} using the threshold:
def smooth_by_threshold(
    p: float,
    list_of_pred: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
    """Binarize prediction scores against a threshold.

    Parameters
    ----------
    p : float
        Threshold.
    list_of_pred : List[Tuple[str, float]]
        List of predictions from model.

    Return
    ------
    smoothed : List[Tuple[str, float]]
        Same phrazes in the same order; each score is replaced by the
        integer 1 when it is strictly greater than p, otherwise 0.
    """

    smoothed = []
    for phraze, score in list_of_pred:
        # Strict comparison: a score equal to the threshold maps to 0.
        flag = 1 if score > p else 0
        smoothed.append((phraze, flag))
    return smoothed

In [10]:
# Binarize the padded predictions at threshold 0.5 and preview the first 7 entries.
u = smooth_by_threshold(0.5, t)
u[:7]

[('pr_1', 0),
 ('MFP Michelin', 1),
 ('pr_3', 1),
 ('Michelin India Technology Center Llp', 0),
 ('', 0),
 ('', 0),
 ('', 0)]

In [11]:
def y_true_func(
    true_phrazes: set,
    list_of_pred: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
    """Keep the value of predictions that hit a true phraze, zero the rest.

    Parameters
    ----------
    true_phrazes : set
        Set of true phrazes.
    list_of_pred : List[Tuple[str, float]]
        List of predictions from model.

    Return
    ------
    checked : List[Tuple[str, float]]
        Same phrazes in the same order; the value is multiplied by 1 when
        the phraze is in `true_phrazes` and by 0 otherwise.
    """

    checked = []
    for phraze, value in list_of_pred:
        weight = 1 if phraze in true_phrazes else 0
        checked.append((phraze, weight * value))
    return checked

In [12]:
# Zero out binarized predictions whose phraze is not in the query's class.
r = y_true_func(class_q, u)
r[:7]

[('pr_1', 0),
 ('MFP Michelin', 1),
 ('pr_3', 0),
 ('Michelin India Technology Center Llp', 0),
 ('', 0),
 ('', 0),
 ('', 0)]

In [13]:
# 6. Precision
def ranking_precision(
    n: int,
    list_of_pred: List[Tuple[str, float]]
) -> float:
    """Sum the prediction values and divide by n.

    Parameters
    ----------
    n : int
        Divisor; the callers in this notebook pass the size of the target
        class. Zero raises ZeroDivisionError.
    list_of_pred : List[Tuple[str, float]]
        List of (phraze, value) pairs; only the values are used.

    Return
    ------
    precision : float
        Sum of the values divided by n.
    """
    hits = sum(value for _, value in list_of_pred)
    return hits / n

In [14]:
# Sum of the checked values divided by the class size.
ranking_precision(q_class_n, r)

0.05

In [15]:
def fix_precision_in_empty_case(
    list_of_pred: List[Tuple[str, float]], k=10, m=3
):
    """Score predictions with the formula (1 / (1 + total)) ** m.

    Parameters
    ----------
    list_of_pred : List[Tuple[str, float]]
        List of (phraze, value) pairs; only the values are used.
    k : int
        Accepted but not used by the computation.
    m : int
        Exponent applied to the penalty term.

    Return
    ------
    fixed_precision : float
        (1 / (1 + total)) ** m, where total is the sum of the values;
        1.0 when the values sum to zero.
    """
    total = sum(value for _, value in list_of_pred)
    penalty = 1 / (1 + total)
    return penalty ** m

In [16]:
class RankingPrecision:
    """Callable ranking-precision metric bound to a phraze/class table.

    Parameters
    ----------
    phraze_classes_df : pd.DataFrame
        Data frame of (phraze, class), where phraze is index.
    """

    def __init__(self, phraze_classes_df):
        self.phraze_classes_df = phraze_classes_df

    @staticmethod
    def _evaluate_single_precision(
        class_q: set,
        smoothed_pred: List[Tuple[str, float]]
    ) -> float:
        """Score thresholded predictions against a non-empty target class."""
        pred_vs_true = y_true_func(class_q, smoothed_pred)
        return ranking_precision(len(class_q), pred_vs_true)

    def _precision_at_threshold(
        self,
        p: float,
        class_q: set,
        list_of_preds: List[Tuple[str, float]]
    ) -> float:
        """Compute the metric for one threshold value."""
        smoothed_pred = smooth_by_threshold(p, list_of_preds)
        if not class_q:
            # Nothing can be correct when the class is empty, so the score
            # comes from the empty-case formula instead.
            return fix_precision_in_empty_case(smoothed_pred)
        return self._evaluate_single_precision(class_q, smoothed_pred)

    def __call__(
        self,
        q: str,
        p: Union[float, List[float]],
        list_of_preds: List[Tuple[str, float]]
    ) -> Union[float, List[float]]:
        """Evaluate ranking precision.

        Parameters
        ----------
        q : str
            Phraze (query). It is removed from its own class before
            scoring, so predicting the query itself is not counted as
            correct.
        p : float | List[float]
            Threshold, or several thresholds to evaluate at once. An
            integer threshold is treated as a single threshold.
        list_of_preds : List[Tuple[str, float]]
            List of ordered predictions from model. When the target class
            is non-empty the list is truncated / zero-padded to the class
            size; otherwise it is used as given.

        Return
        ------
        ranking_precision : float | List[float]
            A single value for a single threshold, otherwise one value per
            threshold, in the order given.
        """

        class_q = get_phraze_class(q, self.phraze_classes_df) - {q}

        # Truncating / padding does not depend on the threshold, so do it once.
        if class_q:
            list_of_preds = complete_predict(len(class_q), list_of_preds)

        # Accept ints as well as floats: `isinstance(p, float)` alone would
        # send a threshold such as 0 or 1 into the iteration branch.
        if isinstance(p, (int, float)):
            return self._precision_at_threshold(p, class_q, list_of_preds)

        return [
            self._precision_at_threshold(pi, class_q, list_of_preds)
            for pi in p
        ]

In [17]:
# Callable metric bound to the test phraze/class frame.
rank_prec = RankingPrecision(PHRAZE_CLASSES_TEST)

In [18]:
# Show the sample predictions again for reference.
test_pred

[('pr_1', 0.4),
 ('MFP Michelin', 0.8),
 ('pr_3', 0.55),
 ('Michelin India Technology Center Llp', 0.3)]

In [19]:
# Single float threshold -> a single precision value.
rank_prec(q, 0.5, test_pred)

0.05263157894736842

In [20]:
# No predictions for a query whose class is non-empty.
rank_prec(q, 0.5, [])

0.0

In [21]:
# A list of thresholds -> one precision value per threshold.
rank_prec(q, [0.5, 0.2], test_pred)

[0.05263157894736842, 0.10526315789473684]

In [22]:
# A list of thresholds with no predictions.
rank_prec(q, [0.5, 0.2], [])

[0.0, 0.0]

In [23]:
# NOTE(review): presumably 'Selati Sp A' is the only phraze of its class, so the
# empty-class branch is taken -- confirm against the data.
rank_prec('Selati Sp A', 0.5, [] )

1.0

In [25]:
# Same query with some predictions and two thresholds.
# NOTE(review): presumably the empty-class branch again -- confirm against the data.
rank_prec(
    'Selati Sp A', [0.5, 0.2],
    [
        ('MFP Michelin', 0.8),
        ('pr_3', 0.25)
    ])

[0.125, 0.03703703703703703]

In [31]:
# Rebind the query to another phraze for the next check.
q = 'Michelin India Technology Center Llp'

# Full class of the query, printed for inspection; the set includes the query itself.
y_true = get_phraze_class(q, PHRAZE_CLASSES_TEST)
print(f'y_true: {y_true}')

# Predictions that all carry a score of 0.
y_pred = [
    ('MFP Michelin', 0),
    ('phraze_1', 0),
    ('phraze_2', 0),
    ('phraze_3', 0),
    ('phraze_4', 0),
]

y_true: {'Sociedade Michelin De Participacoes Indust E Comercio L', 'Michelin Research And Development Center Shanghai Co Ltd', 'MFP Michelin', 'Shanghai Michelin Warrior Tyre Co Ltd', 'Michelin India Technology Center Llp', 'Michelin Espana Portugal Sa', 'Michelin India Private Ltd', 'Michelin North America Canada Inc', 'Michelin Group', 'Michelin Tyre Plc', 'Shanghai Michelin Tire Co Ltd', 'Michelin India Technology Center', 'Michelin Siam Co Ltd', 'Sociedade Michelin De Participacoes Ind E Comercio Ltda', 'Michelin Shenyang Tyre Co Ltd', 'Michelin India Tamilnadu Tyres Private Ltd', 'Michelin Americas Research', 'Michelin North America Inc', 'Michelin Na Asrc', 'Michelin Shenyang Tire Co Ltd'}


In [32]:
# No score of 0 exceeds any of these thresholds, so every prediction binarizes to 0.
rank_prec(
    q , [0.5, 0.2, 0.99], y_pred)

[0.0, 0.0, 0.0]