In [1]:
from typing import List, Dict, Any

from collections import defaultdict
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

import ml_metrics
import plotly.graph_objs as go
import plotly.graph_objects as go
import plotly.express as px

import random
from keywords import datasets, extractors

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wilcoln/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/wilcoln/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


## Randomly choose data

In [2]:
def get_data(name: str, 
             n_: int):
    '''
    This function will fetch the data based on the name of the dataset. It will:
       1. load the dataset
       2. get rid of records in which # of keywords > # of words in the original text
       3. randomly sample numbers of dataset to use, based on 2nd parameter: n
    '''
    from keywords import datasets
    data = datasets.load(name)
    
    data_ = data['corpora']
    new_data = []
    
    # get rid of data in which # of keywords > # words in text
    for i in range(len(data_)):
        text = data_[i]['text']
        keys = data_[i]['keywords']
        
        if len(text.split()) > len(keys) + 30:
            new_data.append(data_[i])
    
    # sample the data without replacements
    test_texts = random.sample(new_data, n_)
    
    
    
    return test_texts

## Initialize the algoritms

In [3]:
def get_output(documents: List[str],
                text:str, 
                keys:List[str]):
    '''
    This function will initialize all algorithms,
    and output the results from them. It then wrap 
    all results into a dictionary. 
    
    Notice: 
        If new algorithms has been added in, then new
        initialization code needs to be added here
    '''
    
    test_extractors = [
        extractors.YakeExtractor,
        extractors.TextRankExtractor,
        extractors.KPMinerExtractor,
        extractors.TfIdfExtractor,
        extractors.RakeExtractor,
        extractors.TopicRankExtractor,
        extractors.SingleRankExtractor,
        extractors.PositionRankExtractor,
        extractors.MultipartiteRankExtractor,
#         extractors.KeyBertExtractor,
    ]

    output_dic = []
    for extractor in test_extractors:
        model = extractor(n_gram=4, total_keywords_in_training=300, documents=documents)
        tuples = [(item['keyword'], item['score']) for item in model.predict(text=text, topn=len(keys))]
        sorted_list = [item[0] for item in tuples]
        output_dic.append({model.__class__.__name__: sorted_list, 'expected': keys})
    
    return output_dic

In [10]:
# average precison
def apk(actual: List[str], 
        predicted: List[str], 
        k: int):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

# mean average precision
def mapk(actual: List[str], 
         predicted: List[str], 
         k: int):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

def visual_bench(name: str, n:int, k: int=None):
    """
    Visualize mean average precision for each algorithm given random data 
    Parameters
    ----------
    name: the name of the original dataset
                 
    k : int, optional
        The maximum number of predicted elements, if it's none, then it will be set equals to # of goldkeys for each pair
        
    n: int
        The number of data that needs to be choose
        
    Returns
    -------
     map scores for each algorithms : Pandas dataframe
            A pandas dataframe containing map scores for each algorithm
    """
    data = get_data(name, n)
    xs = []
    output_dict_ = []
    for i in range(len(data)):
        text = data[i]['text']
        keys = data[i]['keywords']
    
        if k == None:
            k = len(keys)
        
        output_dict = get_output([text],text, keys)
        output_dict_.append(output_dict)
        xs.append(data[i]['id']) # store dataid
    
    
    # get map scores for each data
    scores = [] 
    for i in range(len(output_dict_)):
        results = output_dict_[i]
        
        mpk_scores = []
        for j in range(len(results)):
            predicted = results[j][f'{list(results[j].keys())[0]}']
            actual = results[j][f'{list(results[j].keys())[1]}']
            mapk_score = mapk(actual, predicted, k)
            
            mpk_scores.append({f'{list(results[j].keys())[0]}': mapk_score})
            
        scores.append(mpk_scores)
    
    # formalize scores
    scores_ = []
    for item in scores:
        s = []
        for i in item:
            s.extend(list(i.values()))
        scores_.append(s)
        
    # retrieve name of algoritms    
    algo = [list(item.keys())[0] for item in scores[0]]

    # create a dataframe
    df = pd.DataFrame(data = scores_, index = xs, columns = algo).reset_index()\
                    .rename(columns={"index":'text'})
        
    # visualization
    df_=pd.melt(df,id_vars=['text'],var_name='algo', value_name='MAP value')
    fig = px.bar(df_, 
             x="text", 
             color="algo",
             y='MAP value',
             title="Mean Average Precision",
             barmode='group',
            )

    fig.show()
    
    return df


In [11]:
visual_bench(name='500N-KPCrowd-v1.1', n=10, k=None)


Mean of empty slice.


invalid value encountered in double_scalars



Unnamed: 0,text,YakeExtractor,TextRankExtractor,KPMinerExtractor,TfIdfExtractor,RakeExtractor,TopicRankExtractor,SingleRankExtractor,PositionRankExtractor,MultipartiteRankExtractor
0,health-20941742,0.252192,0.16527,0.134444,0.306566,0.182344,0.158011,0.199834,0.152816,0.231822
1,sports-20954133,0.171345,0.163714,0.214694,0.183172,0.20037,0.189796,0.195935,0.161171,0.168535
2,art_and_culture-20927516,0.171543,0.178411,0.078974,0.192424,0.191976,0.179303,0.218862,0.145034,0.151441
3,art_and_culture-20900470,0.196294,0.182044,0.189602,0.256757,0.204535,0.199132,0.216046,0.235517,0.160002
4,art_and_culture-20924855,0.223431,0.171097,0.189826,0.169709,0.169985,0.159899,0.213642,0.188371,0.132202
5,politics_us-20952164,0.236509,0.125668,,0.244724,0.171408,0.238582,0.150124,0.320472,0.294671
6,crime-20955365,0.148509,0.231578,0.161037,0.208023,0.175247,0.124222,0.218461,0.173089,0.157529
7,health-20937946,0.179519,0.172604,0.147727,0.277724,0.202022,0.148527,0.185664,0.225334,0.178873
8,tech-20949665,0.182077,0.206927,0.173013,0.212261,0.18326,0.183802,0.194885,0.185163,0.200986
9,sports-20953328,0.22761,0.178744,0.134699,0.247108,0.219195,0.197171,0.186193,0.189241,0.186495
