In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")

import numpy as np
import pandas as pd
# np.__path__

In [2]:
import reranking

# Re-rank a toy list of item attributes towards a 50/50 target mix.
item_attribute = ["a1", "a1", "a1", "a2", "a1", "a1", "a1", "a2", "a2", "a1"]
desired_distribution = {"a1": 0.5, "a2": 0.5}

# Keyword options forwarded to `reranking.rerank`.
rerank_options = dict(
    max_na=None,  # to control the max number of attributes applied
    k_max=None,  # length of output, if None, k_max is the length of `item_attribute`
    algorithm="det_greedy",  # "det_greedy", "det_cons", "det_relaxed", "det_const_sort"
    verbose=False,  # if True, the output is with detailed information
)
rerank_indices = reranking.rerank(
    item_attribute,  # attributes of the ranked items
    desired_distribution,  # desired item distribution
    **rerank_options,
)
print(f"The re-rank indices are: {rerank_indices}.")

# Reorder the attributes by the returned indices, then score the original and
# the re-ranked ordering against the same target with `reranking.ndkl`.
item_attribute_reranked = [item_attribute[idx] for idx in rerank_indices]
before = reranking.ndkl(item_attribute, desired_distribution)
after = reranking.ndkl(item_attribute_reranked, desired_distribution)
print(f"The NDKL metric of before and after re-ranking are {before:.3f} and {after:.3f}, respectively.")

The re-rank indices are: [0, 3, 1, 7, 2, 8, 4, 5, 6, 9].
The NDKL metric of before and after re-ranking are 0.412 and 0.172, respectively.


# Synthetic data

In [3]:
# Build a reproducible synthetic candidate table with 100 rows.
rankings = list(range(100))
np.random.seed(seed=42)  # seed the legacy global RNG so every draw below repeats on re-run

# Each draw takes all 100 elements of its pool without replacement, so the
# category counts of the pool are kept exactly and only the order is random.
# The draws must stay in this order: they share one RNG stream.
candidate_ids = np.random.choice(rankings, 100, replace=False)

gender_pool = ["male"] * 70 + ["female"] * 30
genders = np.random.choice(gender_pool, 100, replace=False)

location_pool = ["Tokyo"] * 40 + ["Kanagawa"] * 30 + ["others"] * 30
locations = np.random.choice(location_pool, 100, replace=False)

age_pool = ["20s"] * 20 + ["30s"] * 40 + ["40s"] * 20 + ["others"] * 20
ages = np.random.choice(age_pool, 100, replace=False)

df_pseudo = pd.DataFrame(
    {
        "candidate_id": candidate_ids,
        "model_rank": rankings,
        "gender": genders,
        "age": ages,
        "location": locations,
    }
)
# Combined attribute built as "<age>_<location>", e.g. "20s_Tokyo".
df_pseudo["age_location"] = df_pseudo["age"] + "_" + df_pseudo["location"]
df_pseudo.head(10)

Unnamed: 0,candidate_id,model_rank,gender,age,location,age_location
0,83,0,female,others,Kanagawa,others_Kanagawa
1,53,1,male,20s,Tokyo,20s_Tokyo
2,70,2,male,30s,others,30s_others
3,45,3,male,40s,others,40s_others
4,44,4,male,others,Tokyo,others_Tokyo
5,39,5,male,others,Tokyo,others_Tokyo
6,22,6,female,30s,Kanagawa,30s_Kanagawa
7,80,7,male,others,Tokyo,others_Tokyo
8,10,8,male,20s,others,20s_others
9,0,9,male,others,others,others_others


# Experiments
Edge cases (e.g., when there are not enough items with a desired attribute) are covered by the unit tests.

In [4]:
from reranking.algs import Reranking

# Target distributions, one per attribute used in the experiments.
# NOTE(review): `dedired` is a typo for `desired`; the names are kept as-is
# because later cells reference them.

# gender
dedired_distri1 = {"male": 0.7, "female": 0.3}

# age
dedired_distri2 = {"20s": 0.3, "30s": 0.3, "40s": 0.3, "others": 0.1}

# location
dedired_distri3 = {"Tokyo": 0.3, "Kanagawa": 0.3, "others": 0.4}

# age x location; zero valued attributes can be removed
dedired_distri4 = {
    "30s_Kanagawa": 0.2,
    "20s_Tokyo": 0.1,
    "others_Tokyo": 0.0,
    "30s_others": 0.0,
    "30s_Tokyo": 0.3,
    "40s_Tokyo": 0.1,
    "20s_others": 0.0,
    "others_others": 0.0,
    "40s_others": 0.0,
    "40s_Kanagawa": 0.1,
    "others_Kanagawa": 0.0,
    "20s_Kanagawa": 0.2,
}

# One `Reranking` instance per (item attributes, target distribution) pair,
# constructed in the order r1..r4.
r1, r2, r3, r4 = (
    Reranking(attributes, target)
    for attributes, target in [
        (genders, dedired_distri1),
        (ages, dedired_distri2),
        (locations, dedired_distri3),
        (df_pseudo.age_location.tolist(), dedired_distri4),
    ]
)

In [5]:
# Pair each ranker with its target distribution and one algorithm, then compare
# the attribute shares of the first 10 rows before and after re-ranking.
experiments = [
    (dedired_distri1, r1, "det_greedy"),
    (dedired_distri2, r2, "det_cons"),
    (dedired_distri3, r3, "det_relaxed"),
    (dedired_distri4, r4, "det_const_sort"),
]

for dd, ranker, algorithm in experiments:
    # With verbose=True the call returns a frame exposing an `attribute` column.
    df = ranker(algorithm=algorithm, verbose=True, k_max=10)
    print(f"Algorithm: `{algorithm}`")

    # Baseline: first 10 rows of the ranker's formatted frame ordered by `model_rank`.
    top10_by_model_rank = ranker.df_formatted.sort_values("model_rank").iloc[:10]
    share_before = top10_by_model_rank.attribute.value_counts(normalize=True).to_dict()
    print(f"Before re-ranking: {share_before}")

    print(f"Desired distribution: {dd}")

    share_after = df.attribute.value_counts(normalize=True).to_dict()
    print(f"After re-ranking: {share_after}")
    print()

Algorithm: `det_greedy`
Before re-ranking: {'male': 0.8, 'female': 0.2}
Desired distribution: {'male': 0.7, 'female': 0.3}
After re-ranking: {'male': 0.7, 'female': 0.3}

Algorithm: `det_cons`
Before re-ranking: {'others': 0.5, '20s': 0.2, '30s': 0.2, '40s': 0.1}
Desired distribution: {'20s': 0.3, '30s': 0.3, '40s': 0.3, 'others': 0.1}
After re-ranking: {'20s': 0.3, '30s': 0.3, '40s': 0.3, 'others': 0.1}

Algorithm: `det_relaxed`
Before re-ranking: {'Tokyo': 0.4, 'others': 0.4, 'Kanagawa': 0.2}
Desired distribution: {'Tokyo': 0.3, 'Kanagawa': 0.3, 'others': 0.4}
After re-ranking: {'others': 0.4, 'Kanagawa': 0.3, 'Tokyo': 0.3}

Algorithm: `det_const_sort`
Before re-ranking: {'others_Tokyo': 0.3, 'others_Kanagawa': 0.1, '20s_Tokyo': 0.1, '30s_others': 0.1, '40s_others': 0.1, '30s_Kanagawa': 0.1, '20s_others': 0.1, 'others_others': 0.1}
Desired distribution: {'30s_Kanagawa': 0.2, '20s_Tokyo': 0.1, 'others_Tokyo': 0.0, '30s_others': 0.0, '30s_Tokyo': 0.3, '40s_Tokyo': 0.1, '20s_others': 0.

# Test whether the greedy algorithms become infeasible with more than 3 attribute categories
(They appear to work normally even with a large number of attribute categories.)

In [6]:
# Run the three greedy variants against the 12-category age x location target.
n_categories = len(dedired_distri4)
print(f"Test greedy algorithms with {n_categories} attribute category situation.")

# NOTE(review): `r4.df_formatted` is read before `r4` is called in this cell;
# presumably it is already populated by the constructor or an earlier call — confirm.
baseline_top10 = r4.df_formatted.sort_values("model_rank").iloc[:10]
print(f"Before re-ranking: {baseline_top10.attribute.value_counts(normalize=True).to_dict()}")
print(f"Desired distribution: {dedired_distri4}")

for algorithm in ("det_greedy", "det_cons", "det_relaxed"):
    df = r4(algorithm=algorithm, verbose=True, k_max=10)
    share_after = df.attribute.value_counts(normalize=True).to_dict()
    print()
    print(f"Algorithm: `{algorithm}`: ")
    print(f"After re-ranking: {share_after}")


Test greedy algorithms with 12 attribute category situation.
Before re-ranking: {'others_Tokyo': 0.3, 'others_Kanagawa': 0.1, '20s_Tokyo': 0.1, '30s_others': 0.1, '40s_others': 0.1, '30s_Kanagawa': 0.1, '20s_others': 0.1, 'others_others': 0.1}
Desired distribution: {'30s_Kanagawa': 0.2, '20s_Tokyo': 0.1, 'others_Tokyo': 0.0, '30s_others': 0.0, '30s_Tokyo': 0.3, '40s_Tokyo': 0.1, '20s_others': 0.0, 'others_others': 0.0, '40s_others': 0.0, '40s_Kanagawa': 0.1, 'others_Kanagawa': 0.0, '20s_Kanagawa': 0.2}

Algorithm: `det_greedy`: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '20s_Tokyo': 0.1, '40s_Tokyo': 0.1, '40s_Kanagawa': 0.1}

Algorithm: `det_cons`: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '20s_Tokyo': 0.1, '40s_Tokyo': 0.1, '40s_Kanagawa': 0.1}

Algorithm: `det_relaxed`: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '20s_Tokyo': 0.1, '40s_Tokyo': 0.1, '40s_Kanagawa': 0.1}
