In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")

import numpy as np
import pandas as pd
from reranking.algs import Reranking
np.__path__

['/Users/longhao.yuan/.pyenv/versions/3.9.0/envs/tensor-research/lib/python3.9/site-packages/numpy']

# Synthetic data

In [2]:
rankings = [i for i in range(100)]
np.random.seed(seed=43)
candidate_ids = np.random.choice(rankings, 100, replace=False)
genders = np.random.choice(["male"]*70 +["female"]*30, 100, replace=False)
locations = np.random.choice(["Tokyo"]*40 +["Kanagawa"]*30 + ["others"]*30, 100, replace=False)
ages = np.random.choice(["20s"]*20 +["30s"]*40 + ["40s"]*20 + ["others"]*20, 100, replace=False)
df_pseudo = pd.DataFrame({'candidate_id':candidate_ids, 'model_rank': rankings, "gender":genders, "age":ages, "location": locations})
df_pseudo["age_location"] = df_pseudo.age + "_" + df_pseudo.location
df_pseudo.head(10)

Unnamed: 0,candidate_id,model_rank,gender,age,location,age_location
0,20,0,male,others,Tokyo,others_Tokyo
1,2,1,male,30s,Kanagawa,30s_Kanagawa
2,15,2,female,30s,others,30s_others
3,22,3,female,40s,Tokyo,40s_Tokyo
4,57,4,female,others,Tokyo,others_Tokyo
5,91,5,male,others,Kanagawa,others_Kanagawa
6,69,6,male,30s,Tokyo,30s_Tokyo
7,55,7,male,20s,others,20s_others
8,11,8,male,20s,others,20s_others
9,79,9,male,30s,Kanagawa,30s_Kanagawa


# Experiments

In [3]:
print(df_pseudo.age_location.value_counts(normalize=True).to_dict())

{'30s_Kanagawa': 0.2, '20s_Tokyo': 0.11, 'others_Tokyo': 0.1, '30s_others': 0.1, '30s_Tokyo': 0.1, '40s_Tokyo': 0.09, '20s_others': 0.07, 'others_others': 0.07, '40s_others': 0.06, '40s_Kanagawa': 0.05, 'others_Kanagawa': 0.03, '20s_Kanagawa': 0.02}


In [4]:
dedired_distri1 = {"male": 0.7, "female": 0.3}
dedired_distri2 = {"20s": 0.3, "30s": 0.3, "40s": 0.3, "others": 0.1}
dedired_distri3 = {"Tokyo": 0.3, "Kanagawa": 0.3, "others": 0.4}
dedired_distri4 = {
        "30s_Kanagawa": 0.2,
        "20s_Tokyo": 0.1,
        "others_Tokyo": 0.0,
        "30s_others": 0.0,
        "30s_Tokyo": 0.3,
        "40s_Tokyo": 0.1,
        "20s_others": 0.0,
        "others_others": 0.0,
        "40s_others": 0.0,
        "40s_Kanagawa": 0.1,
        "others_Kanagawa": 0.0,
        "20s_Kanagawa": 0.2,
    }

r1 = Reranking(candidate_ids, genders, dedired_distri1)
r2 = Reranking(candidate_ids, ages, dedired_distri2)
r3 = Reranking(candidate_ids, locations, dedired_distri3)
r4 = Reranking(candidate_ids, df_pseudo.age_location.tolist(), dedired_distri4)

df1 = r1.re_rank(algorithm= "det_greedy", verbose=True, k_max=10)
df2 = r2.re_rank(algorithm= "det_cons", verbose=True, k_max=10)
df3 = r3.re_rank(algorithm= "det_relaxed", verbose=True, k_max=10)
df4 = r4.re_rank(algorithm= "det_const_sort", verbose=True, k_max=10)

In [5]:
for dd, ranker, df, name in zip(
    [dedired_distri1, dedired_distri2, dedired_distri3, dedired_distri4],
    [r1, r2, r3, r4],
    [df1, df2, df3, df4],
    ["det_greedy", "det_cons", "det_relaxed", "det_const_sort"],
):
    print(f"Algorithm: {name} ")
    print(f"Desired distribution: {dd}")
    print(
        f"Before re-ranking: {ranker.df_formated.loc[:10].attribute.value_counts(normalize=True).to_dict()}"
    )
    print(f"After re-ranking: {df.attribute.value_counts(normalize=True).to_dict()}")
    print()

Algorithm: det_greedy 
Desired distribution: {'male': 0.7, 'female': 0.3}
Before re-ranking: {'male': 1.0}
After re-ranking: {'male': 0.7, 'female': 0.3}

Algorithm: det_cons 
Desired distribution: {'20s': 0.3, '30s': 0.3, '40s': 0.3, 'others': 0.1}
Before re-ranking: {'others': 1.0}
After re-ranking: {'30s': 0.3, '40s': 0.3, '20s': 0.3, 'others': 0.1}

Algorithm: det_relaxed 
Desired distribution: {'Tokyo': 0.3, 'Kanagawa': 0.3, 'others': 0.4}
Before re-ranking: {'Tokyo': 1.0}
After re-ranking: {'others': 0.4, 'Tokyo': 0.3, 'Kanagawa': 0.3}

Algorithm: det_const_sort 
Desired distribution: {'30s_Kanagawa': 0.2, '20s_Tokyo': 0.1, 'others_Tokyo': 0.0, '30s_others': 0.0, '30s_Tokyo': 0.3, '40s_Tokyo': 0.1, '20s_others': 0.0, 'others_others': 0.0, '40s_others': 0.0, '40s_Kanagawa': 0.1, 'others_Kanagawa': 0.0, '20s_Kanagawa': 0.2}
Before re-ranking: {'others_Tokyo': 0.9090909090909091, '30s_Kanagawa': 0.09090909090909091}
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kana

# Test if greedy algorithms will be infeasible when attribute number is over 3
(Seems to perform normally in the high attribute number condition.)

In [6]:
print(f"Test greedy algorithms with {len(dedired_distri4)} attributes.")
print(f"Desired distribution: {dedired_distri4}")
print(f"Before re-ranking: {r4.df_formated.loc[:10].attribute.value_counts(normalize=True).to_dict()}")

for name in ["det_greedy", "det_cons", "det_relaxed"]:
    df = r4.re_rank(algorithm= name, verbose=True, k_max=10)
    print()
    print(f"Algorithm: {name}: ")
    print(f"After re-ranking: {df.attribute.value_counts(normalize=True).to_dict()}") 


Test greedy algorithms with 12 attributes.
Desired distribution: {'30s_Kanagawa': 0.2, '20s_Tokyo': 0.1, 'others_Tokyo': 0.0, '30s_others': 0.0, '30s_Tokyo': 0.3, '40s_Tokyo': 0.1, '20s_others': 0.0, 'others_others': 0.0, '40s_others': 0.0, '40s_Kanagawa': 0.1, 'others_Kanagawa': 0.0, '20s_Kanagawa': 0.2}
Before re-ranking: {'others_Tokyo': 0.9090909090909091, '30s_Kanagawa': 0.09090909090909091}

Algorithm: det_greedy: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '40s_Tokyo': 0.1, '20s_Tokyo': 0.1, '40s_Kanagawa': 0.1}

Algorithm: det_cons: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '40s_Tokyo': 0.1, '20s_Tokyo': 0.1, '40s_Kanagawa': 0.1}

Algorithm: det_relaxed: 
After re-ranking: {'30s_Tokyo': 0.3, '30s_Kanagawa': 0.2, '20s_Kanagawa': 0.2, '40s_Tokyo': 0.1, '20s_Tokyo': 0.1, '40s_Kanagawa': 0.1}
