In [22]:
import numpy as np
import pandas as pd
from itertools import combinations
# from calibration.Calibration import Calibration 
# Load the stored recommendation rankings and keep the first entry along the
# leading axis of the saved array.
reco_matrix = np.load("test/reco_matrix.npy")[0]
# scores = np.load("reco_matrix_mapped_scores.npy")[0]

# Quick shape check of the loaded rankings.
print(np.shape(reco_matrix))
# print(np.shape(scores))

(943, 10)


In [None]:
# Item metadata: one row per movie, with the genres stored as a single
# pipe-separated string in the "genres" column.
# NOTE(review): this path spells the directory "ml-100K" while the users file
# is read from "ml-100k"; on a case-sensitive filesystem only one of the two
# can exist — confirm which spelling is correct.
movies = pd.read_csv(
    "./data/ml-100K/i_id_mapping_genre.csv",
    sep="\t",
    names=[
        "item_id",
        "Name",
        "genres",
        "itemID",
    ],
    header=0,
    # encoding="latin-1",
)
movies = movies.drop(columns=["item_id"]).sort_values(by="itemID")

# Genre labels that become one-hot indicator columns, in this order.
unique_genres = [
    "Action",
    "Thriller",
    "Romance",
    "Western",
    "Children's",
    "Mystery",
    "Fantasy",
    "Film-Noir",
    "Documentary",
    "Comedy",
    "Adventure",
    "Sci-Fi",
    "Horror",
    "Crime",
    "Musical",
    "War",
    "Animation",
    "Drama",
]

# Vectorised one-hot encoding of the pipe-separated genre strings.
# `reindex` pins the columns to exactly `unique_genres`: genres that never
# occur still get an all-zero column, and labels outside the list are ignored
# instead of silently creating extra columns. Rows with a missing genre string
# get all zeros rather than raising.
genre_flags = (
    movies["genres"]
    .str.get_dummies(sep="|")
    .reindex(columns=unique_genres, fill_value=0)
)
movies = movies.join(genre_flags)

# User demographics table, ordered by the mapped user id.
users = pd.read_csv(
    "./data/ml-100k/u_id_mapping_demographic_.csv",
    sep="\t",
).sort_values(by="userID")

# users = users.drop(columns=users.columns[0])

# Encode gender as an integer code: "M" -> 0, "F" -> 1.
gender_map = {"M": 0, "F": 1}
users["Gender"] = users["Gender"].map(gender_map)

# NumPy array holding the full (encoded) user table.
user_features_numpy = users.to_numpy()

users



In [131]:
from math import comb
class GenrePrecisionMulti:
    """Genre-exposure disparity between the groups of a user attribute.

    For every user, the genre make-up of their top-k recommendations is
    computed. These per-user distributions are averaged within each group of a
    chosen user attribute, and the groups are then compared pairwise with
    absolute differences.
    """

    def __init__(self, gender_df, unique_genres, top_k, **kwargs):
        """
        Store the user table and the evaluation settings.

        Parameters
        ----------
        gender_df : pd.DataFrame
            User table with a "userID" column plus the attribute column(s)
            that will be passed to ``compute`` as ``sensitive_attr``.
        unique_genres : sequence of str
            Names of the genre indicator columns expected in the item table.
        top_k : int
            Number of leading recommendations per user to evaluate.
        **kwargs
            Accepted for interface compatibility; currently unused.
        """
        self.gender_df = gender_df
        self.unique_genres = unique_genres
        self.top_k = top_k

    def compute(self, reco_matrix, item_df, sensitive_attr):
        """
        Score the genre-exposure disparity of a set of recommendations.

        Parameters
        ----------
        reco_matrix : array-like, shape (n_users, n_recommendations)
            Ranked recommended item ids; row ``i`` is treated as the list of
            the user whose "userID" is ``i``. Only the first ``top_k``
            columns are evaluated.
        item_df : pd.DataFrame
            Item table with an "itemID" column and one indicator column per
            entry of ``unique_genres``.
        sensitive_attr : str
            Column of the user table whose groups are compared.

        Returns
        -------
        tuple (float, np.ndarray)
            The per-genre disparities summed over all genres, and the
            per-genre disparities themselves in ``unique_genres`` order
            (see ``pairwise_abs_diff``).
        """
        genres = list(self.unique_genres)
        reco_matrix = np.asarray(reco_matrix)

        # Score exactly the leading top_k columns so the id/rank columns built
        # below always have the same length as the flattened item ids, even
        # when the matrix is wider (or narrower) than top_k.
        k = min(self.top_k, reco_matrix.shape[1])
        top_items = reco_matrix[:, :k]
        n_users = top_items.shape[0]

        df_reco = pd.DataFrame(
            {
                "userID": np.repeat(np.arange(n_users), k),
                "itemID": top_items.flatten(),
                "rank": np.tile(np.arange(1, k + 1), n_users),
            }
        )

        # Inner join: recommended ids that are absent from item_df are dropped.
        merged_df = pd.merge(df_reco, item_df, on="itemID", how="inner")

        # Spread each item's unit weight evenly over the genres it carries.
        # NOTE(review): an item with none of the listed genres has a zero row
        # sum, so its shares come out as NaN and are skipped by the mean
        # below — confirm that excluding such items is intended.
        genre_flags = merged_df[genres]
        genre_share = genre_flags.div(genre_flags.sum(axis=1), axis=0)

        # Per-user average share: essentially precision with "belongs to the
        # genre" in place of relevance.
        reco_distribution = genre_share.groupby(merged_df["userID"]).mean()

        g_reco_distribution = self.get_sensitive_attr_genre_dist(
            reco_distribution, sensitive_attr
        )
        return self.pairwise_abs_diff(g_reco_distribution)

    def get_sensitive_attr_genre_dist(self, user_reco, sensitive_attr):
        """
        Average the per-user genre distributions within each attribute group.

        Parameters
        ----------
        user_reco : pd.DataFrame
            Recommended genre distribution for all users, keyed by "userID".
        sensitive_attr : str
            Column of the user table to group by.

        Returns
        -------
        pd.DataFrame
            One row per attribute value (sorted by value), one column per
            genre.
        """
        genres = list(self.unique_genres)
        recomen_df = pd.merge(user_reco, self.gender_df, on="userID")
        group_means = recomen_df.groupby(sensitive_attr)[genres].mean()
        return group_means.sort_index()

    def pairwise_abs_diff(self, sensitive_attr_genre_dist):
        """
        Mean absolute difference over all unordered pairs of groups.

        Parameters
        ----------
        sensitive_attr_genre_dist : pd.DataFrame
            One row per group, one column per genre.

        Returns
        -------
        tuple (float, np.ndarray)
            ``(total, per_genre)`` where ``per_genre[i]`` is the mean absolute
            difference between group pairs for ``unique_genres[i]`` and
            ``total`` is the sum of those values. With fewer than two groups
            there is nothing to compare and both are zero.
        """
        genres = list(self.unique_genres)
        n_groups = len(sensitive_attr_genre_dist)
        possible_comb = comb(n_groups, 2)
        if possible_comb == 0:
            # No pair of groups exists; report zero disparity rather than
            # dividing by zero.
            return 0.0, np.zeros(len(genres))

        total = 0.0
        genre_dist = []
        for genre in genres:
            genre_pref = sensitive_attr_genre_dist[genre].values
            genre_total = 0.0
            for si, sj in combinations(range(n_groups), 2):
                gap = abs(genre_pref[si] - genre_pref[sj])
                total += gap
                genre_total += gap
            genre_dist.append(genre_total)

        return total / possible_comb, np.array(genre_dist) / possible_comb

In [132]:
#### IR metrics ####

# Evaluation settings: list length per user and the user attribute whose
# groups are compared.
top_k = 10
sensitive_attr = "Occupation_Code"

# Build the metric, then score the loaded recommendations.
gp = GenrePrecisionMulti(gender_df=users, unique_genres=unique_genres, top_k=top_k)
val, dist = gp.compute(
    reco_matrix=reco_matrix,
    item_df=movies,
    sensitive_attr=sensitive_attr,
)







                   Action  Thriller   Romance   Western  Children's   Mystery  \
Occupation_Code                                                                 
0                0.061350  0.076477  0.068629  0.008650    0.009916  0.020992   
1                0.039048  0.084524  0.062560  0.007143    0.006548  0.034226   
2                0.041667  0.073810  0.063095  0.014286    0.007143  0.014286   
3                0.054105  0.079211  0.065596  0.003509    0.008684  0.026228   
4                0.065000  0.076368  0.048458  0.007463    0.006219  0.019030   
5                0.085093  0.095833  0.056389  0.016667    0.005093  0.027315   
6                0.077240  0.096875  0.067240  0.004167    0.007812  0.023958   
7                0.064896  0.081250  0.059687  0.006250    0.021354  0.015625   
8                0.104286  0.094048  0.087619  0.000000    0.007143  0.017857   
9                0.059444  0.094444  0.065000  0.016667    0.005556  0.026389   
10               0.037157  0

In [133]:
# Sanity check: the per-genre disparities should add up to the overall
# score `val` (up to floating-point rounding).
sum(abs(dist))


0.22287142582228192

In [134]:
# Overall disparity score returned by compute() (first element of its result).
val

0.22287142582228225

In [123]:
# Genre order; the entries of `dist` follow this same order.
unique_genres

['Action',
 'Thriller',
 'Romance',
 'Western',
 "Children's",
 'Mystery',
 'Fantasy',
 'Film-Noir',
 'Documentary',
 'Comedy',
 'Adventure',
 'Sci-Fi',
 'Horror',
 'Crime',
 'Musical',
 'War',
 'Animation',
 'Drama']