In [None]:
import gurobipy as gp
from gurobipy import GRB

In [57]:
# tahsin kheya
# last modified 11/12/2024
import pandas as pd
import os
import numpy as np

# import torch
# import time
from logging import getLogger
import random
import numpy as np
from scipy.stats import entropy

# from mpi4py import MPI
# comm = MPI.COMM_WORLD
# rank = comm.Get_rank()
rank = 0


class Calibration(object):
    def __init__(self, config, movies, top_k, unique_genres, users):
        # self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        # self.device = torch.device(self.device)
        self.reco_distribution = []
        self.kl = []
        self.top_k = top_k
        self.gkl = []
        self.gender_df = users
        self.actual_genre_dist = pd.read_csv(
            os.path.join(config["user_genre_dist_file"]),
            sep="\t",
        )
        self.actual_genre_dist = self.actual_genre_dist.sort_values(by="userID")
        self.unique_genres = unique_genres
        self.config = config
        self.item_df = movies
        self.actual_distribution_without_id = self.actual_genre_dist.drop(
            columns=["userID"]
        )
        self.actual_distribution_without_id = self.actual_distribution_without_id.drop(
            columns=["user_timestamp_sum"]
        ).to_numpy()
        self.actual_distribution_gender = []

    def get_all_user_recommended_genre_dist(self, topk_reco):
        df_reco = pd.DataFrame(
            {
                "userID": np.repeat(np.arange(topk_reco.shape[0]), self.top_k),
                "itemID": topk_reco.flatten(),
                "rank": np.tile(np.arange(1, self.top_k + 1), topk_reco.shape[0]),
            }
        )
        df_reco["weight_factor"] = 1 / (df_reco["rank"]) ** 0.1
        df_reco["weight_factor"] = (df_reco["rank"]) * 0 +1
        merged_df = pd.merge(df_reco, self.item_df, on="itemID", how="inner")
        merged_df[self.unique_genres] = merged_df[self.unique_genres].div(
            merged_df[self.unique_genres].sum(axis=1), axis=0
        )
        merged_df[self.unique_genres] = (
            merged_df["weight_factor"].values[:, None] * merged_df[self.unique_genres]
        )
        summed_genre = (
            merged_df.groupby("userID")[self.unique_genres].sum().reset_index()
        )
        weight_total = merged_df.groupby("userID")["weight_factor"].sum()
        summed_genre["user_weight_factor"] = summed_genre["userID"].map(weight_total)
        summed_genre = (
            merged_df.groupby("userID")[self.unique_genres].sum().reset_index()
        )
        summed_genre = summed_genre[self.unique_genres].div(
            summed_genre["user_weight_factor"], axis=0
        )

        return summed_genre

    def get_recom_distribution(self, reco, uid):
        reco = np.array(reco)
        df_reco = pd.DataFrame(
            {
                "userID": np.repeat(uid, len(reco)),
                "itemID": reco.flatten(),
                "rank": np.tile(np.arange(1, len(reco) + 1), 1),
            }
        )
        df_reco["weight_factor"] = 1 / (df_reco["rank"]) ** 0.1
        merged_df = pd.merge(df_reco, self.item_df, on="itemID", how="inner")

        merged_df[self.unique_genres] = merged_df[self.unique_genres].div(
            merged_df[self.unique_genres].sum(axis=1), axis=0
        )
        merged_df[self.unique_genres] = (
            merged_df["weight_factor"].values[:, None] * merged_df[self.unique_genres]
        )
        summed_genre = (
            merged_df.groupby("userID")[self.unique_genres].sum().reset_index()
        )

        weight_total = merged_df.groupby("userID")["weight_factor"].sum()
        summed_genre["user_weight_factor"] = summed_genre["userID"].map(weight_total)

        summed_genre = summed_genre[self.unique_genres].div(
            summed_genre["user_weight_factor"], axis=0
        )

        return summed_genre[self.unique_genres].to_numpy()

    def get_kl_div(self, q_dist, p_dist, a, uid):
        qg_u = (1 - a) * q_dist + a * p_dist
        kl_div = entropy(p_dist, qg_u, base=10)

        return kl_div

    def compute_diversity_score(self, reco_items, uid, l, scores, b):
        alpha = 0.01
        sum_score = 0

        # -----------------the calibration term----------------------------------

        reco_dist = self.get_recom_distribution(reco_items, uid)[0]
        kl = self.get_kl_div(
            reco_dist, self.actual_distribution_without_id[uid], alpha, uid
        )

        # -----------------the calibration term----------------------------------

        # -----------------the fairness term----------------------------------
        # recommended dist mean for each gender
        male_user_ids = self.gender_df[self.gender_df["Gender"] == 0]["userID"]
        male_user_ids = male_user_ids.to_list()
        gender_genre_dist = self.actual_distribution_gender
        if uid in male_user_ids:
            compare_dist = gender_genre_dist[self.unique_genres].to_numpy()[
                1
            ]  # this is the female avg genre pref
        else:
            compare_dist = gender_genre_dist[self.unique_genres].to_numpy()[0]

        gender_kl = self.get_kl_div(reco_dist, compare_dist, alpha, uid)

        # -----------------the fairness term----------------------------------

        for r in range(len(reco_items)):
            sum_score += scores[reco_items[r]]

        return (1 - b) * sum_score - b * gender_kl
        # return (1 - l - b) * sum_score - l * kl - b * gender_kl

    def get_improved_reco(self, top_items, items, scores):
        return self.get_new_recommendations(
            reco=top_items, scores=scores, all_items=items
        )

    def get_gender_genre_dist(self):
        actual_dist_gender = pd.merge(
            self.actual_genre_dist, self.gender_df, on="userID"
        )
        gender_genre_weights_a = actual_dist_gender.groupby("Gender")[
            self.unique_genres
        ].mean()
        self.actual_distribution_gender = gender_genre_weights_a.sort_index()

    def get_new_recommendations(self, reco, scores, all_items):
        """reco is 6040x50 and scores is 6040x3416"""
        self.get_gender_genre_dist()
        b = 0.69  # beta for the fairness term
        l = 0.29  # lambda for the calibration term
        all_users = []
        top_k = self.top_k
        num_users = len(scores)
        # for u in range(rank*2,rank*2+2):
        upper_bound = min(num_users, rank * 2 + 2)
        for u in range(rank * 2, upper_bound):
            remaining_items = list(range(10))
            # remaining_items = all_items
            u_calibrated = []
            for k in range(top_k):
                diversity_scores = [
                    self.compute_diversity_score(u_calibrated + [i], u, l, scores[u], b)
                    for i in remaining_items
                ]
                max_index = np.argmax(diversity_scores)

                best_item = remaining_items[max_index]
                u_calibrated.append(best_item)
                remaining_items.pop(max_index)
                print(u_calibrated)
            print(f"user {u} u_calibrate {u_calibrated}")

            all_users.append(u_calibrated)

        return np.array(all_users)

In [58]:
import numpy as np
import pandas as pd

# from calibration.Calibration import Calibration
# from mpi4py import MPI
import csv
import time
import os

# load rankings
# print(pwd)
print(os.getcwd())
start_time = time.time()
print("Loading data...")
data_load_start = time.time()
reco_matrix = np.load("test/reco_matrix.npy")[0]
# scores = np.load("reco_matrix_mapped_scores.npy")[0]
print(reco_matrix.shape)
# print(scores.shape)
import pickle

with open("test/score_dicts.pkl", "rb") as file:
    scores = pickle.load(file)
print(f"{os.getcwd()} yo yo ")

# print("Loaded List of OrderedDicts:", scores)
df_m = pd.read_csv(
    "data/ml-100k/u.item",
    sep="|",
    names=[
        "movieID",
        "Name",
        "Date",
        "Video_Date",
        "IMDB_URL",
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ],
    header=None,
    encoding="latin-1",
)
print(df_m.shape)
df_m = df_m[
    [
        "movieID",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ]
]

df_movies_mapped = pd.read_csv(
    "data/ml-100k/i_id_mapping.csv",
    sep="\t",
    names=["movieID", "itemID"],
    header=None,
    encoding="latin-1",
)
movies = pd.merge(df_m, df_movies_mapped, how="inner", on="movieID")
movies = movies.drop(columns=["movieID"])
movies = movies.sort_values(by="itemID")
unique_genres = [
    "Action",
    "Thriller",
    "Romance",
    "Western",
    "Children's",
    "Mystery",
    "Fantasy",
    "Film-Noir",
    "Documentary",
    "Comedy",
    "Adventure",
    "Sci-Fi",
    "Horror",
    "Crime",
    "Musical",
    "War",
    "Animation",
    "Drama",
]
genre = movies[unique_genres]
item_features_numpy = genre.to_numpy()

users = pd.read_csv("data/ml-100k/u_id_mapping.csv", sep="\t")

users = users.sort_values(by="userID")

users = users.drop(columns=users.columns[0])
gender_map = {"M": 0, "F": 1}
users["Gender"] = users["Gender"].map(gender_map)
user_features_numpy = users.to_numpy()


def create_genre_column(r):
    all_genres = [g for g in unique_genres if r[g] == 1]
    return "|".join(all_genres)


movies["genres"] = movies.apply(create_genre_column, axis=1)
movies
item_ids = movies["itemID"].to_numpy()
item_ids
top_k = 10

data_load_end = time.time()
print(f"Data loaded in {data_load_end - data_load_start:.2f} seconds.")

calibration_start = time.time()

config = {"user_genre_dist_file": "data/ml-100k/pgui.csv"}
calibration = Calibration(config, movies, top_k, unique_genres, users)
reranked_reco = calibration.get_improved_reco(reco_matrix, list(item_ids), scores)

calibration_end = time.time()

/Users/tahsinalamgirkheya/Desktop/work/reranking_fairnes
Loading data...
(943, 10)
/Users/tahsinalamgirkheya/Desktop/work/reranking_fairnes yo yo 
(1682, 24)
Data loaded in 0.98 seconds.
[1]
[1, 3]
[1, 3, 5]
[1, 3, 5, 6]
[1, 3, 5, 6, 0]
[1, 3, 5, 6, 0, 9]
[1, 3, 5, 6, 0, 9, 7]
[1, 3, 5, 6, 0, 9, 7, 8]
[1, 3, 5, 6, 0, 9, 7, 8, 4]
[1, 3, 5, 6, 0, 9, 7, 8, 4, 2]
user 0 u_calibrate [1, 3, 5, 6, 0, 9, 7, 8, 4, 2]
[6]
[6, 9]
[6, 9, 0]
[6, 9, 0, 7]
[6, 9, 0, 7, 5]
[6, 9, 0, 7, 5, 1]
[6, 9, 0, 7, 5, 1, 3]
[6, 9, 0, 7, 5, 1, 3, 8]
[6, 9, 0, 7, 5, 1, 3, 8, 4]
[6, 9, 0, 7, 5, 1, 3, 8, 4, 2]
user 1 u_calibrate [6, 9, 0, 7, 5, 1, 3, 8, 4, 2]


In [36]:
users

Unnamed: 0,Gender,userID
0,0,0
1,1,1
2,0,2
3,0,3
4,0,4
...,...,...
938,1,938
939,0,939
940,1,940
941,1,941


In [56]:
category_contributions['Action']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0]

In [68]:
import gurobipy as gp
from gurobipy import GRB

# Sample Data
items = ["item1", "item2", "item3", "item4", "item5"]
items = list(range(10))
utilities = [0.9, 0.8, 0.7, 0.6, 0.5]

category_contributions = genre_proportions
user_preferences = {
    "Action": 0.0192307728929663,
    "Thriller": 0.0,
    "Romance": 0.1196581173028242,
    "Western": 0.0,
    "Children's": 0.0213675279518291,
    "Mystery": 0.0,
    "Fantasy": 0.0064102590082061,
    "Film-Noir": 0.0,
    "Documentary": 0.0256410084787781,
    "Comedy": 0.5769231040347129,
    "Adventure": 0.0277777820233898,
    "Sci-Fi": 0.0128205142848321,
    "Horror": 0.0,
    "Crime": 0.0,
    "Musical": 0.0,
    "War": 0.0149572615168326,
    "Animation": 0.0,
    "Drama": 0.1752136525056284,
}
N = 10  # Number of items to recommend

# Initialize Model
model = gp.Model("calibrated_reranking")

# Decision Variables
x = model.addVars(len(items), vtype=GRB.BINARY, name="x")

def calculate_genre_kl_div(reco):
    ret_val = []
    for cat in unique_genres:
        cat_contribution_total = gp.quicksum(category_contributions[cat][i] * x[i] for i in range(len(items)))
        ret_val.append(cat_contribution_total)
    return ret_val    


reco_dist = calculate_genre_kl_div(x)
print(reco_dist)
compare_dist = [0.08069637, 0.0985446 , 0.10065584, 0.00482467, 0.02913459,
       0.0266634 , 0.00319161, 0.00594649, 0.00477254, 0.18218506,
       0.03832242, 0.03543644, 0.0272521 , 0.03123977, 0.02006362,
       0.03325148, 0.01139311, 0.26638579]
a=0.01
compare_dist = [
    (1 - a) * compare_dist[i] + a * reco_dist[i] for i in range(len(compare_dist))
]
gender_kl = gp.quicksum(
    reco_dist[i] * math.log(reco_dist[i] / compare_dist[i])
    for i in range(len(compare_dist))
)
model.setObjective(
    (1 - 0.69) * gp.quicksum(scores[0][i] * x[i] for i in range(len(items)))
    - 0.69
    * gender_kl,
    GRB.MAXIMIZE,
)

# Total Recommendation Constraint
model.addConstr(
    gp.quicksum(x[i] for i in range(len(items))) == N, name="total_recommendations"
)

# Optimize
model.optimize()

# Display Results
if model.status == GRB.OPTIMAL:
    selected_items = [items[i] for i in range(len(items)) if x[i].x > 0.5]
    print("Selected items:", selected_items)
    print("Total Utility:", model.objVal)
else:
    print("No optimal solution found.")

[<gurobi.LinExpr: 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.5 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*>>, <gurobi.LinExpr: 0.0 <gurobi.Var *Awaiting Model Update*> + 0.25 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.5 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*>>, <gurobi.LinExpr: 0.0 <gurobi.Var *Awaiting Model Update*> + 0.0 <gurobi.Var *Awaiting Model Update*> + 0

TypeError: must be real number, not gurobipy._core.NLExpr

In [67]:
import math
actual_genre_dist = pd.read_csv(
    os.path.join(config["user_genre_dist_file"]),
    sep="\t",
)
actual_genre_dist

Unnamed: 0,userID,Action,Thriller,Romance,Western,Children's,Mystery,Fantasy,Film-Noir,Documentary,Comedy,Adventure,Sci-Fi,Horror,Crime,Musical,War,Animation,Drama,user_timestamp_sum
0,0,0.019231,0.000000,0.119658,0.000000,0.021368,0.000000,0.006410,0.000000,0.025641,0.576923,0.027778,0.012821,0.000000,0.000000,0.000000,0.014957,0.000000,0.175214,34368817581
1,1,0.159306,0.276137,0.030835,0.027148,0.041044,0.043481,0.005775,0.009065,0.000000,0.073911,0.052958,0.031625,0.010828,0.053462,0.013535,0.007219,0.015700,0.147972,81181348894
2,2,0.189453,0.066927,0.053255,0.011719,0.031250,0.007813,0.001953,0.003906,0.000000,0.354297,0.086328,0.069141,0.010937,0.019141,0.009375,0.040885,0.000000,0.043620,112497575458
3,3,0.076864,0.056048,0.093108,0.005274,0.013010,0.009494,0.002461,0.004219,0.004219,0.298312,0.027848,0.033826,0.015401,0.013502,0.015823,0.032982,0.016667,0.280942,208703517034
4,4,0.129167,0.220833,0.079167,0.000000,0.050000,0.037500,0.000000,0.000000,0.000000,0.200000,0.000000,0.041667,0.041667,0.025000,0.000000,0.041667,0.000000,0.133333,17727953698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,938,0.146258,0.110544,0.096939,0.000000,0.004082,0.006803,0.005102,0.000000,0.000000,0.197959,0.072109,0.056122,0.020408,0.023810,0.004082,0.023810,0.004082,0.227891,43132807063
939,939,0.071223,0.086930,0.094604,0.000000,0.013189,0.023381,0.000000,0.000000,0.014388,0.242806,0.029856,0.034053,0.009592,0.044365,0.009592,0.009472,0.002398,0.314149,123269730515
940,940,0.114286,0.063492,0.107672,0.000000,0.015873,0.005291,0.003968,0.000000,0.000000,0.250000,0.052116,0.048148,0.007936,0.010582,0.023810,0.046825,0.013228,0.236772,55410714285
941,941,0.066667,0.186667,0.070000,0.000000,0.000000,0.076667,0.000000,0.023333,0.000000,0.060000,0.020000,0.033333,0.043333,0.066667,0.000000,0.023333,0.000000,0.330000,22105499005


In [44]:
get_recom_distribution([1,0,1],0)

array([[0.        , 0.        , 0.        , 0.        , 0.24133913,
        0.        , 0.        , 0.        , 0.        , 0.75866087,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [18]:
values = actual_genre_dist.iloc[0, 1:19].to_list()
genres_dict = dict(zip(unique_genres, values))
genres_dict

{'Action': 0.0192307728929663,
 'Thriller': 0.0,
 'Romance': 0.1196581173028242,
 'Western': 0.0,
 "Children's": 0.0213675279518291,
 'Mystery': 0.0,
 'Fantasy': 0.0064102590082061,
 'Film-Noir': 0.0,
 'Documentary': 0.0256410084787781,
 'Comedy': 0.5769231040347129,
 'Adventure': 0.0277777820233898,
 'Sci-Fi': 0.0128205142848321,
 'Horror': 0.0,
 'Crime': 0.0,
 'Musical': 0.0,
 'War': 0.0149572615168326,
 'Animation': 0.0,
 'Drama': 0.1752136525056284}

In [53]:
def get_recom_distribution(reco, uid):
    for i in range(len(x)):
        print(x[i])
    reco = np.array(reco)
    reco = np.where(reco > 0)[0]
    df_reco = pd.DataFrame(
        {
            "userID": np.repeat(uid, len(reco)),
            "itemID": reco.flatten(),
            "rank": np.tile(np.arange(1, len(reco) + 1), 1),
        }
    )
    df_reco["weight_factor"] = 1 / (df_reco["rank"]) ** 0.1
    merged_df = pd.merge(df_reco, movies, on="itemID", how="inner")

    merged_df[unique_genres] = merged_df[unique_genres].div(
        merged_df[unique_genres].sum(axis=1), axis=0
    )
    merged_df[unique_genres] = (
        merged_df["weight_factor"].values[:, None] * merged_df[unique_genres]
    )
    summed_genre = (
        merged_df.groupby("userID")[unique_genres].sum().reset_index()
    )

    weight_total = merged_df.groupby("userID")["weight_factor"].sum()
    summed_genre["user_weight_factor"] = summed_genre["userID"].map(weight_total)

    summed_genre = summed_genre[unique_genres].div(
        summed_genre["user_weight_factor"], axis=0
    )

    return summed_genre[unique_genres].to_numpy()

In [26]:
df_movies = movies

df_movies[unique_genres] = df_movies[unique_genres].div(
    df_movies[unique_genres].sum(axis=1), axis=0
)
df_movies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,itemID,genres
240,0.0,0.0,0.0,0.0,1.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,0,Comedy
300,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.00,0.0,0.25,0.0,0.0,0.25,0.00,0.0,0.25,0.00,0.00,1,Thriller|Mystery|Film-Noir|Crime
375,0.0,0.0,0.0,0.5,0.5,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,2,Children's|Comedy
50,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.25,0.0,0.00,0.0,0.0,0.00,0.25,0.0,0.00,0.25,0.25,3,Romance|Western|War|Drama
344,0.0,0.0,0.0,0.0,0.0,0.50,0.0,0.50,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,4,Crime|Drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,0.0,0.0,0.0,0.0,0.0,0.00,0.0,1.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,1344,Drama
1192,0.0,0.0,0.0,0.0,1.0,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,1345,Comedy
1176,0.0,0.0,0.0,0.0,0.0,0.00,0.0,1.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,1346,Drama
1261,0.0,0.0,0.0,0.0,0.0,0.00,0.0,1.00,0.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,0.00,1347,Drama


In [32]:
genre_proportions = {genre: [] for genre in df_movies.columns[:-2]}
print(genre_proportions)

for _, row in df_movies[:10].iterrows():
    for genre in genre_proportions:
        genre_proportions[genre].append(row[genre])

# Display the resulting dictionary
genre_proportions

{'Action': [], 'Adventure': [], 'Animation': [], "Children's": [], 'Comedy': [], 'Crime': [], 'Documentary': [], 'Drama': [], 'Fantasy': [], 'Film-Noir': [], 'Horror': [], 'Musical': [], 'Mystery': [], 'Romance': [], 'Sci-Fi': [], 'Thriller': [], 'War': [], 'Western': []}


{'Action': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0],
 'Adventure': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3333333333333333,
  0.0,
  0.0],
 'Animation': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 "Children's": [0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3333333333333333,
  0.0,
  0.0],
 'Comedy': [1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0],
 'Crime': [0.0, 0.25, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Documentary': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Drama': [0.0, 0.0, 0.0, 0.25, 0.5, 0.0, 0.0, 0.0, 0.0, 1.0],
 'Fantasy': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Film-Noir': [0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Horror': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Musical': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0],
 'Mystery': [0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Romance': [0.0,
  0.0,
  0.0,
  0.25,
  0.0,
  0.0,
  0.0,
 

In [46]:
user_merged=pd.merge(actual_genre_dist, users, on ="userID", how ="inner")
user_merged=user_merged.groupby("Gender")[
            unique_genres
        ].mean()
user_merged=user_merged.sort_index()


In [50]:
user_merged.iloc[1].to_numpy()

array([0.08069637, 0.0985446 , 0.10065584, 0.00482467, 0.02913459,
       0.0266634 , 0.00319161, 0.00594649, 0.00477254, 0.18218506,
       0.03832242, 0.03543644, 0.0272521 , 0.03123977, 0.02006362,
       0.03325148, 0.01139311, 0.26638579])

In [41]:
x= [1,0,0,1]
y= np.array(x)
indices = np.where(y > 0)[0]
indices

array([0, 3])

In [49]:
170000/5

34000.0

In [69]:
addGenConstrLog

NameError: name 'addGenConstrLog' is not defined

In [78]:
import gurobipy as gp
from gurobipy import GRB

# Sample Data
items = ["item1", "item2", "item3", "item4", "item5"]
items = list(range(10))
utilities = [0.9, 0.8, 0.7, 0.6, 0.5]

category_contributions = genre_proportions
user_preferences = {
    "Action": 0.0192307728929663,
    "Thriller": 0.0,
    "Romance": 0.1196581173028242,
    "Western": 0.0,
    "Children's": 0.0213675279518291,
    "Mystery": 0.0,
    "Fantasy": 0.0064102590082061,
    "Film-Noir": 0.0,
    "Documentary": 0.0256410084787781,
    "Comedy": 0.5769231040347129,
    "Adventure": 0.0277777820233898,
    "Sci-Fi": 0.0128205142848321,
    "Horror": 0.0,
    "Crime": 0.0,
    "Musical": 0.0,
    "War": 0.0149572615168326,
    "Animation": 0.0,
    "Drama": 0.1752136525056284,
}
N = 10  # Number of items to recommend

# Initialize Model
model = gp.Model("calibrated_reranking")

# Decision Variables
x = model.addVars(len(items), vtype=GRB.BINARY, name="x")

def calculate_genre_kl_div(reco):
    ret_val = []
    for cat in unique_genres:
        cat_contribution_total = gp.quicksum(category_contributions[cat][i] * x[i] for i in range(len(items)))
        ret_val.append(cat_contribution_total)
    return ret_val    


reco_dist = calculate_genre_kl_div(x)
print(reco_dist)
compare_dist = [0.08069637, 0.0985446 , 0.10065584, 0.00482467, 0.02913459,
       0.0266634 , 0.00319161, 0.00594649, 0.00477254, 0.18218506,
       0.03832242, 0.03543644, 0.0272521 , 0.03123977, 0.02006362,
       0.03325148, 0.01139311, 0.26638579]
a=0.01
compare_dist = [
    (1 - a) * compare_dist[i] + a * reco_dist[i] for i in range(len(compare_dist))
]
gender_kl = gp.quicksum(
    reco_dist[i] * math.log(reco_dist[i] / compare_dist[i])
    for i in range(len(compare_dist))
)
model.setObjective(
    (1 - 0.69) * gp.quicksum(scores[0][i] * x[i] for i in range(len(items)))
    - 0.69
    * gender_kl,
    GRB.MAXIMIZE,
)

# Total Recommendation Constraint
model.addConstr(
    gp.quicksum(x[i] for i in range(len(items))) == N, name="total_recommendations"
)

# Optimize
model.optimize()

# Display Results
if model.status == GRB.OPTIMAL:
    selected_items = [items[i] for i in range(len(items)) if x[i].x > 0.5]
    print("Selected items:", selected_items)
    print("Total Utility:", model.objVal)
else:
    print("No optimal solution found.")

TypeError: 1.0 is not a variable

In [80]:
import gurobipy as gp
from gurobipy import GRB

import math

model = gp.Model()

d12 = model.addVar(name="d12")
# Make sure to have the tightest possible lower and upper bounds for 
# variables appearing in the general constraints
w1 = model.addVar(lb=1, ub=3, name="w1")
w2 = model.addVar(lb=1, ub=3, name="w2")
# u1 = ln(w1)
u1 = model.addVar(name="u1")
# u2 = ln(w2)
u2 = model.addVar(name="u2")

model.addGenConstrLog(w1, u1, name="gen0")
model.addGenConstrLog(w2, u2, name="gen1")
model.addConstr(u1 - u2 + d12 == math.log(2), name="c0")

<gurobi.Constr *Awaiting Model Update*>

In [None]:
import gurobipy as gp
import math

m = gp.Model()
x = m.addVar(name="x", ub=1)
y = m.addVar(name="y", lb=-gp.GRB.INFINITY)

# Create x-points and y-points for approximation of y = x*log(x)
xs = [0.01*i for i in range(101)]
ys = [p*math.log(p) if p != 0 else 0 for p in xs]

# Add piecewise-linear constraint
m.addGenConstrPWL(x, y, xs, ys, "pwl")

# Minimize approximation of y = x*log(x)
m.setObjective(y, gp.GRB.MINIMIZE)
m.optimize()

# ri=pi logpi

In [89]:
items

[0, 1, 2, 3, 4]

In [102]:
# Example data
scores = [10, 20, 15, 12, 18]  # Example scores for each item
p_items = [
    [0.2, 0.5, 0.3],  # Item 1 category distribution
    [0.1, 0.7, 0.2],  # Item 2 category distribution
    [0.4, 0.4, 0.2],  # Item 3 category distribution
    [0.3, 0.3, 0.4],  # Item 4 category distribution
    [0.5, 0.2, 0.3]   # Item 5 category distribution
]
items = list(range(5))

compare_dist = [0.1, 0.4, 0.5]  # Recommended category distribution


N = 4

# Create a Gurobi model
model = gp.Model("maximize_score_with_kld")

# Decision variables: xi for each item (binary)
xi = model.addVars(len(items), vtype=GRB.BINARY, name="xi")

# KLD for each item
def calculate_kld(p_item, gen_rec):
    kld = 0
    for j in range(len(p_item)):
        if p_item[j] > 0:  # Avoid log(0) errors
            kld += p_item[j] * np.log(p_item[j] / gen_rec[j])
    return kld
def calculate_genre_kl_div(reco):
 
    ret_val = []
    for cat in range(len(compare_dist)):
        cat_contribution_total = gp.quicksum(p_items[i][cat] * reco[i] for i in range(len(items)))
        ret_val.append(cat_contribution_total)
    return ret_val 

p_r_items =calculate_genre_kl_div(xi)
print(p_r_items.__len__())
kld_terms=calculate_kld(p_r_items, compare_dist) * xi[i]

# Objective function: Maximize score - KLD
objective = gp.quicksum(scores[0][i] * xi[i] for i in range(N)) - kld_terms
model.setObjective(objective, GRB.MAXIMIZE)
# Optimize the model
model.optimize()

# Output the results
if model.status == GRB.OPTIMAL:
    print("Optimal Solution:")
    for i in range(N):
        print(f"Item {i + 1} selected: {xi[i].x}")
    print(f"Optimal Objective Value: {model.objVal}")
else:
    print("No optimal solution found.")


3


NotImplementedError: 

In [83]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np

# Example data
scores = [10, 20, 15, 12, 18]  # Example scores for each item
p_items = [
    [0.2, 0.5, 0.3],  # Item 1 category distribution
    [0.1, 0.7, 0.2],  # Item 2 category distribution
    [0.4, 0.4, 0.2],  # Item 3 category distribution
    [0.3, 0.3, 0.4],  # Item 4 category distribution
    [0.5, 0.2, 0.3]   # Item 5 category distribution
]
items = list(range(5))  # Items are indexed from 0 to 4
compare_dist = [0.1, 0.4, 0.5]  # Recommended category distribution

N = len(items)

# Create a Gurobi model
model = gp.Model("maximize_score_with_kld")

# Decision variables: xi for each item (binary)
xi = model.addVars(len(items), vtype=GRB.BINARY, name="xi")

# KLD for each item
def calculate_kld(p_item, gen_rec):
    kld = 0
    for j in range(len(p_item)):
        if p_item[j] > 0:  # Avoid log(0) errors
            kld += p_item[j] * np.log(p_item[j] / gen_rec[j])
    return kld

# Calculate the "aggregated" category distribution for selected items
def calculate_reco_p_items():
    reco_p = np.zeros(len(compare_dist))  # Initialize the recommended category distribution for selected items
    for i in range(N):
        if xi[i].x > 0.5:  # If item i is selected (xi[i] is 1)
            reco_p += np.array(p_items[i])  # Add the distribution of item i to reco_p
    return reco_p / np.sum(reco_p)  # Normalize to make sure the distribution sums to 1

# Add KLD term to objective for each item
kld_terms = []
for i in range(N):
    # Calculate the KLD for the "aggregated" selected items distribution
    reco_p_items = calculate_reco_p_items()  # Recompute the recommended distribution for selected items
    kld_terms.append(calculate_kld(p_items[i], compare_dist) * xi[i])  # Add KLD of each selected item

# Objective function: Maximize score - KLD
objective = gp.quicksum(scores[i] * xi[i] for i in range(N)) - gp.quicksum(kld_terms)
model.setObjective(objective, GRB.MAXIMIZE)

# Optimize the model
model.optimize()

# Output the results
if model.status == GRB.OPTIMAL:
    print("Optimal Solution:")
    for i in range(N):
        print(f"Item {i + 1} selected: {xi[i].x}")
    print(f"Optimal Objective Value: {model.objVal}")
else:
    print("No optimal solution found.")


AttributeError: Index out of range for attribute 'X'

In [107]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np

# Example data
scores = [10, 20, 15, 12, 18]  # Example scores for each item
p_items = [
    [0.2, 0.5, 0.3],  # Item 1 category distribution
    [0.1, 0.7, 0.2],  # Item 2 category distribution
    [0.4, 0.4, 0.2],  # Item 3 category distribution
    [0.3, 0.3, 0.4],  # Item 4 category distribution
    [0.5, 0.2, 0.3]   # Item 5 category distribution
]
items = list(range(5))

compare_dist = [0.1, 0.4, 0.5]  # Recommended category distribution

N = 5

# Create a Gurobi model
model = gp.Model("maximize_score_with_kld")

# Decision variables: xi for each item (binary)
xi = model.addVars(len(items), vtype=GRB.BINARY, name="xi")
c1 = model.addVar(name="c1")
c2 = model.addVar(name="c2")
c3 = model.addVar(name="c3")



# Function to calculate KLD terms using log constraints
def calculate_kld(p_item, reco):
    kld_terms = []
    for j in range(len(p_item)):
        if reco[j] > 0:  # Avoid log(0) errors
            # Create the sum expression for the log term
            log_expr = gp.quicksum(p_item[i] * xi[i] for i in range(len(p_item))) / reco[j]
            # Use addGenConstrLog to create the constraint (log expression)
            model.addGenConstrLog(c1, log_expr, GRB.CONTINUOUS, f"log_term_{j}")
            # kld_terms.append(log_term)
    return kld_terms

# Generate the KLD contributions for the recommended distribution
def calculate_genre_kl_div(reco):
    ret_val = []
    for cat in range(len(compare_dist)):
        # Use Gurobi quicksum to handle the summation of variables
        cat_contribution_total = gp.quicksum(p_items[i][cat] * reco[i] for i in range(len(items)))
        ret_val.append(cat_contribution_total)
    return ret_val

# Calculate the genre-based KLD terms
p_r_items = calculate_genre_kl_div(xi)

# Objective function: Maximize score - KLD
objective_terms = gp.quicksum(scores[i] * xi[i] for i in range(N))

# KLD term calculation for the total KLD (note that KLD is now a sum of constraints)
kld_terms = []
for i in range(N):
    # Use the modified KLD function for each item
    kld_terms += calculate_kld(p_r_items, compare_dist)

# The final objective function (Note: KLD terms are constraints, not part of the objective)
objective = objective_terms  # We are maximizing the scores only

# Add constraints from KLD terms (these constraints will implicitly reduce the score)
model.setObjective(objective, GRB.MAXIMIZE)

# Optimize the model
model.optimize()

# Output the results
if model.status == GRB.OPTIMAL:
    print("Optimal Solution:")
    for i in range(N):
        print(f"Item {i + 1} selected: {xi[i].x}")
    print(f"Optimal Objective Value: {model.objVal}")
else:
    print("No optimal solution found.")


TypeError: 0.0 + [ 2.0 <gurobi.Var *Awaiting Model Update*> ^ 2 + <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 4.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 3.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 5.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 5.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 7.0 <gurobi.Var *Awaiting Model Update*> ^ 2 + 4.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 3.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 2.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 3.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 2.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 2.0 <gurobi.Var *Awaiting Model Update*> ^ 2 + 4.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> + 3.0 <gurobi.Var *Awaiting Model Update*> * <gurobi.Var *Awaiting Model Update*> ] is not a variable

In [108]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np

# Assume some sample data
N = 10  # Number of items
M = 3   # Number of categories (A, B, C)
scores = np.random.rand(N)  # Random item scores
category_proportions = np.random.rand(N, M)  # Random category proportions for each item
category_proportions /= category_proportions.sum(axis=1, keepdims=True)  # Normalize each row

# Constant category distribution
constant_category_dist = np.array([0.1, 0.4, 0.5])

# Create a Gurobi model
model = gp.Model()

# Define binary variables xi (1 if item i is selected, 0 otherwise)
xi = model.addVars(N, vtype=GRB.BINARY, name="xi")

# Define the objective function: maximize the score and minimize the KLD
lambda_kld = 1  # You can adjust this as necessary

# Define KLD components: KLD = sum(P(c) * log(P(c)/Q(c)))
# We need to calculate P(c) based on the selected items.
# This will be a set of constraints to define the recommended category distribution.

# For each category c, sum up the contributions from selected items
category_sums = model.addVars(M, name="category_sums")  # Sum of each category

# The constraints to define the recommended category distribution
for c in range(M):
    model.addConstr(category_sums[c] == gp.quicksum(xi[i] * category_proportions[i, c] for i in range(N)))

# Normalize the recommended category distribution
model.addConstr(gp.quicksum(category_sums[c] for c in range(M)) == 1)

# Define the KLD part of the objective
kld_terms = gp.quicksum(category_sums[c] * np.log(category_sums[c] / constant_category_dist[c]) for c in range(M))

# Define the objective function: maximize score and minimize KLD
model.setObjective(gp.quicksum(scores[i] * xi[i] for i in range(N)) - lambda_kld * kld_terms, GRB.MAXIMIZE)

# Optimize the model
model.optimize()

# Extract the results
if model.status == GRB.OPTIMAL:
    selected_items = [i for i in range(N) if xi[i].x > 0.5]
    print("Selected items:", selected_items)
    print("Objective value:", model.objVal)
else:
    print("No optimal solution found.")


TypeError: loop of ufunc does not support argument 0 of type gurobipy._core.LinExpr which has no callable log method

In [109]:
import gurobipy as gp
import math

# Define the z values (for each x)
z_values = [0.1, 0.2, 0.7]

# Define p_items (category distribution for each item)
p_items = [
    [0.2, 0.5, 0.3],  # Item 1 category distribution
    [0.1, 0.7, 0.2],  # Item 2 category distribution
    [0.4, 0.4, 0.2],  # Item 3 category distribution
    [0.3, 0.3, 0.4],  # Item 4 category distribution
    [0.5, 0.2, 0.3]   # Item 5 category distribution
]

# Number of categories (3 categories)
num_categories = 3
num_items = len(p_items)

# Create the optimization model
m = gp.Model()

# Define the binary decision variables for item selection
reco = m.addVars(num_items, vtype=gp.GRB.BINARY, name="xi")

# Define the x values (the contributions for each category)
x_vars = m.addVars(num_categories, vtype=gp.GRB.CONTINUOUS, name="x")

# Objective: Minimize y = sum(x_i * log(x_i / z_i))
objective = gp.quicksum(
    x_vars[i] * gp.log(x_vars[i] / z_values[i]) if x_vars[i] > 0 else 0
    for i in range(num_categories)
)

# Set the objective function
m.setObjective(objective, gp.GRB.MINIMIZE)

# Constraints:
# 1. The x_i values are calculated from the selected items based on p_items and reco variables
for i in range(num_categories):
    m.addConstr(x_vars[i] == gp.quicksum(p_items[j][i] * reco[j] for j in range(num_items)))

# Optionally, you can add further constraints based on your specific problem, like limiting the number of items selected.

# Optimize the model
m.optimize()

# Output results
if m.status == gp.GRB.OPTIMAL:
    print("Optimal value:", m.objVal)
    print("Selected items:")
    for i in range(num_items):
        if reco[i].x > 0.5:  # Check if the item is selected (reco[i] is binary)
            print(f"Item {i+1} selected")

    print("x values (category contributions):")
    for i in range(num_categories):
        print(f"Category {i+1}: {x_vars[i].x}")
else:
    print("No optimal solution found")


TypeError: '>' not supported between instances of 'Var' and 'int'