*Data management course*

In [58]:
from typing import List
import math
import heapq
import random


In [59]:
def offer(heap: List[int], x:int, max_size):
    if max_size == 0:
        return
    if len(heap) < max_size:
        heapq.heappush(heap, x)
    else:
        min_item = heap[0]
        if x > min_item:
            heapq.heappop(heap)
            heapq.heappush(heap, x)

In [60]:
class Item:
    def __init__(self, name: str, category: int, score: float):
        self.name = name
        self.category= category
        self.score = score

    def __str__(self):
        return f"name={self.name}, category={self.category}, score={self.score}"

    def __lt__(self, other):
        return self.score < other.score



In [61]:
def algo1(I: List[Item], K:int, d:int, floors:List[int], ceilings: List[int]) -> List[Item]:
    """
    Diverse top-k selection from a sorted list.
    Args:
        I:          List of items sorted by score
        K:          Number of items to select
        d:          Number of categories
        floors:     Constraints floor_i ≤ k_i for each i ∈ [1. . .d].
        ceilings:   Constraints k_i ≤ ceil_i for each i ∈ [1. . .d].

    Returns:
        L top K  chosen items from list I.
    """
    L = []
    C = [0] * d
    slack = K - sum(floors)
    iterator = iter(I)
    while len(L) < K:
        x = next(iterator)
        i = x.category
        if C[i] < floors[i]:
            L.append(x)
            C[i] += 1
        elif (C[i] < ceilings[i]) and (slack > 0):
            L.append(x)
            C[i] +=1
            slack -= 1
    return L


In [73]:
def algo2(I: List[Item], K:int, d:int, floors:List[int], ceilings: List[int], items_per_category: List[int]) -> List[Item]:
    """
    Diverse top-k selection from a sorted list.
    Args:
        I:                  Stream of items.
        K:                  Number of items to select
        d:                  Number of categories
        floors:             Constraints floor_i ≤ k_i for each i ∈ [1. . .d].
        ceilings:           Constraints k_i ≤ ceil_i for each i ∈ [1. . .d].
        items_per_category: n_i for i ∈[1 . . .d].

    Returns:
        L top K  chosen items from list I.
    """
    num_feasible_items = lambda : sum(items_per_category[i] - M[i] for i in range(len(items_per_category)))

    N = len(I)
    L = []
    C = [0] * d
    M = [0] * d
    R = [math.floor(n / math.e) for n in items_per_category]
    T_i = [[] for _ in floors]
    slack = K - sum(floors)
    r = math.floor(N / math.e)
    T = []
    iterator = iter(I)
    while len(L) < K:
        x = next(iterator)
        i = x.category
        sum_m = sum(M)
        if sum_m < r:
            # heapq.heappush(T, x)
            offer(T, x, slack)
        if M[i] < R[i]:
            # print(f"Pushing {x} to T_i[{i}]")
            # heapq.heappush(T_i[i], x)
            offer(T_i[i], x, floors[i])
        elif ((C[i] < floors[i]) and (x.score > T_i[i][0].score)) or items_per_category[i] - M[i] == floors[i] - C[i]:
            heapq.heappop(T_i[i])
            L.append(x)
            C[i] += 1
        elif (sum_m >= r) and (x.score > T[0].score and (C[i] < ceilings[i]) and (slack > 0)):
            heapq.heappop(T)
            L.append(x)
            C[i] += 1
            slack -= 1
        elif (C[i] < ceilings[i]) and num_feasible_items() == K - len(L):
            L.append(x)
            C[i] += 1
            slack -= 1
        M[i] += 1
    return L


In [63]:
def algo3(I: List[Item], K:int, d:int, floors:List[int], ceilings: List[int], items_per_category: List[int]) -> List[Item]:
    """
    Diverse top-k selection from a sorted list.
    Args:
        I:                  Stream of items.
        K:                  Number of items to select
        d:                  Number of categories
        floors:             Constraints floor_i ≤ k_i for each i ∈ [1. . .d].
        ceilings:           Constraints k_i ≤ ceil_i for each i ∈ [1. . .d].
        items_per_category: n_i for i ∈[1 . . .d].

    Returns:
        L top K  chosen items from list I.
    """
    D = [[] for _ in ceilings]
    C = [0] * d
    M = [0] * d
    R = [math.floor(n / math.e) for n in items_per_category]
    T_i = [[] for _ in floors]
    u = d - sum(1 for floor in floors if floor == 0)
    w = 0
    iterator = iter(I)
    while u > 0 or w < K:
        x = next(iterator)
        i = x.category
        if M[i] < R[i]:
            # heapq.heappush(T_i[i], x)
            offer(T_i[i], x, floors[i])
        elif (C[i] < ceilings[i]) and (x.score > T_i[i][0].score):
            C[i] += 1
            heapq.heappop(T_i[i])
            if (floors[i] > 0) and C[i] == floors[i]:
                u -= 1
        # heapq.heappush(D[i], x)
        offer(D[i], x, ceilings[i])
        M[i] += 1
        w = sum(len(d_i) for d_i in D)
    W = []
    for d_i in D:
        for item in d_i:
            offer(W, item, w)
    W.sort(key= lambda x: x.score, reverse=True)
    L = algo1(W, K, d, floors, ceilings)
    return L

In [64]:
import pandas as pd
df = pd.read_csv("2024 Billionaire List.csv")
df['2024 Net Worth'] = (
    df['2024 Net Worth']
    .str.replace('$', '', regex=False)  # Remove the dollar sign
    .str.replace('B', '', regex=False)  # Remove the "B"
    .astype(float)          # Convert billions to integers
)
print(df.head())

                       Name   Age  2024 Net Worth          Industry  \
0  Bernard Arnault & family  75.0           233.0  Fashion & Retail   
1                 Elon Musk  52.0           195.0        Automotive   
2                Jeff Bezos  60.0           194.0        Technology   
3           Mark Zuckerberg  39.0           177.0        Technology   
4             Larry Ellison  79.0           141.0        Technology   

  Source of Wealth                 Title                      Organization  \
0             LVMH      Chairman and CEO  LVMH Moët Hennessy Louis Vuitton   
1    Tesla, SpaceX                   CEO                             Tesla   
2           Amazon  Chairman and Founder                            Amazon   
3         Facebook            Co-founder                    Meta Platforms   
4           Oracle       CTO and Founder                            Oracle   

   Self-Made  Self-Made Score  Philanthropy Score              Residence  \
0      False              Na

In [65]:
items = []
gender = {'M': 0, 'F': 1}
for index, row in df.iterrows():
    items.append(Item(row['Name'], gender[row["Gender"]], row['2024 Net Worth']))

for item in items:
    print(item)
# print(items)

name=Bernard Arnault & family, category=0, score=233.0
name=Elon Musk, category=0, score=195.0
name=Jeff Bezos, category=0, score=194.0
name=Mark Zuckerberg, category=0, score=177.0
name=Larry Ellison, category=0, score=141.0
name=Warren Buffett, category=0, score=133.0
name=Bill Gates, category=0, score=128.0
name=Steve Ballmer, category=0, score=121.0
name=Mukesh Ambani, category=0, score=116.0
name=Larry Page, category=0, score=114.0
name=Sergey Brin, category=0, score=110.0
name=Michael Bloomberg, category=0, score=106.0
name=Amancio Ortega, category=0, score=103.0
name=Carlos Slim Helu & family, category=0, score=102.0
name=Francoise Bettencourt Meyers & family, category=1, score=99.5
name=Michael Dell, category=0, score=91.0
name=Gautam Adani, category=0, score=84.0
name=Jim Walton & family, category=0, score=78.4
name=Rob Walton & family, category=0, score=77.4
name=Jensen Huang, category=0, score=77.0
name=Alice Walton, category=1, score=72.3
name=David Thomson & family, catego

In [66]:
L = algo1(items, 10, len(gender), [3,3], [6,6])
for item in L:
    print(item)
print(f"Utility = {sum(item.score for item in L)}")

name=Bernard Arnault & family, category=0, score=233.0
name=Elon Musk, category=0, score=195.0
name=Jeff Bezos, category=0, score=194.0
name=Mark Zuckerberg, category=0, score=177.0
name=Larry Ellison, category=0, score=141.0
name=Warren Buffett, category=0, score=133.0
name=Francoise Bettencourt Meyers & family, category=1, score=99.5
name=Alice Walton, category=1, score=72.3
name=Julia Koch & family, category=1, score=64.3
name=Jacqueline Mars, category=1, score=38.5
Utility = 1347.6


In [76]:
random.shuffle(items) # Shuffle the list
gender_counts = df['Gender'].value_counts()

L = algo2(items[:10], 3, len(gender), [1,1], [6,6], [gender_counts["M"], gender_counts["F"]])
for item in sorted(L, key= lambda item: item.score, reverse=True):
    print(item)

TypeError: list indices must be integers or slices, not str

In [56]:
random.shuffle(items) # Shuffle the list
L = algo3(items, 10, len(gender), [3,3], [6,6], [gender_counts["M"], gender_counts["F"]])
for item in sorted(L, key= lambda item: item.score, reverse=True):
    print(item)

print(f"Utility = {sum(item.score for item in L)}")


IndexError: list index out of range