In [11]:
import csv
from collections import Counter
import pandas as pd
from itertools import chain, combinations
from tabulate import tabulate
import time
import os

# Define functions for JR, EJR, and other operations

def powerset(iterable, max_size):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, min(len(s)+1, max_size+1)))

def subset_of_len_l(C_modified, l):
    return set(combinations(C_modified, l))

def create_set_W(df, W):
    return set(df.head(W)['Value'])

def JR(non_winners, approval_dict, W_set, k, n):
    start_time = time.time()  # Record start time
    duration = 0  # Initialize duration

    counts = {}
    breaks_jr = False

    for non_winner in non_winners:
        counts[non_winner] = 0
        for key, value in approval_dict.items():
            if non_winner in value and not set(W_set).intersection(value):
                counts[non_winner] += 1
                if counts[non_winner] >= n / k:
                    breaks_jr = True
                    break
        if breaks_jr:
            duration = time.time() - start_time
            return 1, duration  # Return a tuple
    
    duration = time.time() - start_time
    return 0, duration  # Return a tuple


def EJR(approval_lists, winners, k, n, C,l):
    start_time = time.time()  # Record start time
    
    C_modified = [candidate for candidate in C if greedy_list.loc[greedy_list['Value'] == candidate, 'Count'].iloc[0] >= l * n / k]
    S = subset_of_len_l(C_modified, l)

    for s in S:
        count = 0
        for approval_list in approval_lists.values():
            if set(s).issubset(approval_list):
                if abs(len(set(approval_list) & set(winners))) < l:
                    count += 1
        if count >= l * n / k:
            end_time = time.time()  # Record end time
            duration = end_time - start_time  # Calculate duration
            return 0, duration
                
    end_time = time.time()  # Record end time
    duration = end_time - start_time  # Calculate duration
    return 1, duration

# Directory where input files are stored
input_directory = 'pabulib_files'

# List of file paths
file_paths = []

# Read all file paths from the directory
for root, dirs, files in os.walk(input_directory):
    for file in files:
        if file.endswith(".pb"):
            file_paths.append(os.path.join(root, file))

# Directory where result files will be saved
results_directory = 'results_pabulib'
os.makedirs(results_directory, exist_ok=True)

results = []
summary_results = []

# Iterate over each file path
for file_path in file_paths:
    with open(file_path, 'r', newline='', encoding="utf-8") as csvfile:
        meta = {}
        projects = {}
        votes = {}
        section = ""
        header = []
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if str(row[0]).strip().lower() in ["meta", "projects", "votes"]:
                section = str(row[0]).strip().lower()
                header = next(reader)
            elif section == "meta":
                meta[row[0]] = row[1].strip()
            elif section == "projects":
                projects[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    projects[row[0]][key.strip()] = row[it+1].strip()
            elif section == "votes":
                votes[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    votes[row[0]][key.strip()] = row[it+1].strip()

    # Extract required data and perform operations as before...
    
    C = set(projects.keys())
    V = set(votes.keys())
    n = len(V)
    approval_dict = {key: set(value['vote'].split(',')) for key, value in votes.items()}

    lengths = [len(value) for value in approval_dict.values()]

    # Calculate the average length
    average_length = sum(lengths) / len(lengths)

    print("Average length of values:", average_length)

    max_length = min(len(value) for value in approval_dict.values())

    # Creating a DataFrame with the projects and the number of votes they got
    all_values = set()
    for value_set in approval_dict.values():
        all_values.update(value_set)

    # Count occurrences of each value in the entire dictionary
    count_dict = Counter()
    for value_set in approval_dict.values():
        count_dict.update(value_set)

    # Create a list with unique values and their counts
    result_list = [[value, count_dict[value]] for value in all_values]

    greedy_list = pd.DataFrame(result_list, columns=['Value', 'Count'])
    greedy_list = greedy_list.sort_values(by='Count', ascending=False)

    W_range = range(1, 11)
    results_single_file = []

    # Iterate over different values of W
    for l in W_range:
        for W_value in W_range:
            W_set = create_set_W(greedy_list, W_value)
            k = len(W_set)
            non_winners = C - W_set

            # Check for JR
            result1, duration1 = JR(non_winners, approval_dict, W_set, k, n)

            # Check for EJR
            result2, duration2 = EJR(approval_dict, W_set, k, n, C, l)

            results_single_file.append((l, W_value, result1, duration1, result2, duration2))

    results_df = pd.DataFrame(results_single_file, columns=['l', 'k', 'JR', 'JR Duration', 'EJR', 'EJR Duration'])

    # Save results to a CSV file
    file_name = os.path.splitext(os.path.basename(file_path))[0] + '_results.csv'
    results_path = os.path.join(results_directory, file_name)
    results_df.to_csv(results_path, index=False)

    # Append results for each file path to the results list
    results.append(results_df)

# Concatenate results from all files into one dataframe
all_results_df = pd.concat(results)

# Calculate percentage of distinct values of column 'k' where 'jr' column is 1
jr_column_1_count = all_results_df[all_results_df['JR'] == 1].groupby('k').size().reset_index(name='jr_1_count')

# Calculate percentage of distinct values of column 'k' where 'ejr' column is 1
ejr_column_1_count = all_results_df[all_results_df['EJR'] == 1].groupby('k').size().reset_index(name='ejr_1_count')

# Merge the counts for 'jr' and 'ejr' by 'k'
merged_counts = pd.merge(jr_column_1_count, ejr_column_1_count, on='k', how='outer').fillna(0)

# Calculate the total distinct values of 'k' across all files
# total_distinct_k_values = len(set(merged_counts['k']))
total_distinct_k_values
# Calculate percentage of 'jr' and 'ejr' being 1 for each 'k'
merged_counts['jr_percentage'] = (merged_counts['jr_1_count'] / total_distinct_k_values) * 100
merged_counts['ejr_percentage'] = (merged_counts['ejr_1_count'] / total_distinct_k_values) * 100

# Display the summary
print("Summary for all files:")
print(merged_counts)


Average length of values: 5.208264462809917
Average length of values: 8.0
Average length of values: 5.113419701784405
Average length of values: 6.887288440763414
Average length of values: 10.531591792461358
Summary for all files:
   jr_1_count   k  ejr_1_count  jr_percentage  ejr_percentage
0         0.0   1           50            0.0           100.0
1         0.0   2           50            0.0           100.0
2         0.0   3           50            0.0           100.0
3         0.0   4           50            0.0           100.0
4         0.0   5           50            0.0           100.0
5         0.0   6           50            0.0           100.0
6         0.0   7           50            0.0           100.0
7         0.0   8           50            0.0           100.0
8         0.0   9           50            0.0           100.0
9         0.0  10           50            0.0           100.0


In [10]:
all_results_df.head()

Unnamed: 0,l,k,JR,JR Duration,EJR,EJR Duration
0,1,1,0,0.000998,1,0.004781
1,1,2,0,0.002992,1,0.003989
2,1,3,0,0.001996,1,0.00698
3,1,4,0,0.000998,1,0.007978
4,1,5,0,0.001995,1,0.008076


In [9]:
total_distinct_k_values

50

In [None]:
import csv
from collections import Counter
import pandas as pd
from itertools import chain, combinations
from tabulate import tabulate
import time
import os

# Define functions for JR, EJR, and other operations

def powerset(iterable, max_size):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, min(len(s)+1, max_size+1)))

def subset_of_len_l(C_modified, l):
    return set(combinations(C_modified, l))

def create_set_W(df, W):
    return set(df.head(W)['Value'])

def JR(non_winners, approval_dict, W_set, k, n):
    start_time = time.time()  # Record start time
    duration = 0  # Initialize duration

    counts = {}
    breaks_jr = False

    for non_winner in non_winners:
        counts[non_winner] = 0
        for key, value in approval_dict.items():
            if non_winner in value and not set(W_set).intersection(value):
                counts[non_winner] += 1
                if counts[non_winner] >= n / k:
                    breaks_jr = True
                    break
    if breaks_jr:
        duration = time.time() - start_time
        return 0, duration  # Return a tuple
    else:
        duration = time.time() - start_time
        return 1, duration  # Return a tuple


def EJR(approval_lists, winners, k, n, C,l):
    start_time = time.time()  # Record start time
    
    C_modified = [candidate for candidate in C if greedy_list.loc[greedy_list['Value'] == candidate, 'Count'].iloc[0] >= l * n / k]
    S = subset_of_len_l(C_modified, l)

    for s in S:
        count = 0
        for approval_list in approval_lists.values():
            if set(s).issubset(approval_list):
                if abs(len(set(approval_list) & set(winners))) < l:
                    count += 1
        if count >= l * n / k:
            end_time = time.time()  # Record end time
            duration = end_time - start_time  # Calculate duration
            return 0, duration
                
    end_time = time.time()  # Record end time
    duration = end_time - start_time  # Calculate duration
    return 1, duration

# Directory where input files are stored
input_directory = 'pabulib_files'

# List of file paths
file_paths = []

# Read all file paths from the directory
for root, dirs, files in os.walk(input_directory):
    for file in files:
        if file.endswith(".pb"):
            file_paths.append(os.path.join(root, file))

# Directory where result files will be saved
results_directory = 'results_pabulib'
os.makedirs(results_directory, exist_ok=True)

results = []
summary_results = []

# Iterate over each file path
for file_path in file_paths:
    with open(file_path, 'r', newline='', encoding="utf-8") as csvfile:
        meta = {}
        projects = {}
        votes = {}
        section = ""
        header = []
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if str(row[0]).strip().lower() in ["meta", "projects", "votes"]:
                section = str(row[0]).strip().lower()
                header = next(reader)
            elif section == "meta":
                meta[row[0]] = row[1].strip()
            elif section == "projects":
                projects[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    projects[row[0]][key.strip()] = row[it+1].strip()
            elif section == "votes":
                votes[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    votes[row[0]][key.strip()] = row[it+1].strip()

    # Extract required data and perform operations as before...
    
    C = set(projects.keys())
    V = set(votes.keys())
    n = len(V)
    approval_dict = {key: set(value['vote'].split(',')) for key, value in votes.items()}

    lengths = [len(value) for value in approval_dict.values()]

    # Calculate the average length
    average_length = sum(lengths) / len(lengths)

    #print("Average length of values:", average_length)

    max_length = min(len(value) for value in approval_dict.values())

    # Creating a DataFrame with the projects and the number of votes they got
    all_values = set()
    for value_set in approval_dict.values():
        all_values.update(value_set)

    # Count occurrences of each value in the entire dictionary
    count_dict = Counter()
    for value_set in approval_dict.values():
        count_dict.update(value_set)

    # Create a list with unique values and their counts
    result_list = [[value, count_dict[value]] for value in all_values]

    greedy_list = pd.DataFrame(result_list, columns=['Value', 'Count'])
    greedy_list = greedy_list.sort_values(by='Count', ascending=False)

    W_range = range(1, 11)
    results_single_file = []

    # Iterate over different values of W
    for l in W_range:
        for W_value in W_range:
            W_set = create_set_W(greedy_list, W_value)
            k = len(W_set)
            non_winners = C - W_set

            # Check for JR
            result1, duration1 = JR(non_winners, approval_dict, W_set, k, n)

            # Check for EJR
            result2, duration2 = EJR(approval_dict, W_set, k, n, C, l)

            results_single_file.append((l, W_value, result1, duration1, result2, duration2))

    results_df = pd.DataFrame(results_single_file, columns=['l', 'k', 'JR', 'JR Duration', 'EJR', 'EJR Duration'])

    # Save results to a CSV file
    file_name = os.path.splitext(os.path.basename(file_path))[0] + '_results.csv'
    results_path = os.path.join(results_directory, file_name)
    results_df.to_csv(results_path, index=False)

    # Append results for each file path to the results list
    results.append(results_df)

# Concatenate results from all files into one dataframe
all_results_df = pd.concat(results)

# Calculate percentage of distinct values of column 'k' where 'jr' column is 1
jr_column_1_count = all_results_df[all_results_df['JR'] == 1].groupby('k').size().reset_index(name='jr_1_count')

# Calculate percentage of distinct values of column 'k' where 'ejr' column is 1
ejr_column_1_count = all_results_df[all_results_df['EJR'] == 1].groupby('k').size().reset_index(name='ejr_1_count')

# Merge the counts for 'jr' and 'ejr' by 'k'
merged_counts = pd.merge(jr_column_1_count, ejr_column_1_count, on='k', how='outer').fillna(0)

# Calculate the total distinct values of 'k' across all files
# total_distinct_k_values = len(set(merged_counts['k']))
total_distinct_k_values = 1370
# Calculate percentage of 'jr' and 'ejr' being 1 for each 'k'
merged_counts['jr_percentage'] = (merged_counts['jr_1_count'] / total_distinct_k_values) * 100
merged_counts['ejr_percentage'] = (merged_counts['ejr_1_count'] / total_distinct_k_values) * 100

# Display the summary
print("Summary for all files:")
print(merged_counts)


In [19]:
all_results_df

Unnamed: 0,l,k,JR,JR Duration,EJR,EJR Duration
0,1,1,1,0.047889,1,0.016202
1,1,2,1,0.105674,1,0.024516
2,1,3,1,0.151017,1,0.027596
3,1,4,1,0.131291,1,0.106275
4,1,5,1,0.089854,1,0.090339
...,...,...,...,...,...,...
95,10,6,1,0.475794,1,0.070637
96,10,7,1,0.606285,1,0.100729
97,10,8,1,0.590395,1,0.106048
98,10,9,1,0.687107,1,0.100430
