In [59]:
import csv
from collections import Counter
import pandas as pd
from itertools import chain, combinations
from tabulate import tabulate
import time
import os

def powerset(iterable, max_size):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(1, min(len(s)+1, max_size+1)))

def subset_of_len_l(C_modified, l):
    return set(combinations(C_modified, l))

def create_set_W(df, W):
    return set(df.head(W)['Value'])

def JR(non_winners, approval_dict, W_set, k, n):
    start_time = time.time()  # Record start time
    duration = 0  # Initialize duration
    counts = {}
    breaks_jr = False

    for non_winner in non_winners:
        counts[non_winner] = 0
        for key, value in approval_dict.items():
            if non_winner in value and not set(W_set).intersection(value):
                counts[non_winner] += 1
#                 print(f"Counts for {non_winner}: {counts[non_winner]}")
                if counts[non_winner] >= n / k:
                    breaks_jr = True
                    break
    if breaks_jr:
        duration = time.time() - start_time
#         print("no JR")
        return 0, duration  # Return a tuple
    else:
        duration = time.time() - start_time
#         print("yes JR")
        return 1, duration  # Return a tuple


def EJR(approval_lists, winners, k, n, C,l):
    start_time = time.time()  # Record start time
    
#     print("C:",C)
#     print("l:",l)
#     print("n:",n)
#     print("k:",k)
#     print("ln/k:",(l*n)/k)
    C_modified = [candidate for candidate in C if greedy_list.loc[greedy_list['Value'] == candidate, 'Count'].iloc[0] >= l * n / k]
#     print("C_modified:",C_modified)
    
    S = subset_of_len_l(C_modified, l)

    for s in S:
        count = 0
        for approval_list in approval_lists.values():
            if set(s).issubset(approval_list):
                if abs(len(set(approval_list) & set(winners))) < l:
                    count += 1
        if count >= l * n / k:
            end_time = time.time()  # Record end time
            duration = end_time - start_time  # Calculate duration
#             print("no EJR")
#             print()
            return 0, duration
                
    end_time = time.time()  # Record end time
    duration = end_time - start_time  # Calculate duration
#     print("yes EJR")
#     print()
    return 1, duration

input_directory = 'EJR_Debugging_PbFiles'
file_paths = []
for root, dirs, files in os.walk(input_directory):
    for file in files:
        if file.endswith(".pb"):
            file_paths.append(os.path.join(root, file))

results_directory = 'EJR_Debugging_PbFiles_Results'
os.makedirs(results_directory, exist_ok=True)

results = []
summary_results = []
all_results_jr=[]
# Iterate over each file path
for file_path in file_paths:
    with open(file_path, 'r', newline='', encoding="utf-8") as csvfile:
        meta = {}
        projects = {}
        votes = {}
        section = ""
        header = []
        reader = csv.reader(csvfile, delimiter=';')
        for row in reader:
            if str(row[0]).strip().lower() in ["meta", "projects", "votes"]:
                section = str(row[0]).strip().lower()
                header = next(reader)
            elif section == "meta":
                meta[row[0]] = row[1].strip()
            elif section == "projects":
                projects[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    projects[row[0]][key.strip()] = row[it+1].strip()
            elif section == "votes":
                votes[row[0]] = {}
                for it, key in enumerate(header[1:]):
                    votes[row[0]][key.strip()] = row[it+1].strip()

    
    C = set(projects.keys())
    V = set(votes.keys())
    n = len(V)
    approval_dict = {key: set(value['vote'].split(',')) for key, value in votes.items()}

    lengths = [len(value) for value in approval_dict.values()]

    # Calculate the average length
    average_length = sum(lengths) / len(lengths)

    #print("Average length of values:", average_length)

    max_length = min(len(value) for value in approval_dict.values())

    # Creating a DataFrame with the projects and the number of votes they got
    all_values = set()
    for value_set in approval_dict.values():
        all_values.update(value_set)

    # Count occurrences of each value in the entire dictionary
    count_dict = Counter()
    for value_set in approval_dict.values():
        count_dict.update(value_set)

    # Create a list with unique values and their counts
    result_list = [[value, count_dict[value]] for value in all_values]

    greedy_list = pd.DataFrame(result_list, columns=['Value', 'Count'])
    greedy_list = greedy_list.sort_values(by='Count', ascending=False)
#     print(greedy_list)
    
    W_range = range(1, 11)
    results_single_file = []

    # Iterate over different values of W
    results_df_ejr = pd.DataFrame(columns=['l', 'k', 'JR', 'JR Duration', 'EJR', 'EJR Duration'])
    results_jr = []

    for W_value in W_range:
        W_set = create_set_W(greedy_list, W_value) # where should W_set be initiated? Here or in l for loop as well? for different W_values?
        k = len(W_set)
        non_winners = C - W_set
#         print("k = ", W_value)
        # Check for JR
        result1, duration1 = JR(non_winners, approval_dict, W_set, k, n)
        results_jr.append((W_value, result1))
#         print("Result1: ", result1)
#         print("Duration1: ", duration1)

        for l in range(1, W_value+1):
#             print("l = ", l)
            # Check for EJR
            result2, duration2 = EJR(approval_dict, W_set, k, n, C, l)
#             print("Result2: ", result2)
#             print("Duration2: ", duration2)

            results.append((l, W_value,result2, duration2))
  
    all_results_jr.extend(results_jr)
    results_df_ejr = pd.DataFrame(results, columns=['l','k', 'EJR', 'EJR Duration'])


    # Save results to a CSV file
    file_name = os.path.splitext(os.path.basename(file_path))[0] + '_results.csv'
    results_path = os.path.join(results_directory, file_name)
    results_df_ejr.to_csv(results_path, index=False)

    # Append results for each file path to the results list
results.append(results_df_ejr)
results_df_jr = pd.DataFrame(all_results_jr, columns=['W', 'JR'])
# # Concatenate results from all files into one dataframe
# all_results_df = pd.concat(results)

# # Calculate percentage of distinct values of column 'k' where 'jr' column is 1
# jr_column_1_count = all_results_df[all_results_df['JR'] == 1].groupby('k').size().reset_index(name='jr_1_count')

# # Calculate percentage of distinct values of column 'k' where 'ejr' column is 1
# ejr_column_1_count = all_results_df[all_results_df['EJR'] == 1].groupby('k').size().reset_index(name='ejr_1_count')

# # Merge the counts for 'jr' and 'ejr' by 'k'
# merged_counts = pd.merge(jr_column_1_count, ejr_column_1_count, on='k', how='outer').fillna(0)

# # Calculate the total distinct values of 'k' across all files
# # total_distinct_k_values = len(set(merged_counts['k']))
# total_distinct_k_values = len(file_paths) * 10
# # Calculate percentage of 'jr' and 'ejr' being 1 for each 'k'
# merged_counts['jr_percentage'] = (merged_counts['jr_1_count'] / total_distinct_k_values) * 100
# merged_counts['ejr_percentage'] = (merged_counts['ejr_1_count'] / total_distinct_k_values) * 100

# # Display the summary
# print("Summary for all files:")
# print(merged_counts)


In [60]:
results_df_ejr

Unnamed: 0,l,k,EJR,EJR Duration
0,1,1,1,0.010932
1,1,2,1,0.018585
2,2,2,1,0.009943
3,1,3,1,0.019206
4,2,3,1,0.011351
5,3,3,1,0.010961
6,1,4,1,0.028912
7,2,4,1,0.009815
8,3,4,1,0.009973
9,4,4,1,0.01097


In [63]:
# Initialize an empty DataFrame to store the new data
results_df_ejr_updated = pd.DataFrame(columns=['k', 'EJR'])

# Iterate over values of k from 1 to 10
for k_value in range(1, 11):
    # Check if any row has an EJR value of 0 for this k value
    ejr_value = 0 if 0 in results_df_ejr[results_df_ejr['k'] == k_value]['EJR'].values else 1
    
    # Append the k value and EJR status to the new DataFrame
    results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)

print(results_df_ejr_updated)


    k EJR
0   1   1
1   2   1
2   3   1
3   4   1
4   5   1
5   6   1
6   7   1
7   8   0
8   9   0
9  10   1


  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_df_ejr_updated = results_df_ejr_updated.append({'k': k_value, 'EJR': ejr_value}, ignore_index=True)
  results_

In [61]:
results_df_jr

Unnamed: 0,W,JR
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,0
9,10,1
