In [1]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one;
# the comparison cells below rely on this to render several dataframes at once.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Fixture frames with two price columns and a constant label column for each.
# Reference table that should survive de-duplication.
df_keep = pd.DataFrame({
    'price1': [10, 20, 30],
    'price1_name': ['price range'] * 3,
    'price2': [40, 50, 60],
    'price2_name': ['weighted avg price'] * 3,
})

# Second reference table; contains the outlier value 50000.
df_keep2 = pd.DataFrame({
    'price1': [10, 20, 50000],
    'price1_name': ['price range'] * 3,
    'price2': [30, 40, 50],
    'price2_name': ['weighted avg price'] * 3,
})

# Same numbers as df_keep2, but the price2 label column is blank.
df_missing_col = pd.DataFrame({
    'price1': [10, 20, 50000],
    'price1_name': ['price range'] * 3,
    'price2': [30, 40, 50],
    'price2_name': [''] * 3,
})

# The last two rows of df_keep.
df_2rows = pd.DataFrame({
    'price1': [20, 30],
    'price1_name': ['price range'] * 2,
    'price2': [50, 60],
    'price2_name': ['weighted avg price'] * 2,
})

In [3]:
# Two-column fixture pair: df_throw is the first two rows of df_keep3.
df_keep3 = pd.DataFrame({
    'price1': [1, 2, 3],
    'price1_name': ['haha'] * 2 + ['hahaha'],
})

df_throw = pd.DataFrame({
    'price1': [1, 2],
    'price1_name': ['haha'] * 2,
})

In [4]:
# df_2rows holds the last two rows of df_keep, so it is a subset and should be thrown away
df_keep
df_2rows

# df_missing_col equals df_keep2 with a blank price2_name column, so it is a subset and should be thrown away
df_keep2
df_missing_col

# Likewise, df_throw holds the first two rows of df_keep3 and should be thrown away
df_keep3
df_throw

Unnamed: 0,price1,price1_name,price2,price2_name
0,10,price range,40,weighted avg price
1,20,price range,50,weighted avg price
2,30,price range,60,weighted avg price


Unnamed: 0,price1,price1_name,price2,price2_name
0,20,price range,50,weighted avg price
1,30,price range,60,weighted avg price


Unnamed: 0,price1,price1_name,price2,price2_name
0,10,price range,30,weighted avg price
1,20,price range,40,weighted avg price
2,50000,price range,50,weighted avg price


Unnamed: 0,price1,price1_name,price2,price2_name
0,10,price range,30,
1,20,price range,40,
2,50000,price range,50,


Unnamed: 0,price1,price1_name
0,1,haha
1,2,haha
2,3,hahaha


Unnamed: 0,price1,price1_name
0,1,haha
1,2,haha


In [5]:
# Now we want to compare all dataframes that belong to the same company
# (e.g. the keys 'ABC123' and 'ABC456'), so that in the end only df_keep,
# df_keep2 and df_keep3 remain, while retaining the original dictionary format:

# ideal_dict = {
#     'ABC123': [df_keep],
#     'ABC456': [df_keep2],
#     'DEF789': [df_keep, df_keep2],
#     'DEF101': [df_keep3]
# }

# Input fixture: file_header -> list of dataframes, including the subset frames to be removed.
test_dict = {
    'ABC123': [df_keep, df_missing_col],
    'ABC456': [df_keep2, df_2rows],
    'DEF789': [df_keep, df_keep2, df_missing_col, df_2rows],
    'DEF101': [df_keep3, df_throw]
}

In [None]:
def extract_subset_tables(dictionary):
    """
    Collect the dataframes whose cell values are all contained in another
    dataframe of the same company.

    Keys are grouped by company, where the company name is the key with every
    digit removed (e.g. 'ABC123' and 'ABC456' both belong to 'ABC'). Within a
    company, each dataframe is reduced to the set of its non-missing, non-empty
    cell values (row and column positions are ignored), and a dataframe is
    reported when that set is a subset of another dataframe's set.

    Notes:
    - Entries that are not dataframes are skipped.
    - Two dataframes with identical value sets are subsets of each other, so
      both are reported.
    - The input dictionary and its lists are not modified.

    dictionary: dictionary with file_header as keys and list of dataframes as values

    Returns a list of the subset dataframes (the same objects held by
    `dictionary`), ordered by company in order of first appearance and then by
    position within the company.
    """
    import re
    from itertools import permutations

    # Group on the exact digit-stripped name; a prefix test would let company
    # 'AB' swallow the tables of company 'ABC'.
    tables_by_company = {}
    for file_header, tables in dictionary.items():
        company = re.sub('[0-9]', '', file_header)
        tables_by_company.setdefault(company, []).extend(tables)

    subset_df_list = []
    for tables_within_a_company in tables_by_company.values():
        # Flatten each dataframe once instead of once per compared pair.
        value_sets = {
            index: {x for x in table.to_numpy().flatten() if pd.notna(x) and x != ''}
            for index, table in enumerate(tables_within_a_company)
            if isinstance(table, pd.DataFrame)
        }
        # Ordered pairs: "a is a subset of b" is not symmetric.
        subset_index = {
            df1_index
            for df1_index, df2_index in permutations(value_sets, 2)
            if value_sets[df1_index].issubset(value_sets[df2_index])
        }
        subset_df_list.extend(tables_within_a_company[index] for index in sorted(subset_index))
    return subset_df_list

In [None]:
# List the dataframes flagged as subsets; this call only reads test_dict.
extract_subset_tables(test_dict)

In [None]:
def remove_subset_tables(dictionary, list_subset_tables):
    """
    Delete, in place, every dataframe in `dictionary` that equals one of the
    dataframes in `list_subset_tables`.

    dictionary: dictionary with file_header as keys and list of dataframes as values
    list_subset_tables: list of dataframes to remove wherever an equal one is found

    Returns a defaultdict(list) mapping each file_header that lost at least one
    dataframe to the list positions that were deleted, highest position first.
    Entries that are not dataframes are never removed.
    """
    from collections import defaultdict

    removed = defaultdict(list)
    for file_header, tables in dictionary.items():
        # A set, so a position matching several listed frames is recorded once.
        doomed = {
            position
            for position, table in enumerate(tables)
            if isinstance(table, pd.DataFrame)
            and any(table.equals(candidate) for candidate in list_subset_tables)
        }
        # Delete from the back so the remaining positions stay valid.
        for position in sorted(doomed, reverse=True):
            del tables[position]
            removed[file_header].append(position)
    return removed

In [None]:
# Remove the subset dataframes from test_dict in place and show, per key,
# the list positions of the dataframes that were thrown away
remove_subset_tables(test_dict, extract_subset_tables(test_dict))

In [None]:
# Remaining dataframes after the in-place removal above, which are what we wanted
test_dict

In [10]:
def delete_subset(dictionary):
    '''
    This function removes, in place, any subset dataframe within the same key,
    and outputs a dictionary indicating the affected keys and the respective
    deleted indices.

    Each dataframe is reduced to the multiset of its non-missing, non-empty
    cell values (row and column positions are ignored). A dataframe is a
    subset of another one under the same key when every one of its values
    occurs at least as many times in the other dataframe.

    Notes:
    - Entries that are not dataframes are ignored and never deleted.
    - When several dataframes under a key have identical value multisets, the
      first one is kept and only the later copies are deleted.
    - A dataframe with no usable cell values is a subset of any other
      dataframe under the same key and is deleted.

    dictionary: dictionary with file_header as keys and list of dataframes as values

    Returns a defaultdict(list) mapping each key that lost at least one
    dataframe to the deleted list positions, highest position first.
    '''
    from collections import defaultdict, Counter
    import pandas as pd

    def _cell_counts(df):
        # Multiset of the frame's non-missing, non-empty cell values.
        return Counter(x for x in df.to_numpy().flatten() if pd.notna(x) and x != '')

    deleted_df = defaultdict(list) # Initialize a dictionary with list as values to house the indices
    for k, target_list in dictionary.items():
        # Flatten every dataframe once instead of once per compared pair.
        cell_counts = {
            index: _cell_counts(table)
            for index, table in enumerate(target_list)
            if isinstance(table, pd.DataFrame)
        }
        subset_index = set()
        for df1_index, df1_count in cell_counts.items():
            for df2_index, df2_count in cell_counts.items():
                if df1_index == df2_index:
                    continue
                if df1_count == df2_count:
                    # Identical content: only a later copy counts as the subset,
                    # otherwise every copy would be deleted.
                    is_subset = df2_index < df1_index
                else:
                    # Decide only after ALL values have been checked.
                    is_subset = all(
                        count <= df2_count[value] for value, count in df1_count.items()
                    )
                if is_subset:
                    subset_index.add(df1_index)
                    break
        # We want to remove the dataframes backward to not mess up with the indices
        for index in sorted(subset_index, reverse=True):
            del target_list[index]
            deleted_df[k].append(index) # Save the deleted index under its file_header
    return deleted_df

In [11]:
# Remove subset dataframes from test_dict in place (compared within each key only)
# and show which list positions were dropped per key.
delete_subset(test_dict)

defaultdict(list, {'DEF789': [3, 2], 'DEF101': [1]})