In [1]:
from FDMBuilder.FDM_helpers import *
from thefuzz import process, fuzz
import collections

In [2]:
class TableMatcher:
    """Pair up the columns of two tables by fuzzy name matching and UNION them.

    On construction the column names of both tables are read (via
    ``get_table_schema_dict``), a shared prefix or suffix is stripped from
    each table's names, and the stripped names are fuzzy-matched with
    ``thefuzz``. The result is kept in ``self.colname_matches``, a DataFrame
    with one row per output column and the columns ``orig_name_a``,
    ``name_a``, ``orig_name_b``, ``name_b`` and ``match_score``. A missing
    value in ``name_a`` / ``name_b`` means the column only exists in the
    other table.
    """

    # Minimum fuzzy score for two column names to be treated as the same column.
    MIN_MATCH_SCORE = 80
    # Number of candidate table-A names examined for every table-B name.
    MATCH_CANDIDATE_LIMIT = 5

    def __init__(self, table_a_id, table_b_id, dataset_id,
                 suffix_prop=0.95, new_table_name=None):
        """Store the configuration and build the column-match table.

        Args:
            table_a_id: Full id of the first table.
            table_b_id: Full id of the second table.
            dataset_id: Dataset that receives the combined table.
            suffix_prop: Proportion of names that must share a character at a
                given position for it to count as part of a common
                prefix/suffix.
            new_table_name: Name of the combined table; when None the name is
                derived from the two source table names.
        """
        self.table_a_id = table_a_id
        self.table_b_id = table_b_id
        self.dataset_id = dataset_id
        # Keep the value the caller actually passed (was hard-coded to 0.95).
        self.suffix_prop = suffix_prop
        self.new_table_name = new_table_name
        self.build_matched_colnames_df(suffix_prop)

    def build_matched_colnames_df(self, suffix_prop):
        """Build ``self.colname_matches`` from the two table schemas.

        Args:
            suffix_prop: Threshold forwarded to ``_remove_prefix_suffix``.
        """
        orig_names_a = list(get_table_schema_dict(self.table_a_id).keys())
        wo_prefix_names_a = self._remove_prefix_suffix(orig_names_a,
                                                       prop=suffix_prop)
        orig_names_b = list(get_table_schema_dict(self.table_b_id).keys())
        wo_prefix_names_b = self._remove_prefix_suffix(orig_names_b,
                                                       prop=suffix_prop)
        # Start with every column unmatched: A-only rows first, then B-only rows.
        matches_df = pd.DataFrame(dict(
            orig_name_a=orig_names_a + ([None] * len(orig_names_b)),
            name_a=wo_prefix_names_a + ([None] * len(orig_names_b)),
            orig_name_b=([None] * len(orig_names_a)) + orig_names_b,
            name_b=([None] * len(orig_names_a)) + wo_prefix_names_b,
            match_score=0
        ))

        for name_to_match, orig_name_to_match in zip(wo_prefix_names_b,
                                                     orig_names_b):
            matches = process.extract(name_to_match, wo_prefix_names_a,
                                      limit=self.MATCH_CANDIDATE_LIMIT)
            for match, score in matches:
                if score < self.MIN_MATCH_SCORE:
                    continue
                match_mask = matches_df.name_a == match
                existing_score = matches_df.loc[match_mask,
                                                "match_score"].values[0]
                # A table-A column keeps only its best-scoring table-B partner.
                if score > existing_score:
                    matches_df.loc[match_mask, "name_b"] = name_to_match
                    matches_df.loc[match_mask, "orig_name_b"] = orig_name_to_match
                    matches_df.loc[match_mask, "match_score"] = score

        # Best matches first, so duplicated() flags the weaker uses of a B name.
        matches_df = matches_df.sort_values("match_score", ascending=False)
        duplicated_name_b_mask = matches_df.name_b.duplicated()
        set_to_none_mask = matches_df.name_a.notna() & duplicated_name_b_mask
        set_to_none_cols = ["orig_name_b", "name_b", "match_score"]
        matches_df.loc[set_to_none_mask, set_to_none_cols] = None, None, 0
        # Drop the B-only placeholder rows of names that are now matched, while
        # keeping every row whose name_b is missing (A-only columns).
        duplicated_name_b_mask = matches_df.name_b.duplicated()
        name_b_none_mask = matches_df.name_b.isna()
        matches_df = matches_df[~duplicated_name_b_mask | name_b_none_mask]

        # Non-inplace sort: sorting a filtered slice in place triggers
        # SettingWithCopyWarning.
        self.colname_matches = matches_df.sort_values(["name_a", "name_b"])

    def get_unmatched_names(self):
        """Return the ``name_a`` / ``name_b`` columns of all unmatched rows."""
        name_a_null_mask = self.colname_matches.name_a.isna()
        name_b_null_mask = self.colname_matches.name_b.isna()
        unmatched_mask = name_a_null_mask | name_b_null_mask
        return self.colname_matches.loc[unmatched_mask, ["name_a", "name_b"]]

    def _build_union_sql(self):
        """Return the UNION ALL query implied by ``self.colname_matches``."""
        table_a_select_strings = []
        table_b_select_strings = []
        for row in self.colname_matches.itertuples(index=False):
            # pd.isna treats None and NaN alike, whichever pandas stored.
            a_missing = pd.isna(row.name_a)
            b_missing = pd.isna(row.name_b)

            if a_missing:
                table_a_select_strings.append("NULL AS " + row.name_b)
            elif b_missing or row.match_score == 100:
                table_a_select_strings.append(
                    row.orig_name_a + " AS " + row.name_a)
            else:
                # Partial or manual match: adopt table B's name for the column.
                table_a_select_strings.append(
                    row.orig_name_a + " AS " + row.name_b)

            if b_missing:
                table_b_select_strings.append("NULL AS " + row.name_a)
            else:
                table_b_select_strings.append(
                    row.orig_name_b + " AS " + row.name_b)

        table_a_select_string = ", ".join(table_a_select_strings)
        table_b_select_string = ", ".join(table_b_select_strings)
        return f"""
            SELECT {table_a_select_string}
            FROM `{self.table_a_id}`
            UNION ALL
            SELECT {table_b_select_string}
            FROM `{self.table_b_id}`
        """

    def match_tables(self):
        """Run the UNION ALL query and write it to the destination table.

        The destination is ``PROJECT.dataset_id.new_table_name``; when no
        name was given, the two source table names joined by ``-`` are used.
        """
        sql = self._build_union_sql()
        table_a_alias = self.table_a_id.split(".")[-1]
        table_b_alias = self.table_b_id.split(".")[-1]
        if self.new_table_name is None:
            new_table_name = table_a_alias + "-" + table_b_alias
        else:
            new_table_name = self.new_table_name
        destination_table_id = ".".join([PROJECT,
                                         self.dataset_id,
                                         new_table_name])
        run_sql_query(sql, destination=destination_table_id)

    def add_match(self, name_a, name_b):
        """Manually pair an unmatched table-A column with a table-B column.

        Args:
            name_a: Stripped name of an unmatched table-A column.
            name_b: Stripped name of an unmatched table-B column.

        Raises:
            ValueError: If either name is unknown or already matched.
        """
        matches = self.colname_matches
        match_a_mask = matches.name_a == name_a
        match_b_mask = matches.name_b == name_b
        if not match_a_mask.any():
            raise ValueError(f"No column named {name_a}")
        if not match_b_mask.any():
            raise ValueError(f"No column named {name_b}")
        if pd.notna(matches.loc[match_a_mask, "name_b"].iloc[0]):
            raise ValueError(f"Already a match for {name_a}")
        if pd.notna(matches.loc[match_b_mask, "name_a"].iloc[0]):
            raise ValueError(f"Already a match for {name_b}")

        # Only look the value up after validation, so a bad name raises the
        # ValueError above rather than an IndexError.
        orig_name_b = matches.loc[match_b_mask, "orig_name_b"].iloc[0]
        matches.loc[match_a_mask, "name_b"] = name_b
        matches.loc[match_a_mask, "orig_name_b"] = orig_name_b
        # Score 1 marks a manual match (non-zero, but below any fuzzy score).
        matches.loc[match_a_mask, "match_score"] = 1
        # match_b_mask was computed before the update, so it only selects the
        # now-redundant B-only placeholder row.
        self.colname_matches = matches[~match_b_mask].copy()

    def _find_prefix_suffix(self, names, prop):
        """Find the prefix, or failing that the suffix, shared by most names.

        Walks forward from the first character while more than ``prop`` of
        the names that are long enough agree on it. If even the first
        character is not shared, walks backward from the last character.

        Args:
            names: List of column-name strings.
            prop: Proportion of names that must agree on a character.

        Returns:
            Tuple ``(chars, is_prefix)``; ``is_prefix`` is True for a prefix,
            False for a suffix and None when nothing common was found.
        """
        return_chars = []
        loop_idx = 0
        is_prefix = None
        while True:
            char_counts = collections.Counter(
                name[loop_idx] for name in names if len(name) > abs(loop_idx)
            )
            if not char_counts:
                # No name is long enough for this position (or names is
                # empty); this used to raise IndexError.
                break
            most_common_char, n_appears = char_counts.most_common(1)[0]
            prop_same = n_appears / sum(char_counts.values())
            if prop_same > prop:
                if loop_idx < 0:
                    return_chars.insert(0, most_common_char)
                    is_prefix = False
                    loop_idx -= 1
                else:
                    return_chars.append(most_common_char)
                    is_prefix = True
                    loop_idx += 1
            elif loop_idx == 0:
                # No common first character: look for a suffix instead.
                loop_idx = -1
            else:
                break
        return "".join(return_chars), is_prefix

    def _remove_prefix_suffix(self, names, prop=0.95):
        """Strip the common prefix or suffix from every name that carries it.

        Args:
            names: List of column-name strings.
            prop: Threshold forwarded to ``_find_prefix_suffix``.

        Returns:
            List of names with the affix removed; names that do not start
            (or end) with it are returned unchanged.
        """
        chars_to_remove, is_prefix = self._find_prefix_suffix(names, prop)
        if is_prefix:
            # Must be startswith: a substring test would also truncate names
            # that merely contain the prefix somewhere else.
            return [name[len(chars_to_remove):]
                    if name.startswith(chars_to_remove) else name
                    for name in names]
        if is_prefix is False:
            return [name[:-len(chars_to_remove)]
                    if name.endswith(chars_to_remove) else name
                    for name in names]
        return names


In [113]:
# List every table in the staging dataset and keep those whose id starts
# with "SUS".
staging_dataset = CLIENT.get_dataset("CY_STAGING_DATABASE")
staging_tables = list(CLIENT.list_tables(staging_dataset))

sus_table_ids = [
    tbl.table_id
    for tbl in staging_tables
    if tbl.table_id.startswith("SUS")
]

sus_table_ids

['SUS_Airedale_AE_20190201_to_20220131',
 'SUS_Airedale_AE_20210401_to_20220131',
 'SUS_Airedale_APC_010415_to_310119',
 'SUS_Airedale_APC_20190201_to_20200630',
 'SUS_Airedale_APC_20200701_to_20220131',
 'SUS_Airedale_AdmittedPatentCare_010415_to_310119',
 'SUS_Airedale_ECDS_010415_to_310119',
 'SUS_Airedale_OP_20190201_to_20200630',
 'SUS_Airedale_OP_20200701_to_2022013',
 'SUS_Airedale_OP_20210401_to_20220131',
 'SUS_Airedale_OutPatients_010415_to_310119',
 'SUS_Airedale__AandE_010415_to_310119',
 'SUS_BRI_APC_20200401_to_20220321',
 'SUS_BRI_AccidentAndEmergency_010415_to_300619 ',
 'SUS_BRI_AdmittedPatientCare_010415_to_300619_Part1',
 'SUS_BRI_AdmittedPatientCare_010415_to_300619_Part2',
 'SUS_BRI_EC_BackwardCompatible_20200401_to_20220322',
 'SUS_BRI_EC_Backward_Compatible_010415_to_300619 ',
 'SUS_BRI_OP_20200401_to_20220322',
 'SUS_BRI_OutPatients_010415_to_300619',
 'SUS_Calderdale_AE_AdditionalFields',
 'SUS_Calderdale_APCCHFTOutputPart1_010417_to_310620',
 'SUS_Calderdale_A

## A & E tables

Two tables look very similar:

    'SUS_Airedale_AE_20190201_to_20220131'
    'SUS_Airedale_AE_20210401_to_20220131'
    
Their columns are easily matched:

In [156]:
# Fully-qualified ids of the two overlapping Airedale A&E staging tables.
air_ae_1 = ".".join(
    (PROJECT, "CY_STAGING_DATABASE", 'SUS_Airedale_AE_20190201_to_20220131')
)
air_ae_2 = ".".join(
    (PROJECT, "CY_STAGING_DATABASE", 'SUS_Airedale_AE_20210401_to_20220131')
)

# Match the columns of both tables and write their union to the test dataset.
table_matcher = TableMatcher(
    table_a_id=air_ae_1,
    table_b_id=air_ae_2,
    dataset_id="CY_SUS_DATA_TESTS",
    new_table_name="air_ae_joined",
)
table_matcher.match_tables()

The naming of the two tables above is suspicious: the second table looks like a duplicate of the last nine or so months of the first.

In [122]:
# Read the combined table written by the matcher back into pandas.
air_ae_joined = ".".join((PROJECT, "CY_SUS_DATA_TESTS", 'air_ae_joined'))
# The last two rows are dropped; NOTE(review): the reason is not recorded
# here - confirm why before relying on this.
air_ae_df = pd.read_gbq(air_ae_joined).iloc[:-2]

In [164]:
# Count rows that repeat an earlier (Arrival_Date, Nhs_No) combination.
dup_cols = ["Arrival_Date", "Nhs_No"]
duplicates_mask = air_ae_df.duplicated(subset=dup_cols)
# after_apr_2021 = pd.to_datetime(air_ae_df.Arrival_Date) > pd.to_datetime("2021-04-01") 
# air_ae_df[duplicates_mask & after_apr_2021]
duplicates_mask.sum()

1769

Pretty sure they are duplicates: 1,769 rows repeat an existing `Arrival_Date` / `Nhs_No` combination.

There's another A&E table for Airedale:

    'SUS_Airedale__AandE_010415_to_310119'

In [159]:
# Compare the 2019-2022 Airedale A&E table with the older 2015-2019 one.
air_ae_19 = ".".join(
    (PROJECT, "CY_STAGING_DATABASE", 'SUS_Airedale_AE_20190201_to_20220131')
)
air_ae_15 = ".".join(
    (PROJECT, "CY_STAGING_DATABASE", 'SUS_Airedale__AandE_010415_to_310119')
)

# NOTE(review): new_table_name repeats "air_ae_joined", so running
# match_tables() on this matcher would target the same destination table as
# the earlier join - confirm that is intended.
table_matcher_2 = TableMatcher(
    table_a_id=air_ae_19,
    table_b_id=air_ae_15,
    dataset_id="CY_SUS_DATA_TESTS",
    new_table_name="air_ae_joined",
)
# Show the columns that found no partner in the other table.
table_matcher_2.get_unmatched_names()

Unnamed: 0,name_a,name_b
35,Acc_Status_SnmdCt,
37,Access_Info_SnmdCt,
100,Activity_Treat_Function,
56,Acuity_SnmdCt,
53,Age_At_Cds,
...,...,...
196,,UDF1
197,,UDF2
198,,UDF3
199,,UDF4


In [None]:
table_ma