## 1. Initialization

> a. Read the excel/csv containing the entire query-output

> b. Initialize the thresholds, directories, and the fields to concatenate, which are used to generate the individual and the combined match-scores

> c. Get the list of unique countries present in the dataframe

In [1]:
import os
import re
import string
import subprocess
import time
from subprocess import PIPE, Popen

import numpy as np
import pandas as pd

# Input workbook and the working directories used by the pipeline.
_STATIC_FILE_NAME = "SM_Temp_Shortlist.xlsx"
_RAW_SCORES_DIRECTORY = 'Raw_Scores'
_CLEANED_SCORES_DIRECTORY = 'Cleaned_Scores'
_MASTER_DATA_DIRECTORY = 'Master_Data'

# Derived column name -> source columns whose text is concatenated to build it.
_FIELDS_TO_CONCAT = {
    'CONCAT_ADDRESS': ['ADDRESS_LINE_1', 'ADDRESS_LINE_2', 'ADDRESS_LINE_3'],
}

# Default set of columns that clean_dataframe strips of punctuation and lowercases.
_COLUMNS_TO_CLEAN = [
    'ADDRESS_LINE_1',
    'ADDRESS_LINE_2',
    'ADDRESS_LINE_3',
    'SITE_NAME',
    'STATE',
    'CITY',
    'POSTAL_CODE',
]

# Forwarded as command-line arguments to the R script.
_BINARIES_NAME = "levenshtein"
_BINARIES_EXTENSION = ".dll"

# Record-count limit applied when choosing which countries to process.
_MAXSIZE = 5000

# Match-score thresholds: one for single fields, one for the concatenated address.
_THRESHOLD_FOR_INDIVIDUAL = 0.85
_THRESHOLD_FOR_ADDRESS_COMBINED = 0.75

# Field -> threshold. Key order matters: it drives the order of the score columns below.
_THRESHOLDS_DICT = {
    'CONCAT_ADDRESS': _THRESHOLD_FOR_ADDRESS_COMBINED,
    **dict.fromkeys(('SITE_NAME', 'STATE', 'CITY', 'POSTAL_CODE'), _THRESHOLD_FOR_INDIVIDUAL),
}
_COLS_FOR_TOTAL_MATCH_CALC = [f'{field}_COMPARISON_SCORE' for field in _THRESHOLDS_DICT]

# Multiplier applied to a comparison-score column by scale_up_comparison_score.
_SCALING_FACTOR = 3

# Forwarded to the R script together with the thresholds above.
_TOTAL_MATCHES_THRESHOLD = 4


def write_df_to_csv(df, root_dir='', curr_country='', file_suffix='_temp.csv', index_flag=False):
    """
        DOCSTRING:  Writes the dataframe to the csv file '<root_dir>/<curr_country><file_suffix>' and reports the outcome on stdout.
                    File-system failures (OSError, e.g. the file is open elsewhere or the directory is missing) are printed
                    and swallowed so that a failed write does not abort the calling cell; any other exception propagates.
        INPUT:      Dataframe, Target-Directory, Country-name, Suffix-of-csv-file, Index-Flag
        OUTPUT:     Dataframe csv at target-directory, or a printed error message. Returns None.
    """
    abs_path = os.path.join(root_dir, curr_country + file_suffix)
    try:
        df.to_csv(abs_path, index=index_flag)
    except OSError as err:
        # Keep the best-effort behaviour for I/O problems, but surface the actual cause.
        print(f'\nSomething went wrong while writing the file. Please check if it is currently in use. ({err})')
    else:
        print(f'\nSuccessfully created {abs_path}!')


def preprocess_dataframe(df):
    """
        DOCSTRING:  In-place normalisation of every column of the dataframe:
                        a. NaN cells are replaced with ''.
                        b. In the 'COUNTRY' column, spaces are replaced with underscores.
                        c. Every cell is converted to str and stripped of leading/trailing whitespace.
        INPUT:      Dataframe
        OUTPUT:     None; the dataframe passed in is modified.
    """
    df.replace(np.nan, '', inplace=True)
    for column in list(df.columns.values):
        cells = df[column]
        if column == 'COUNTRY':
            # Underscores are substituted before stripping, so outer spaces also become underscores.
            cells = cells.apply(lambda name: name.replace(' ', '_'))
        df[column] = cells.astype(str).apply(str.strip)


def clean_dataframe(df, columns_to_clean=_COLUMNS_TO_CLEAN, fields_to_concat=_FIELDS_TO_CONCAT, replace_punctuations=True):
    """
        DOCSTRING:  In-place cleaning of the columns relevant to computing match-scores.
                        a. If replace_punctuations==True, every character of string.punctuation is removed from the
                           columns listed in columns_to_clean and the result is lowercased.
                        b. For each entry of fields_to_concat, the source columns are joined (without a separator)
                           into the new column named by the key.
                        c. All source columns of all fields_to_concat entries are dropped.
        INPUT:      Dataframe, columns-to-clean, address-fields-to-concat, flag-to-replace-punctuations
        OUTPUT:     None; the dataframe passed in is modified.
    """
    # todo: Replace another special character which was causing Italy CSV file read to fail in R
    if replace_punctuations:
        special_chars = re.escape(string.punctuation)
        print('\nSpecial Character that will be replaced are:  ', special_chars)
        punctuation_pattern = '[' + special_chars + ']'
        for colname in df.columns.values:
            if colname in columns_to_clean:
                df[colname] = df[colname].replace(punctuation_pattern, '', regex=True).str.lower()

    source_columns = []
    for colname, cols_to_concat in fields_to_concat.items():
        df[colname] = df[cols_to_concat].apply(lambda single_row: ''.join(single_row.values), axis=1)
        # Collect each source column once, even if several concat fields share it.
        source_columns.extend(col for col in cols_to_concat if col not in source_columns)

    # Drop the sources of every concat field, not only those of 'CONCAT_ADDRESS'.
    df.drop(labels=source_columns, axis=1, inplace=True)



def deduplicate_dataset_R(rscript_command, script_name, args, current_directory):
    """
        DOCSTRING:  Runs an R script from Python as a child process (subprocess.Popen) and waits for it to finish.
                    The command line is [rscript_command, script_name, args]; args is passed as one single argument.
        INPUT:      Abs-path-of-Rscript-command, Script-to-invoke, Args-for-script, Current-directory
        OUTPUT:     Prints the captured stdout of the process; additionally prints its stderr when the return-code is non-zero.
                    Returns None.
    """
    process = Popen(
        [rscript_command, script_name, args],
        cwd=current_directory,
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
    )
    stdout_bytes, stderr_bytes = process.communicate()

    # Standard output is echoed whatever the exit status was.
    print('R OUTPUT:\n', stdout_bytes.decode())
    if process.returncode != 0:
        print('R ERROR:\n', stderr_bytes.decode())



def scale_up_comparison_score(df, colname='SITE_NAME_COMPARISON_SCORE', scaling_factor=_SCALING_FACTOR):
    """
        DOCSTRING:  Multiplies every value of one score column by a factor, in place, and announces it on stdout.
        INPUT:      Dataframe, score-colname, scaling-factor
        OUTPUT:     None; the column of the dataframe passed in is overwritten with the scaled values.
    """
    announcement = f'\nScaling up {colname} by {scaling_factor}'
    print(announcement)
    scaled_scores = df[colname].map(lambda score: score * scaling_factor)
    df[colname] = scaled_scores



def return_top_match(df, child_column, score_key_column):
    """
        DOCSTRING:  Input Dataframe has SR_NUM_1 (child-col) matching against multiple SR_NUM_2.
                    Orders by child-col asc, score-col desc in one two-key sort, and keeps the first row of each child-col value,
                    i.e. its highest-scoring match. Rows tied on score keep their original relative order, so the
                    earliest one in the input wins. The input dataframe is not modified.
        INPUT:      Dataframe-of-score-features-above-a-total-threshold, index-column (SR_NUM_1), total-score-column (NUM_OF_MATCHES_FOUND)
        OUTPUT:     Dataframe with one row per child-col value, sorted by child-col ascending.
    """
    # A single multi-key sort: chaining two sort_values calls would discard the first ordering.
    ordered = df.sort_values(by=[child_column, score_key_column], ascending=[True, False])
    normalized_duplicates = ordered.groupby(child_column).head(1)
    return normalized_duplicates



def replace_cyclic_dependencies(df, child_indicator, master_indicator, verbose=True):
    """
        DOCSTRING:  Input Dataframe has cases like-     Record45 matches with Record44, and Record67 matches with Record45.
                    In this case we should maintain-    Record67 matches with Record44.
                    Builds a child->master lookup (first row per child wins) and follows it from every master value until a
                    value is reached that is not itself a child. A true cycle (A->B, B->A) stops at the first repeated value,
                    so all records of the cycle end up with the same master.
                    The master-column is reassigned on the dataframe passed in (no chained in-place replace).
        INPUT:      Dataframe-of-score-features-with-cyclic-indexes, child-column, master-column, flag-to-print-replacements
        OUTPUT:     The same dataframe object, with the master-column resolved.
    """
    if df.empty:
        return df

    child_to_master = {}
    for child, master in zip(df[child_indicator], df[master_indicator]):
        child_to_master.setdefault(child, master)

    # Cache of already-resolved values so that every chain is walked only once.
    resolved = {}

    def _final_master(start):
        """Follow the child->master chain from start and return the value it ends on."""
        path = []
        seen = set()
        current = start
        while current in child_to_master and current not in seen and current not in resolved:
            seen.add(current)
            path.append(current)
            current = child_to_master[current]
        terminal = resolved.get(current, current)
        for node in path:
            resolved[node] = terminal
        return terminal

    final_masters = []
    for val in df[master_indicator]:
        replace_val = _final_master(val)
        if verbose and val in child_to_master:
            print(val, ' found in normalized_duplicates[', child_indicator, ']. Replacement: ', replace_val)
        final_masters.append(replace_val)

    df[master_indicator] = final_masters
    return df




def clean_score_features(curr_country, source_dir=_RAW_SCORES_DIRECTORY, target_dir=_CLEANED_SCORES_DIRECTORY, verbose=True):
    """
        DOCSTRING:  Loads '<source_dir>/<curr_country>_Score_Features.csv', tags every row with the country,
                    keeps the top match per 'SR_NUM_1' (by 'NUM_OF_MATCHES_FOUND') and resolves chained masters in 'SR_NUM_2'.
                    Writes the result to '<target_dir>/<curr_country>_Cleaned_Feature_Scores.csv'.
        INPUT:      country-name, source-directory, target-directory, flag-to-print-replacements
        OUTPUT:     Dataframe of cleaned-normalized-score-features.
    """
    scores_path = os.path.join(source_dir, curr_country + '_Score_Features.csv')
    score_features = pd.read_csv(scores_path)
    score_features['COUNTRY'] = curr_country

    top_matches = return_top_match(
        df=score_features,
        child_column='SR_NUM_1',
        score_key_column='NUM_OF_MATCHES_FOUND',
    )
    cleaned_scores = replace_cyclic_dependencies(
        df=top_matches,
        child_indicator='SR_NUM_1',
        master_indicator='SR_NUM_2',
        verbose=verbose,
    )

    write_df_to_csv(
        df=cleaned_scores,
        root_dir=target_dir,
        curr_country=curr_country,
        file_suffix='_Cleaned_Feature_Scores.csv',
    )
    print('\n"SR_NUM_2" will be the master record')
    return cleaned_scores


def get_deduplicated_master_records(normalized_duplicates, country_df):
    """
        DOCSTRING:  Derives the set of master record-ids with plain set arithmetic.
                        >   universe    = index values of country_df
                        >   children    = values of 'SR_NUM_1' (records that get mapped to a master)
                        >   masters     = values of 'SR_NUM_2' (records they are mapped to)
                        >   standalone  = universe - (children U masters)     (records in no duplication-scenario)
                        >   result      = standalone U masters
        INPUT:      Dataframe-of-cleaned-normalized-score_features, Dataframe-for-country (indexed by record-id)
        OUTPUT:     Unique set of master-record-ids (SR_NUM)
    """
    children = set(normalized_duplicates['SR_NUM_1'].values.tolist())
    masters = set(normalized_duplicates['SR_NUM_2'].values.tolist())
    universe = set(country_df.index.values.tolist())

    standalone = universe - (children | masters)
    return standalone | masters


def generate_deduplicated_master(curr_country, site_master_df, master_record_ids, target_dir=_MASTER_DATA_DIRECTORY):
    """
        DOCSTRING:  Use the original df to extract columns-info and generate the country-specific Master file.
                    The country rows are copied before preprocessing/cleaning, so site_master_df itself is left untouched.
                    Master rows are selected by label with .loc (a KeyError is raised if an id is missing from the index)
                    and written to '<target_dir>/<curr_country>_Master.csv'.
        INPUT:      Country-name, Original-Dataframe, Unique set of master-record-ids (SR_NUM), target-directory
        OUTPUT:     Dataframe-for-country-with-original-info, Master-Dataframe
    """
    # Explicit copy: the helpers below modify the frame in place.
    country_df_copy = site_master_df[site_master_df['COUNTRY'] == curr_country].copy()
    preprocess_dataframe(df=country_df_copy)
    clean_dataframe(df=country_df_copy, replace_punctuations=False)

    # .loc does not accept a set (pandas >= 2.0); use a list, sorted when the ids are comparable
    # so that the master file has a reproducible row order.
    try:
        ordered_ids = sorted(master_record_ids)
    except TypeError:
        ordered_ids = list(master_record_ids)
    country_master_df = country_df_copy.loc[ordered_ids]

    write_df_to_csv(df=country_master_df, root_dir=target_dir, curr_country=curr_country, index_flag=True, file_suffix='_Master.csv')
    print(f'{country_df_copy.shape[0]} records get merged into {len(master_record_ids)}')
    return country_df_copy, country_master_df



def generate_dummy_cross_refs_for_masters(master_record_ids, curr_country=None):
    """
        DOCSTRING:  Create a dummy cross-reference dataframe for master-records; Record45 matches with Record45.
                    Every comparison score is 1.0, the concatenated-address score is then scaled up by _SCALING_FACTOR,
                    and 'NUM_OF_MATCHES_FOUND' is the row-wise sum of the columns in _COLS_FOR_TOTAL_MATCH_CALC.
        INPUT:      Unique set of master-record-ids (SR_NUM), Country-name (optional; when omitted, the notebook-level
                    variable 'curr_country' is used, as earlier versions of this function did implicitly)
        OUTPUT:     Dataframe-of-dummy-entries-for-master-cross-references.
    """
    if curr_country is None:
        # Backward compatibility with callers that rely on the notebook-level variable.
        curr_country = globals().get('curr_country')
        if curr_country is None:
            raise NameError("curr_country was not passed and no notebook-level 'curr_country' is defined")

    # Materialise once so that both id columns are guaranteed to share the same order.
    master_ids = list(master_record_ids)
    master_record_score_array = [1.0] * len(master_ids)
    master_record_df_dict = {
        'SR_NUM_1': master_ids,
        'SR_NUM_2': master_ids,
        'SITE_NAME_COMPARISON_SCORE': master_record_score_array,
        'STATE_COMPARISON_SCORE': master_record_score_array,
        'CITY_COMPARISON_SCORE': master_record_score_array,
        'CONCAT_ADDRESS_COMPARISON_SCORE': master_record_score_array,
        'POSTAL_CODE_COMPARISON_SCORE': master_record_score_array,
    }

    cross_ref_df = pd.DataFrame(master_record_df_dict)
    cross_ref_df['COUNTRY'] = curr_country
    scale_up_comparison_score(cross_ref_df, 'CONCAT_ADDRESS_COMPARISON_SCORE', _SCALING_FACTOR)
    cross_ref_df['NUM_OF_MATCHES_FOUND'] = cross_ref_df[_COLS_FOR_TOTAL_MATCH_CALC].sum(axis=1)
    return cross_ref_df


def generate_final_cross_refs(cross_ref_df, normalized_duplicates, target_dir=_MASTER_DATA_DIRECTORY, curr_country=None):
    """
        DOCSTRING:  Stack the dummy cross-reference of masters on top of the cleaned-normalized-feature-scores,
                    sort by 'SR_NUM_1' and write the result to '<target_dir>/<curr_country>_Raw_Cross_Ref.csv'.
                    Uses pd.concat (DataFrame.append no longer exists in pandas >= 2.0). The inputs are not modified.
        INPUT:      Dataframe-of-dummy-entries-for-master-cross-references, Dataframe-of-cleaned-normalized-score_features,
                    target-directory, Country-name (optional; when omitted, the notebook-level variable 'curr_country' is used)
        OUTPUT:     Dataframe-of-cross-references.
    """
    if curr_country is None:
        # Backward compatibility with callers that rely on the notebook-level variable.
        curr_country = globals().get('curr_country')
        if curr_country is None:
            raise NameError("curr_country was not passed and no notebook-level 'curr_country' is defined")

    combined = pd.concat([cross_ref_df, normalized_duplicates])
    combined = combined.sort_values(by=['SR_NUM_1'], axis=0)
    write_df_to_csv(df=combined, root_dir=target_dir, curr_country=curr_country, file_suffix='_Raw_Cross_Ref.csv')
    return combined



def generate_cross_ref_report(cross_ref_df, country_df, target_dir=_MASTER_DATA_DIRECTORY, curr_country=None):
    """
        DOCSTRING:  Creates cross-reference report by performing left-join of cross-reference-dataframe with the original-info in country-df.
                        a. The index of country_df is turned into a column; the join keys 'SR_NUM_1'/'SR_NUM_2' only exist
                           if that index is named 'SR_NUM'.
                        b. Left-outer-join with the '_1'-suffixed country columns on 'SR_NUM_1'.
                        c. Left-outer-join with the '_2'-suffixed country columns on 'SR_NUM_2'.
                    Writes the report to '<target_dir>/<curr_country>_Cross_Ref_Full_Report.csv'.
                    Works on copies: neither cross_ref_df nor country_df is modified.
        INPUT:      Dataframe-of-cross-references, Dataframe-for-country-with-original-info, target-directory,
                    Country-name (optional; when omitted, the notebook-level variable 'curr_country' is used)
        OUTPUT:     Dataframe-of-cross-references-with-original-info.
    """
    if curr_country is None:
        # Backward compatibility with callers that rely on the notebook-level variable.
        curr_country = globals().get('curr_country')
        if curr_country is None:
            raise NameError("curr_country was not passed and no notebook-level 'curr_country' is defined")

    # reset_index/add_suffix return new frames, so the caller's country_df keeps its index and column names.
    country_info = country_df.reset_index()

    report_df = cross_ref_df.merge(country_info.add_suffix('_1'), how='left', on='SR_NUM_1')
    report_df = report_df.merge(country_info.add_suffix('_2'), how='left', on='SR_NUM_2')

    columns_in_report_format = [
        'SR_NUM_1', 'SR_NUM_2',
        'SITE_NAME_1', 'SITE_NAME_2', 'SITE_NAME_COMPARISON_SCORE',
        'STATE_1', 'STATE_2', 'STATE_COMPARISON_SCORE',
        'CITY_1', 'CITY_2', 'CITY_COMPARISON_SCORE',
        'CONCAT_ADDRESS_1', 'CONCAT_ADDRESS_2', 'CONCAT_ADDRESS_COMPARISON_SCORE',
        'POSTAL_CODE_1', 'POSTAL_CODE_2', 'POSTAL_CODE_COMPARISON_SCORE',
        'NUM_OF_MATCHES_FOUND',
    ]
    report_df = report_df[columns_in_report_format]
    write_df_to_csv(df=report_df, root_dir=target_dir, curr_country=curr_country, file_suffix='_Cross_Ref_Full_Report.csv')
    return report_df



# Load the full query-output and normalise its cells.
site_master_df = pd.read_excel(_STATIC_FILE_NAME, index_col=0)
preprocess_dataframe(site_master_df)
print('\nColumns: ', site_master_df.columns.values, '\n')

# Countries with more than _MAXSIZE records are left out of the list to process.
country_counts = site_master_df['COUNTRY'].value_counts()
oversized_countries = list(country_counts[country_counts > _MAXSIZE].index)
countries = [name for name in site_master_df['COUNTRY'].unique() if name not in oversized_countries]
# The list printed here holds the countries that were kept, i.e. those NOT above the limit.
print('\nUnique Countries having counts not greater than ', _MAXSIZE, ': ', countries)


Columns:  ['SOURCE_IDENTIFIER' 'DATA_SOURCE_NAME' 'PROTOCOL_NUMBER' 'SITE_NUM'
 'UNIQUE_SITE_ID' 'COUNTRY' 'SITE_NAME' 'STATE' 'CITY' 'ADDRESS_LINE_1'
 'ADDRESS_LINE_2' 'ADDRESS_LINE_3' 'POSTAL_CODE' 'SITE_STATUS'] 


Unique Countries having counts greater than  5000 :  ['United_States']


In [2]:
# todo: for loop- Research on multithreading to speed up country-wise batches. RAM might crash for incoming batch-size>4000.
c = 0
curr_country = countries[c]
# Explicit copy: clean_dataframe modifies the frame in place and must not touch site_master_df.
country_df = site_master_df[site_master_df['COUNTRY'] == curr_country].copy()
clean_dataframe(country_df, columns_to_clean=_COLUMNS_TO_CLEAN, fields_to_concat=_FIELDS_TO_CONCAT, replace_punctuations=True)
# Select the scored fields with a plain list of column names rather than a dict_keys view.
write_df_to_csv(df=country_df[list(_THRESHOLDS_DICT)], curr_country=curr_country, file_suffix='_country_df.csv', index_flag=True)
print(f'\n{curr_country} has {country_df.shape[0]} records')


Special Character that will be replaced are:   !"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~

Successfully created \United_States_country_df.csv!

United_States has 1000 records


In [None]:
# Run the record-linkage R script for the current country.
# All parameters are sent to the script as one space-separated argument string.
deduplicate_dataset_R(
    rscript_command="C:/Program Files/R/R-3.4.4/bin/i386/Rscript",
    script_name="Site_Master_Record_Linkage.R",
    args=(
        f"{_BINARIES_NAME} {_BINARIES_EXTENSION} "
        f"{_THRESHOLD_FOR_INDIVIDUAL} {_THRESHOLD_FOR_ADDRESS_COMBINED} "
        f"{_SCALING_FACTOR} {curr_country} "
        f"{_RAW_SCORES_DIRECTORY} {_TOTAL_MATCHES_THRESHOLD}"
    ),
    current_directory="C:/Users/vdeshpande/Desktop/Del_Project-Takeda/Site_Master_Repo/",
)

In [None]:
# Clean and normalize the score features
normalized_duplicates = clean_score_features(curr_country=curr_country, source_dir=_RAW_SCORES_DIRECTORY, target_dir=_CLEANED_SCORES_DIRECTORY, verbose=False)
# Get the unique set of master-record-ids
master_record_ids = get_deduplicated_master_records(normalized_duplicates=normalized_duplicates, country_df=country_df)
# Get the country-master-df and a copy of original country-df
country_df_copy, country_master_df = generate_deduplicated_master(curr_country, site_master_df=site_master_df, master_record_ids=master_record_ids, target_dir=_MASTER_DATA_DIRECTORY)
# Create a dummy set of cross-refs for masters
cross_ref_df = generate_dummy_cross_refs_for_masters(master_record_ids=master_record_ids)
# Create full set of cross-refs for country-df
cross_ref_df = generate_final_cross_refs(cross_ref_df=cross_ref_df, normalized_duplicates=normalized_duplicates, target_dir=_MASTER_DATA_DIRECTORY)
# Create the csv for the cross-ref report
# (the parameter of generate_cross_ref_report is named country_df; the copy with original info is passed to it)
generate_cross_ref_report(cross_ref_df=cross_ref_df, country_df=country_df_copy, target_dir=_MASTER_DATA_DIRECTORY)