In [1]:
#CLEANING
import recordlinkage as rl
from recordlinkage.preprocessing import clean



#Pre-cleaning function
def precleaning(df):
    """Return a tidied version of ``df`` ready for record linkage.

    Steps, in order:
      1. drop every column whose values are all NaN,
      2. drop duplicate rows,
      3. upper-case the column names,
      4. insert a 1-based running ``ID`` column at position 0.

    The work is done on the frame returned by ``dropna``; that result is
    what gets returned.
    """
    tidy = df.dropna(axis=1, how='all').drop_duplicates()

    tidy.columns = tidy.columns.str.upper()

    # Row identifier used later as the index by the linkage algorithm.
    # allow_duplicates=True: an already-present "ID" column does not raise.
    row_ids = range(1, len(tidy) + 1)
    tidy.insert(0, "ID", row_ids, allow_duplicates=True)

    return tidy



#Clean ZIP code column
def clean_zip(df, col='ZIP'):
    """Normalise the ZIP-code column ``col`` of ``df`` and return ``df``.

    * float64 columns: each non-missing value is truncated to an int and
      turned into a string, then handled like a string column.
    * object columns: surrounding whitespace is stripped, every non-digit
      character (hyphens included) is removed, and the remaining digits are
      re-rendered via ``int`` and left-padded with zeros to at least 5
      characters.
    * any other dtype is left untouched.

    Missing values stay missing, and strings with no digits become ``''``.
    The column is overwritten on ``df`` itself, and that same object is
    returned.
    """
    def _pad(digits):
        # '' (nothing left after stripping non-digits) is passed through as-is.
        return str(int(digits)).zfill(5) if digits else digits

    zips = df[col]

    if zips.dtype == float:
        # na_action='ignore' keeps NaN as NaN instead of feeding it to int().
        # astype(object) guarantees the string branch below also runs, so
        # leading zeros lost by float parsing (501.0) are restored.
        zips = zips.map(lambda value: str(int(value)), na_action='ignore').astype(object)

    if zips.dtype == object:
        # regex=True is explicit: newer pandas treats the pattern literally
        # by default, which would leave non-digit characters in place.
        digits_only = zips.str.strip().str.replace(r'\D', '', regex=True)
        zips = digits_only.map(_pad, na_action='ignore')

    df[col] = zips
    return df

        
        

#Cleaning function to clean strings type cols using Record Linkage
def cleaning_cols(df):
    """Pass every object-dtype column of ``df`` through recordlinkage's ``clean``.

    Each such column is overwritten on ``df`` with the cleaned result, and
    the same frame is returned.
    """
    # Options forwarded unchanged to recordlinkage.preprocessing.clean.
    clean_options = {
        'lowercase': True,
        'replace_by_none': '[^ \\-\\_A-Za-z0-9]+',
        'replace_by_whitespace': '[\\-\\_]',
        'strip_accents': 'unicode',
        'remove_brackets': True,
        'encoding': 'utf-8',
        'decode_error': 'ignore',
    }

    object_columns = df.select_dtypes("object").columns
    for column_name in object_columns:
        df[column_name] = clean(df[column_name], **clean_options)

    return df







#Algorithm function
def Sorted_Neighbourhood_Prediction(df1,df2, pred_comp=1, threshold=None,method_str=None,method_num=None,scale=None,offset=None, main_field_compare=None):
    """Link ``df1`` and ``df2`` with sorted-neighbourhood indexing.

    Parameters
    ----------
    df1, df2 : DataFrames that contain an ``ID`` column and the column named
        by ``main_field_compare``. Their object columns are cleaned via
        ``cleaning_cols``, which overwrites those columns on the frames
        passed in.
    pred_comp : multiplier applied to ``threshold`` when selecting
        predictions.
    threshold : value convertible to ``float``; passed to the string
        comparison and used for the final cut-off. ``float(None)`` raises
        ``TypeError``, so the default must be overridden.
    method_str, method_num, scale, offset : forwarded to the recordlinkage
        string / numeric comparisons only when not ``None``, so that the
        library's own defaults apply otherwise.
    main_field_compare : column used both as the sorted-neighbourhood key
        and as the string-compared field.

    Returns
    -------
    (feature_vectors, predictions) : the frame returned by
        ``Compare.compute`` and the subset of its rows whose row-sum is
        strictly greater than ``threshold * pred_comp``.

    NOTE(review): ``select_box_unmatched_load_11`` and
    ``select_box_unmatched_load_12`` are neither parameters nor locals; they
    must exist in the enclosing module/notebook scope. Each is used both as
    an on/off flag and as the column name to compare. Consider passing them
    in explicitly.
    """
    # Convert first so a bad threshold fails before the inputs are modified.
    threshold = float(threshold)

    # Clean object columns for model readability.
    df1 = cleaning_cols(df1)
    df2 = cleaning_cols(df2)

    # Re-index both frames on their ID column.
    df1 = df1.set_index('ID')
    df2 = df2.set_index('ID')

    # Build candidate pairs with the sorted-neighbourhood approach.
    indexer = rl.index.SortedNeighbourhood(main_field_compare, window=3)
    candidate_pairs = indexer.index(df1, df2)

    comparer = rl.Compare()

    # Forward optional settings only when supplied, so an unset (None)
    # argument does not override the library default.
    string_options = {'threshold': threshold, 'label': main_field_compare}
    if method_str is not None:
        string_options['method'] = method_str
    comparer.string(main_field_compare, main_field_compare, **string_options)

    numeric_options = {}
    if scale is not None:
        numeric_options['scale'] = scale
    if offset is not None:
        numeric_options['offset'] = offset

    # NOTE(review): the first numeric comparison has never received
    # method_num while the second does; kept as-is, confirm whether intended.
    if select_box_unmatched_load_11:
        comparer.numeric(select_box_unmatched_load_11,
                         select_box_unmatched_load_11,
                         label=select_box_unmatched_load_11,
                         **numeric_options)
    if select_box_unmatched_load_12:
        second_numeric_options = dict(numeric_options)
        if method_num is not None:
            second_numeric_options['method'] = method_num
        comparer.numeric(select_box_unmatched_load_12,
                         select_box_unmatched_load_12,
                         label=select_box_unmatched_load_12,
                         **second_numeric_options)

    feature_vectors = comparer.compute(candidate_pairs, df1, df2)

    # Keep pairs whose summed comparison scores exceed the scaled threshold.
    predictions = feature_vectors[feature_vectors.sum(axis=1) > (threshold * pred_comp)]

    return feature_vectors, predictions
        