In [None]:
import time
from datetime import datetime

import dask.dataframe as dd
import dremio_client.lib as dlib
import h3pandas
import numpy as np
import pandas as pd
import polars as pl
import pyarrow.dataset as ds
from dask.distributed import Client, LocalCluster
from pyarrow import flight
from pyarrow.flight import FlightClient
from rapidfuzz import fuzz, process

# Must stay last: later cells call `datetime.datetime.now()`, so the name
# `datetime` has to end up bound to the module, not the class imported above.
import datetime


In [None]:
def get_bakong_data():
    """Fetch the distinct Bakong merchant names for December 2023.

    The query reads ``bk_outgoing`` and keeps QR transactions of merchant
    type '29' whose source name differs from the merchant name, sorted by
    merchant name.

    Returns:
        Whatever ``dlib.simple_query`` returns for the query (presumably a
        DataFrame with a single ``MerchantName`` column — confirm).
    """
    bakong_sql = """
        SELECT 
        
            DISTINCT MerchantName
            
        FROM "DataScience_DB"."General_DB".dbo.bk_outgoing
        where date_trunc('month',created_at) = '2023-12-01'
        and type = 'QR' AND MerchantType = '29'
        AND src_name <> MerchantName
        
        ORDER BY 1
        
    """
    return dlib.simple_query(bakong_sql)

In [None]:
def get_aba_data():
    """Fetch the distinct (CIF, AC_NAME) pairs from ``VW_CUSTACC``, sorted by CIF.

    Returns:
        Whatever ``dlib.simple_query`` returns for the query (presumably a
        DataFrame with ``CIF`` and ``AC_NAME`` columns — confirm).
    """
    aba_sql = """
        SELECT DISTINCT CIF, AC_NAME
        FROM dwh.FCCBOREP.VW_CUSTACC
        ORDER BY 1
    """
    return dlib.simple_query(aba_sql)



In [None]:
# Live Dremio pulls are disabled; uncomment these two lines to query the sources directly.
#bakong_data = get_bakong_data()
#aba_data_set = get_aba_data()
# Load the ABA accounts from a local CSV instead
# (presumably a saved extract of the query above — confirm its provenance).
aba_data_set = pd.read_csv('dataset/aba_data_1m.csv')

In [None]:
# Work on a copy: the cleaning cells below assign into aba_data's columns, and
# a plain alias would silently rewrite the raw frame held in aba_data_set too.
aba_data = aba_data_set.copy()

In [None]:
#bakong_data.shape
# (rows, columns) of the ABA frame as loaded, before any cleaning.
aba_data.shape

<hr>
Clean ABA Data
<hr>

In [None]:
# Peek at the last rows of the ABA frame before cleaning.
aba_data.tail()

In [None]:
# Column dtypes and non-null counts, to see how many names are missing.
aba_data.info()

In [None]:
# Normalise account names: upper-case, trim the ends, then remove every space.
aba_data['AC_NAME'] = (
    aba_data['AC_NAME']
    .str.upper()
    .str.strip()
    .str.replace(" ", "")
)

In [None]:
# Remove exact duplicate rows first, then any row that still has a missing value.
aba_data = aba_data.drop_duplicates().dropna()

In [None]:
# (rows, columns) remaining after de-duplication and null removal.
aba_data.shape

<hr>
Clean Bakong Data
<hr>


In [None]:
# Load the Bakong merchant names from a local CSV and show its (rows, columns).
bakong_data = pd.read_csv('dataset/unique_merchantname_bk_500001_lastrow.csv')
bakong_data.shape

In [None]:
# Drop the 'Unnamed: 0' column that came in with the CSV. errors='ignore' keeps
# the cell re-runnable: the in-place drop raised KeyError on a second run
# because the column was already gone.
bakong_data = bakong_data.drop(columns='Unnamed: 0', errors='ignore')
bakong_data.head()

In [None]:
# Normalise merchant names: upper-case, trim the ends, then remove every space.
bakong_data['Unique_MerchantName'] = (
    bakong_data['Unique_MerchantName']
    .str.upper()
    .str.strip()
    .str.replace(" ", "")
)

bakong_data.shape

In [None]:
# Row caps for the matching run; raise them to process more of each source.
BAKONG_SAMPLE_ROWS = 1000
ABA_SAMPLE_ROWS = 10000

bk_cust = bakong_data.head(BAKONG_SAMPLE_ROWS)
aba_cust = aba_data.head(ABA_SAMPLE_ROWS)

In [None]:
# Shapes of the Bakong and ABA samples, one per line.
print(bk_cust.shape, aba_cust.shape, sep="\n")

In [None]:
# Put the two name samples side by side in one frame.
# Both indexes are reset first: building a DataFrame from a dict of Series
# aligns them on index labels, and aba_cust still carries the gappy labels left
# by drop_duplicates()/dropna(). Without the reset, names are paired by stale
# row labels and Bakong names whose label is absent from aba_cust are lost in
# the dropna that follows. The shorter column is padded with NaN.
data = pd.DataFrame({
    "bakong_cust": bk_cust['Unique_MerchantName'].reset_index(drop=True),
    "aba_cust": aba_cust['AC_NAME'].reset_index(drop=True),
})
data.head()

In [None]:
# Keep only the rows that carry an ABA name; rows missing a Bakong name stay.
data = data.dropna(subset=['aba_cust'])
#data.dropna(subset=['bakong_cust'], inplace=True)

data.head()

<hr>
Dask Python - Data Frame 
<hr>

In [None]:
# (rows, columns) of the combined frame that feeds the matcher.
data.shape

In [None]:

def apply_rapidfuzz_matching_numpy(bakong_cust_array, aba_cust_list, scorer=None, score_cutoff=50):
    """Find the best fuzzy match in ``aba_cust_list`` for each Bakong name.

    Each non-missing entry of ``bakong_cust_array`` is converted to ``str`` and
    compared against every choice with ``process.extractOne``. Entries with no
    choice scoring at least ``score_cutoff`` are left out of the result, so the
    returned arrays can be shorter than the input and are not positionally
    aligned with it.

    Args:
        bakong_cust_array: 1-D iterable (list, Series, ndarray) of names to
            look up. Missing values (None/NaN) are skipped rather than being
            matched as the literal strings "None"/"nan".
        aba_cust_list: Candidate names to match against.
        scorer: rapidfuzz scorer callable; defaults to ``fuzz.token_set_ratio``.
        score_cutoff: Minimum score a candidate needs to count as a match.

    Returns:
        Tuple ``(aba_cust_results, score_results)``: an object array of the
        matched ABA names and a float array of their scores, in input order,
        containing matched rows only.

    NOTE(review): the cleaning cells strip every space from both name columns,
    so a token-based scorer sees a single token per name — confirm that
    ``token_set_ratio`` is still the intended scorer.
    """
    # Resolve the default lazily so callers can supply their own scorer.
    if scorer is None:
        scorer = fuzz.token_set_ratio

    matched_names = []
    matched_scores = []

    for raw_name in np.asarray(bakong_cust_array, dtype=object):
        # NaN padding / nulls carry no name; matching "nan" would only add noise.
        if pd.isna(raw_name):
            continue

        match = process.extractOne(
            str(raw_name), aba_cust_list, scorer=scorer, score_cutoff=score_cutoff
        )
        # extractOne returns None when nothing reaches the cutoff.
        if match is None:
            continue

        aba_cust, score, _ = match
        matched_names.append(aba_cust)
        matched_scores.append(score)

    # dtype=object keeps the names as Python strings; scores are plain floats.
    aba_cust_results = np.array(matched_names, dtype=object)
    score_results = np.array(matched_scores, dtype=float)

    return aba_cust_results, score_results



In [None]:
# Convert the pandas DataFrame to a Dask DataFrame split into 10 partitions,
# then display its (lazy) structure.
dask_df = dd.from_pandas(data, npartitions=10)
dask_df

In [None]:
# Start a Dask client with 2 workers and a "5GB" memory limit
# (the limit applies per worker according to the dask.distributed docs — confirm).
client = Client(n_workers=2, memory_limit="5GB")
client

In [None]:
# Materialise the ABA names as a plain Python list (the choices for the fuzzy matcher).
aba_cust_list = dask_df['aba_cust'].compute().tolist()

# Send the list to the cluster workers (broadcast=True).
# NOTE(review): none of the visible cells reads `distributed_aba_cust_list`; the
# matching call passes the local `aba_cust_list` — confirm this is still needed.
distributed_aba_cust_list = client.scatter(aba_cust_list, broadcast=True)

In [None]:

# Time one run of the matcher over the Bakong names.
start = datetime.datetime.now()
print("Start time:", start)

# `data` holds NaN in bakong_cust wherever only an ABA name exists for a row;
# drop those so the matcher is not fed placeholders (which it would otherwise
# stringify to "nan" and fuzzy-match).
bakong_cust_array = data['bakong_cust'].dropna()

aba_cust_results, score_results = apply_rapidfuzz_matching_numpy(bakong_cust_array, aba_cust_list)

end = datetime.datetime.now()
print("End time:", end)
print("Duration:", end - start)

In [None]:
# Stack the matched names and their scores side by side, one row per match.
# 1-D inputs are first turned into column vectors; the names are rebound so
# the 2-D versions are what remain in the namespace afterwards.
aba_cust_results = aba_cust_results.reshape(-1, 1) if aba_cust_results.ndim == 1 else aba_cust_results
score_results = score_results.reshape(-1, 1) if score_results.ndim == 1 else score_results

array_results = np.concatenate([aba_cust_results, score_results], axis=1)
array_results

In [None]:
# Turn the stacked array into a two-column frame: matched ABA name and its score.
# NOTE(review): the matcher drops unmatched inputs, so these rows are not linked
# back to the Bakong name that produced them.
matched_name_col = array_results[:, 0]
matched_score_col = array_results[:, 1]
result_from_array = pd.DataFrame({'aba_cust': matched_name_col, 'score': matched_score_col})
result_from_array

<hr>
Terminate Cluster
<hr>

In [None]:
# Close the Dask client when finished (uncomment to run).
# client.close()