# In [6]:
import pandas as pd
from datasketch import MinHash, MinHashLSH

# Read both test tables from the relative data/ directory
# (account bookings first, then external parties).
account_booking, external_parties = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/account_booking_test.csv',
        'data/external_parties_test.csv',
    )
)


class LSHEntityResolution:
    """
    Find candidate duplicate rows of a DataFrame using MinHash LSH.

    Each row is reduced to the whitespace tokens of a few text columns
    (see ``TEXT_COLUMNS``), hashed into a MinHash signature, and indexed in a
    ``MinHashLSH`` structure that is then queried for every row.
    """

    # Columns whose values are concatenated to form the text that is hashed.
    TEXT_COLUMNS = (
        'parsed_name',
        'parsed_address_street_name',
        'parsed_address_city',
    )

    def __init__(self, dataframe):
        """
        Initialize with the DataFrame.
        :param dataframe: Pandas DataFrame containing entity data. It must
            provide every column listed in ``TEXT_COLUMNS``.
        """
        self.dataframe = dataframe

    @staticmethod
    def _join_present(values):
        """
        Join the usable values into one space-separated string.
        :param values: Iterable of scalar cell values.
        :return: String built from the values that are neither missing
            (None / NaN / NA) nor blank; "" when nothing is usable.
        """
        parts = []
        for value in values:
            # str(NaN) is the literal "nan"; skip missing cells so they do not
            # become a shared token that makes unrelated rows look similar.
            if pd.isna(value):
                continue
            text = str(value).strip()
            if text:
                parts.append(text)
        return " ".join(parts)

    # Generate MinHash for a given text
    def generate_minhash(self, text, num_perm=128):
        """
        Create MinHash for a given text.
        :param text: Input string.
        :param num_perm: Number of permutations for MinHash.
        :return: MinHash object, or None when the text is missing, empty or
            contains no tokens.
        """
        if not text or pd.isna(text):
            return None
        tokens = text.split()
        if not tokens:
            # A MinHash that was never updated is identical for every input,
            # so whitespace-only texts would all match one another.
            return None
        mh = MinHash(num_perm=num_perm)
        for token in tokens:
            mh.update(token.encode("utf8"))
        return mh

    # Perform LSH on nodes
    def perform_lsh(self, threshold=0.8, num_perm=128):
        """
        Perform LSH to find similar entities.
        :param threshold: Similarity threshold for LSH.
        :param num_perm: Number of permutations for MinHash.
        :return: List of matched entity pairs ``(a, b)`` of index labels
            converted to int, with ``a < b``. The pairs are the candidates
            returned by ``lsh.query``; they are not re-scored here.
        :raises KeyError: If a column from ``TEXT_COLUMNS`` is missing.
        """
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        node_hashes = {}

        # Insert nodes into LSH. Iterating the columns directly avoids building
        # a Series per row as iterrows() does.
        columns = [self.dataframe[name] for name in self.TEXT_COLUMNS]
        for index, *values in zip(self.dataframe.index, *columns):
            mh = self.generate_minhash(self._join_present(values), num_perm)
            if mh is None:
                continue
            key = str(index)
            lsh.insert(key, mh)
            node_hashes[key] = mh

        # Find similar pairs
        matches = []
        for node_id, mh in node_hashes.items():
            left = int(node_id)
            for candidate in lsh.query(mh):
                right = int(candidate)
                # Keep one orientation only; this also drops the self-match.
                if left < right:
                    matches.append((left, right))

        return matches


# Build the resolver on the external parties table and collect the candidate
# pairs, using a 0.6 similarity threshold and 264 MinHash permutations.
lsh_er = LSHEntityResolution(dataframe=external_parties)
matches = lsh_er.perform_lsh(num_perm=264, threshold=0.6)


# Function to create submission file
def create_submission(dataframe, matches, remove_singletons=True):
    """
    Create a Kaggle submission file for entity resolution.

    Matched pairs are merged transitively into clusters. Cluster ids start at
    1 and are handed out in the order new clusters appear in ``matches``; when
    a pair joins two existing clusters, the cluster of the second element is
    absorbed into the cluster of the first.

    :param dataframe: DataFrame containing the test data. Must have a
        ``transaction_reference_id`` column; its index labels are looked up
        against the ids used in ``matches``.
    :param matches: List of matched entity pairs.
    :param remove_singletons: If True, remove singletons from the submission.
        If False, every unmatched row gets its own fresh cluster id.
    :return: DataFrame ready for submission, with columns
        ``transaction_reference_id`` and integer ``external_id``.
    :raises KeyError: If ``transaction_reference_id`` is missing.
    """
    cluster_map = {}  # entity id -> cluster id
    members = {}      # cluster id -> entity ids currently in that cluster
    cluster_id = 1

    # Group matches into clusters
    for id1, id2 in matches:
        first = cluster_map.get(id1)
        second = cluster_map.get(id2)
        if first is not None and second is not None:
            if first != second:
                # Relabel only the absorbed cluster's members instead of
                # scanning every entry of cluster_map.
                absorbed = members.pop(second)
                for member in absorbed:
                    cluster_map[member] = first
                members[first].extend(absorbed)
        elif first is not None:
            cluster_map[id2] = first
            members[first].append(id2)
        elif second is not None:
            cluster_map[id1] = second
            members[second].append(id1)
        else:
            # Assign new cluster ID if neither is in a cluster
            cluster_map[id1] = cluster_id
            cluster_map[id2] = cluster_id
            members[cluster_id] = [id1, id2]
            cluster_id += 1

    # Give every unmatched row its own cluster when singletons are kept.
    if not remove_singletons:
        for label in dataframe.index:
            if label not in cluster_map:
                cluster_map[label] = cluster_id
                cluster_id += 1

    # Create the submission DataFrame; rows without a cluster get None here.
    submission = dataframe[['transaction_reference_id']].copy()
    submission['external_id'] = [cluster_map.get(label) for label in dataframe.index]

    # Remove singletons if required
    if remove_singletons:
        submission = submission.dropna(subset=['external_id'])

    # Convert external_id to integer (missing values may have made it float)
    submission['external_id'] = submission['external_id'].astype(int)

    return submission


# Turn the matched pairs into a cluster table, dropping unmatched rows.
# NOTE(review): `matches` holds row labels of external_parties, but they are
# looked up against account_booking's index here — confirm that both tables
# share the same row order/index before trusting the output.
submission_df = create_submission(
    dataframe=account_booking,
    matches=matches,
    remove_singletons=True,
)

# Write the result without the index column for the Kaggle upload
submission_df.to_csv(path_or_buf='submission.csv', index=False)
