In [None]:
import pandas as pd
from datasketch import MinHash, MinHashLSH
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar
import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Raise the interpreter's recursion limit for the rest of the notebook.
sys.setrecursionlimit(1000000)

# Read the raw party records.
external_parties = pd.read_csv('data/external_parties_train.csv')

# Fetch the NLTK data used below; force=True re-downloads on every run.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, force=True)

# English stopwords as a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def _strip_stopwords(text):
    """Tokenize ``text`` and re-join the tokens that are not in ``stop_words``."""
    kept = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)


# Normalise the free-text column: missing values become '', everything is
# cast to str and lower-cased before the stopwords are removed.
raw_text = external_parties['party_info_unstructured']
normalized_text = raw_text.fillna('').astype(str).str.lower()
external_parties['processed_text'] = normalized_text.apply(_strip_stopwords)

print(external_parties['processed_text'])



In [None]:
class LSHEntityResolution:
    """Find candidate duplicate rows of a DataFrame with MinHash signatures and LSH.

    Rows are identified by their position (0..n-1) in the ``processed_text``
    column, not by the DataFrame's index labels.
    """

    def __init__(self, dataframe):
        """
        Initialize with the DataFrame.
        :param dataframe: Pandas DataFrame containing entity data;
            ``perform_lsh`` reads its ``processed_text`` column.
        """
        self.dataframe = dataframe

    # Generate MinHash for a given text
    def generate_minhash(self, text, num_perm=128):
        """
        Create MinHash for a given text from its whitespace-separated tokens.
        :param text: Input string.
        :param num_perm: Number of permutations for MinHash.
        :return: MinHash object, or None when the text is empty, missing or
            contains no tokens.
        """
        if not text or pd.isna(text):
            return None
        tokens = text.split()
        if not tokens:
            # Whitespace-only text: a MinHash that is never updated has the
            # same signature for every such row, so LSH would report all
            # token-less rows as matches of each other. Treat it as "no text".
            return None
        mh = MinHash(num_perm=num_perm)
        for token in tokens:
            mh.update(token.encode("utf8"))
        return mh

    # Perform LSH on nodes
    def perform_lsh(self, threshold=0.8, num_perm=128):
        """
        Perform LSH to find similar entities.
        :param threshold: Similarity threshold for LSH.
        :param num_perm: Number of permutations for MinHash.
        :return: List of ``(i, j)`` tuples of row positions with ``i < j``,
            one per pair returned by querying the LSH index.
        """
        lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

        texts = self.dataframe['processed_text'].fillna('').astype(str)

        # One signature (or None) per row, in row order.
        signatures = [
            self.generate_minhash(text, num_perm)
            for text in tqdm(texts, desc="Generating MinHashes", total=len(texts))
        ]

        # Keep only rows that produced a signature, keyed by row position
        # rendered as a string (the key format used for the LSH index).
        node_hashes = {
            str(position): mh
            for position, mh in enumerate(signatures)
            if mh is not None
        }

        for node_id, mh in tqdm(node_hashes.items(), desc="Inserting into LSH", total=len(node_hashes)):
            lsh.insert(node_id, mh)

        # Query every row against the full index.
        matches = []
        for node_id, mh in tqdm(node_hashes.items(), desc="Finding matches", total=len(node_hashes)):
            left = int(node_id)
            for sim in lsh.query(mh):
                right = int(sim)
                # Keeping only left < right drops the self-match and reports
                # each unordered pair once ((A, B) but not (B, A)).
                if left < right:
                    matches.append((left, right))

        return matches


# Parameters for this run: LSH similarity threshold and MinHash permutation count.
LSH_THRESHOLD = 0.5
LSH_NUM_PERM = 254

# Build the resolver over the loaded parties and collect the matched row pairs.
lsh_er = LSHEntityResolution(external_parties)
matches = lsh_er.perform_lsh(threshold=LSH_THRESHOLD, num_perm=LSH_NUM_PERM)

In [58]:
class Graph:
    """Undirected graph over a fixed vertex collection, stored as adjacency lists."""

    def __init__(self, V):
        """
        :param V: Collection of hashable vertex ids. It is iterated again by
            ``connectedComponents``, so it must be re-iterable (list, array, ...).
        """
        self.V = V
        self.adj = {v: [] for v in V}

    def DFSUtil(self, temp, v, visited):
        """
        Depth-first traversal from ``v`` over vertices not yet visited.

        Uses an explicit stack instead of recursion, so the depth of a
        component is not bounded by the interpreter's recursion limit. The
        visiting order is the same pre-order a recursive DFS over the
        adjacency lists would produce.

        :param temp: List that receives the visited vertices, in visit order.
        :param v: Start vertex.
        :param visited: Dict mapping vertex -> bool; updated in place.
        :return: ``temp``, extended with every vertex reached from ``v``.
        """
        visited[v] = True
        temp.append(v)

        # Each stack entry is the partially consumed neighbour iterator of a
        # vertex on the current DFS path.
        stack = [iter(self.adj[v])]
        while stack:
            for neighbour in stack[-1]:
                if not visited[neighbour]:
                    visited[neighbour] = True
                    temp.append(neighbour)
                    # Descend; the parent's iterator resumes after this one is done.
                    stack.append(iter(self.adj[neighbour]))
                    break
            else:
                # Neighbours exhausted: backtrack to the parent vertex.
                stack.pop()
        return temp

    # method to add an undirected edge
    def addEdge(self, v, w):
        """Add an undirected edge between ``v`` and ``w`` (both must be in ``V``)."""
        self.adj[v].append(w)
        self.adj[w].append(v)

    # Method to retrieve connected components
    # in an undirected graph
    def connectedComponents(self):
        """
        :return: List of components; each component is the list of its
            vertices in DFS visit order, starting with the first vertex of
            ``V`` that belongs to it.
        """
        visited = {v: False for v in self.V}
        cc = []
        for v in self.V:
            if not visited[v]:
                cc.append(self.DFSUtil([], v, visited))
        return cc

In [None]:
from tqdm.notebook import tqdm

# One vertex per row of external_parties, identified by row position.
g = Graph(np.arange(len(external_parties)))

# Link every matched pair. tqdm wraps `matches` directly (no enumerate) so it
# can read the length and show a total on the progress bar.
for a, b in tqdm(matches):
    g.addEdge(a, b)

# Connected components of the match graph, each a list of row positions.
c = g.connectedComponents()

In [60]:
# Every row starts with its own id: 30000 + row position.
first_id = 30000
pred_id = first_id + np.arange(len(external_parties))

# All rows of a component take the id of the component's first member.
for members in c:
    representative_id = pred_id[members[0]]
    pred_id[members] = representative_id

In [None]:
# Inspect the predicted ids (one per row); as the cell's last expression the array is displayed.
pred_id

In [None]:
def evaluate_datasets(train, test):
    """Pairwise F1 between ground-truth and predicted entity ids.

    Rows of ``train`` and ``test`` are compared by position. Two rows form a
    pair under a labelling when they share the same ``external_id``. With
    TP = number of row pairs that are pairs under both labellings:

        recall    = TP / (pairs in train)
        precision = TP / (pairs in test)
        f1        = 2 * recall * precision / (recall + precision)

    Pairs are counted from group sizes (a group of k rows contributes
    k * (k - 1) / 2 pairs) instead of materialising an n x n matrix and
    looping over it, so memory and time no longer grow with n squared.
    Rows whose id is missing are left out by the grouping and therefore
    never form a pair.

    :param train: DataFrame with the ground-truth ``external_id`` column.
    :param test: DataFrame with the predicted ``external_id`` column; same
        number of rows as ``train``, in the same order.
    :return: F1 score as a float, or 0 when recall and precision are both 0.
    :raises ValueError: if ``train`` and ``test`` have different lengths.
    """
    truth_ids = train['external_id'].values
    pred_ids = test['external_id'].values
    if len(truth_ids) != len(pred_ids):
        raise ValueError(
            "train and test must have the same number of rows "
            "(got %d and %d)" % (len(truth_ids), len(pred_ids))
        )
    if len(truth_ids) == 0:
        return 0

    labels = pd.DataFrame({'truth': truth_ids, 'pred': pred_ids})

    def count_pairs(keys):
        """Number of unordered row pairs that agree on every column in ``keys``."""
        # sort=False: group order is irrelevant and ids need not be orderable.
        sizes = np.asarray(labels.groupby(keys, sort=False).size(), dtype=np.int64)
        return int((sizes * (sizes - 1) // 2).sum())

    truth_pairs = count_pairs('truth')             # TP + FN
    pred_pairs = count_pairs('pred')               # TP + FP
    true_positive = count_pairs(['truth', 'pred'])  # same truth id AND same predicted id

    recall = true_positive / truth_pairs if truth_pairs > 0 else 0
    precision = true_positive / pred_pairs if pred_pairs > 0 else 0

    f1 = 2 * recall * precision / (recall + precision) if (recall + precision) > 0 else 0
    return f1

# Columns of the prediction table: the transaction reference of each row next
# to the id assigned to it.
dict_ = {}
dict_['transaction_reference_id'] = external_parties['transaction_reference_id']
dict_['external_id'] = pred_id
print(dict_)

# Same columns as a DataFrame; evaluate_datasets reads its `external_id` column.
pred_dict = pd.DataFrame.from_dict(dict_)

# Score the predicted ids against the `external_id` column of external_parties.
score = evaluate_datasets(external_parties, pred_dict)
print(score)

In [63]:
# Write the prediction table to disk without the index column.
submission = pd.DataFrame.from_dict(dict_)
submission.to_csv('submission_4.csv', index=False)