# In [None]:
# using sklearn based on cosine similarity and tf-idf
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
import pandas as pd
import re
from datetime import datetime
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing
import gc

# Load the struck-off company list and the first 500,000 rows of the ACRA register.
df_struck_off = pd.read_csv('./output.csv')
df_acra = pd.read_csv('./acra-information-on-corporate-entities/csv/acra.csv', nrows=500000)

print('ANALYSES BEGINS! :)')
# The staging names refer to the same objects as the loaded dataframes until
# drop_duplicates() below rebinds them to de-duplicated copies.
df_struck_off_staging = df_struck_off
df_acra_staging = df_acra
print('Complete: Data loaded into DF', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('Number of rows in df_struck_off_staging', len(df_struck_off_staging))
print('Number of rows in df_acra_staging', len(df_acra_staging))

# Matches every run of characters that are neither word characters (letters,
# digits, underscore) nor whitespace; such runs are removed from the names.
pattern = re.compile(r'[^\w\s]+')


def _clean_name_column(names):
    """Return the name column *names* normalised for text matching.

    Missing values become empty strings, characters matched by ``pattern`` are
    removed, the text is upper-cased and leading/trailing whitespace is
    stripped.
    """
    return (
        names.fillna('')
        .apply(lambda name: pattern.sub('', name))
        .str.upper()
        .str.strip()
    )


# Log the start *before* any cleaning happens so that the Start/Complete
# timestamps bracket the whole massaging step.
print('Start: Data Massaging', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

df_struck_off_staging['company_name_3'] = _clean_name_column(df_struck_off_staging['company_name_3'])
df_acra_staging['entity_name'] = _clean_name_column(df_acra_staging['entity_name'])

# Drop rows that are exact duplicates across all columns.
df_struck_off_staging = df_struck_off_staging.drop_duplicates()
df_acra_staging = df_acra_staging.drop_duplicates()

print('Complete: Data Massaging', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))



# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from datetime import datetime
import scipy.sparse as sp
import joblib

print('Start: Vectorisation', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Fit a single TF-IDF vectorizer on the names of both dataframes stacked
# together, so that both matrices below are expressed with the same features.
combined_names = pd.concat([
    df_struck_off_staging['company_name_3'],
    df_acra_staging['entity_name'],
])
vectorizer = TfidfVectorizer()
vectorizer.fit(combined_names)

# Sparse TF-IDF matrices, one row per dataframe row.
struck_off_matrix = vectorizer.transform(df_struck_off_staging['company_name_3'])
acra_matrix = vectorizer.transform(df_acra_staging['entity_name'])

# Empty sparse (LIL) matrix with one row per struck-off name and one column
# per ACRA name.
similarity_shape = (struck_off_matrix.shape[0], acra_matrix.shape[0])
similarity_matrix = sp.lil_matrix(similarity_shape)

print('End: Vectorisation', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

def compute_cosine_similarity(struck_off_matrix, acra_matrix, n_jobs=-2):
    """Find the most similar ACRA entity name for every struck-off company name.

    Both matrices are row-normalised to unit L2 length, so the dot product of
    two rows is their cosine similarity. Each row of ``struck_off_matrix`` is
    scored against every row of ``acra_matrix`` in parallel with joblib and
    the best-scoring ACRA row is kept.

    The names reported in the result are read *by position* from the
    module-level dataframes ``df_struck_off_staging`` and ``df_acra_staging``,
    so row ``i`` of each matrix must correspond to row ``i`` of its dataframe.

    Args:
        struck_off_matrix (scipy.sparse matrix): Sparse matrix for the
            'company_name_3' column in df_struck_off_staging.
        acra_matrix (scipy.sparse matrix): Sparse matrix for the 'entity_name'
            column in df_acra_staging.
        n_jobs (int): Number of parallel joblib jobs. Defaults to -2, which
            joblib interprets as "all CPUs but one".

    Returns:
        pandas.DataFrame: One row per row of ``struck_off_matrix`` with the
        columns 'company_name_3', 'most_similar_entity_name' and
        'similarity_score'.

    Raises:
        ValueError: If there are struck-off rows to score but ``acra_matrix``
            has no rows to match them against.
    """
    result_columns = ['company_name_3', 'most_similar_entity_name', 'similarity_score']
    n_rows = struck_off_matrix.shape[0]

    if n_rows and acra_matrix.shape[0] == 0:
        raise ValueError('acra_matrix has no rows to match against')

    # L2 norm of every row of the two matrices.
    norm_1 = np.sqrt(np.asarray(struck_off_matrix.power(2).sum(axis=1)).flatten())
    norm_2 = np.sqrt(np.asarray(acra_matrix.power(2).sum(axis=1)).flatten())

    # All-zero rows (e.g. empty names) get a norm of 1 to avoid dividing by
    # zero; they remain all-zero and therefore score 0 against everything.
    norm_1[norm_1 == 0] = 1
    norm_2[norm_2 == 0] = 1

    # multiply() by a dense column vector may hand back a COO matrix, on which
    # extracting a single row is expensive; convert to CSR once so the per-row
    # slicing in the workers is cheap.
    normalized_matrix_1 = struck_off_matrix.multiply(1 / norm_1[:, np.newaxis]).tocsr()
    # Transpose (and convert) the ACRA matrix once instead of once per row.
    normalized_matrix_2_t = acra_matrix.multiply(1 / norm_2[:, np.newaxis]).tocsr().T.tocsr()

    # Pull the name columns out once: positional array lookups are much
    # cheaper than building a full row Series with ``df.iloc[i][col]`` for
    # every result, and the workers no longer need the whole dataframes.
    struck_off_names = df_struck_off_staging['company_name_3'].to_numpy()
    acra_names = df_acra_staging['entity_name'].to_numpy()

    def compute_dot_product(row_idx):
        """Score one struck-off row against all ACRA rows and keep the best."""
        similarity_scores = normalized_matrix_1[row_idx].dot(normalized_matrix_2_t).toarray().flatten()
        # argmax returns the first index on ties.
        max_idx = np.argmax(similarity_scores)
        return {
            'company_name_3': struck_off_names[row_idx],
            'most_similar_entity_name': acra_names[max_idx],
            'similarity_score': similarity_scores[max_idx],
        }

    results = Parallel(n_jobs=n_jobs, verbose=10, batch_size=1000)(
        delayed(compute_dot_product)(row_idx) for row_idx in range(n_rows)
    )

    # Passing the columns explicitly keeps the expected schema even when
    # there are no rows to report.
    return pd.DataFrame(results, columns=result_columns)

# Match every struck-off name against the ACRA names and keep the best hit per row.
result_df = compute_cosine_similarity(
    struck_off_matrix=struck_off_matrix,
    acra_matrix=acra_matrix,
)

# Write the matches to an Excel workbook in the current working directory.
# result_df.head()
result_df.to_excel(excel_writer='results.xlsx')