In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
import pandas as pd
import re
from datetime import datetime
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


print('Just checking how many rows are there in the source excelfile...')
raw_df_vendor_customer = pd.read_excel('./Vendor Customer.xlsx')
raw_df_interested_party = pd.read_excel('./Interested Parties.xlsx')
print('Number of rows in raw_df_vendor_customer', len(raw_df_vendor_customer))
print('Number of rows in raw_df_interested_party', len(raw_df_interested_party))


print('ANALYSES BEGINS! :)')
# loading and reading into dataframe
df_vendor_customer = pd.read_excel('./Vendor Customer.xlsx')
df_interested_party = pd.read_excel('./Interested Parties.xlsx')
print('Complete: Data loaded into DF', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('Number of rows in df_vendor_customer', len(df_vendor_customer))
print('Number of rows in df_interested_party', len(df_interested_party))


print('Start: Data Massaging', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# set the index as a column, to be used as a mapping field to join df_vendor_customer
df_interested_party = df_interested_party.reset_index().rename(columns={'index': 'index_ID'})

# to replace NULL/NaN values with empty strings
df_vendor_customer['Name'] = df_vendor_customer['Name'].fillna('')
df_interested_party['Interested Party List'] =df_interested_party['Interested Party List'].fillna('')


# define a regular expression that matches all non-alphanumeric and non-space characters and remove them
pattern = re.compile(r'[^\w\s]+')

df_vendor_customer['Name_Cleaned'] = df_vendor_customer['Name'].apply(lambda x: re.sub(pattern, '', x))
df_interested_party['Interested Party List_Cleaned'] = df_interested_party['Interested Party List'].apply(lambda x: re.sub(pattern, '', x))


# update strings to all uppercase()
df_vendor_customer['Name_Cleaned'] = df_vendor_customer['Name_Cleaned'].str.upper()
df_interested_party['Interested Party List_Cleaned'] = df_interested_party['Interested Party List_Cleaned'].str.upper()


# define the list of common words to remove, to remove noise (similar to stopwords concept)
# create a regular expression pattern that includes word boundaries (\b) before and after each word in the list of words to remove. This ensures that the str.replace method only removes the word when it appears as a standalone word, and not as a substring of other words.
words_to_remove = ['PTE', 'LTD', 'LLC', 'CO', 'SDN', 'BHD', 'PTY LIMITED', 'PTY', 'LIMITED', 'PVT', 'PRIVATE', 'INC', 'LLP', 'COMPANY']
pattern = r'\b(' + '|'.join(words_to_remove) + r')\b'


# for word in words_to_remove:
#     df_vendor_customer['Name_Cleaned'] = df_vendor_customer['Name_Cleaned'].str.replace(pattern, '', regex=True)
#     df_interested_party['Interested Party List_Cleaned'] = df_interested_party['Interested Party List_Cleaned'].str.replace(pattern, '', regex=True)


# update strings to remove leading and trailing whitespaces
df_vendor_customer['Name_Cleaned'] = df_vendor_customer['Name_Cleaned'].str.strip()
df_interested_party['Interested Party List_Cleaned'] = df_interested_party['Interested Party List_Cleaned'].str.strip()

# to drop duplicated rows
df_vendor_customer = df_vendor_customer.drop_duplicates()
df_interested_party = df_interested_party.drop_duplicates()

print('Complete: Data Massaging', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('Number of rows in df_vendor_customer after data massaging', len(df_vendor_customer))
print('Number of rows in df_interested_party after data massaging', len(df_interested_party))


Just checking how many rows are there in the source excelfile...
Number of rows in raw_df_vendor_customer 186197
Number of rows in raw_df_interested_party 3587
ANALYSES BEGINS! :)
Complete: Data loaded into DF 2023-04-17 11:55:50
Number of rows in df_vendor_customer 186197
Number of rows in df_interested_party 3587
Start: Data Massaging 2023-04-17 11:55:50
Complete: Data Massaging 2023-04-17 11:55:51
Number of rows in df_vendor_customer after data massaging 176158
Number of rows in df_interested_party after data massaging 3587


In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from datetime import datetime
import scipy.sparse as sp
import joblib

print('Start: Vectorisation', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Define the TfidfVectorizer and fit it on the combined text column of both dataframes
vectorizer = TfidfVectorizer() # text extraction technique for text
vectorizer.fit(pd.concat([df_vendor_customer['Name_Cleaned'], df_interested_party['Interested Party List_Cleaned']]))

# Create a sparse matrix representation of the text column for each dataframe, TfidfVectorizer uses sparse matrix by default. To convert to dense matrix, use .toarray(). 
vendor_customer_matrix = vectorizer.transform(df_vendor_customer['Name_Cleaned'])
interested_party_matrix = vectorizer.transform(df_interested_party['Interested Party List_Cleaned'])
print('Vendor-Customer matrix shape:', vendor_customer_matrix.shape)
print('Interested Party matrix shape:', interested_party_matrix.shape)


""" Initialize the similarity matrix as a sparse matrix. 
# The .shape attribute is a property of a NumPy array or a sparse matrix in Python. It returns a tuple containing the dimensions of the array or matrix, in the format (number of rows, number of columns).
# .shape[0] represents to get the number of rows in both vendor_customer_matrix and interested_party_matrix. Please note the matrix is based on a single column, and therefore .shape[0] would make sense
# similarity matrix to have the same number of rows as the vendor_customer_matrix and the same number of columns as the interested_party_matrix.
# The lil_matrix() function is used to create a sparse matrix in the LIL (List of Lists) format. 
# The LIL format is a way of storing sparse matrices in memory that makes it easy to add new elements to the matrix one at a time. It's a good choice if you want to build up a sparse matrix incrementally.
"""
similarity_matrix = sp.lil_matrix((vendor_customer_matrix.shape[0], interested_party_matrix.shape[0]))
print('Similarity Matrix Shape:', similarity_matrix.shape)

print('End: Vectorisation', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))



def compute_cosine_similarity(vendor_customer_matrix, interested_party_matrix, n_jobs, batch_size):
    
    # Compute the cosine similarities between the rows of the two matrices
    # This line calculates the cosine similarity between each row of the vendor-customer matrix and each row of the interested party matrix, resulting in a matrix of similarity scores.
    similarity_scores = cosine_similarity(vendor_customer_matrix, interested_party_matrix)
        
    # Find the most similar interested party and its similarity score for each row of the vendor_customer_matrix
    def find_most_similar(row_idx):
        
        # This line finds the index of the row in the interested party matrix that has the highest cosine similarity score with the current row of the vendor-customer matrix.
        max_idx = np.argmax(similarity_scores[row_idx]) 
        
        # This line retrieves the highest similarity score for the current row of the vendor-customer matrix.
        max_similarity_score = similarity_scores[row_idx, max_idx] 
        
        # This line retrieves the "Interested Party List_Cleaned" column value from the row in the interested party dataframe that corresponds to the index max_idx found above.
        most_similar_Interested_Party_List_Cleaned = df_interested_party.iloc[max_idx]['Interested Party List_Cleaned'] 
        corresponding_interested_party_source = df_interested_party.iloc[max_idx]['Interested Party Source'] 
        
        # to return all relevant columns to put into a dataframe
        return pd.Series({
            'Code': df_vendor_customer.iloc[row_idx]['Code'],
            'Name_Cleaned': df_vendor_customer.iloc[row_idx]['Name_Cleaned'],
            'most_similar_Interested_Party_List_Cleaned': most_similar_Interested_Party_List_Cleaned,
            'Corresponding Interested Party Source': corresponding_interested_party_source,
            'similarity_score': max_similarity_score
        })

    """# Use parallel processing to speed up computation
    # Creates a sequence of delayed function calls, each of which corresponds to a row of vendor_customer_matrix.
    # For each row index row_idx in the range range(vendor_customer_matrix.shape[0]), it creates a delayed object, which is a callable that will execute the find_most_similar function with the argument row_idx when called. 
    # The delayed function is a utility function provided by the joblib library that creates a wrapper function around a given function, making it suitable for parallel execution.
    """
    results = Parallel(n_jobs=n_jobs, verbose=10, batch_size=batch_size)(delayed(find_most_similar)(row_idx) for row_idx in range(vendor_customer_matrix.shape[0]))

    return pd.DataFrame(results)



# Define the batch size and number of jobs (ie CPU Cores) for parallel processing
n_jobs = -2
batch_size = 1000
print('Number of CPU cores available:', joblib.cpu_count(), '\nNumber of CPU cores to use:', n_jobs, '\nBatch Size:', batch_size)



print('Start: Parallel Processing', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# Call the function and store the resulting dataframe in a variable
result_df = compute_cosine_similarity(vendor_customer_matrix, interested_party_matrix, n_jobs, batch_size)
print('End: Parallel Processing', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


print('Start: Writing to Excel', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
result_df.head()
result_df.to_excel('results_sparse.xlsx')
print('End: Writing to Excel', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


# shape size: (176158, 70792)
# runtime: 3min
# batchsize: 1000, CPU cores: -2



Start: Vectorisation 2023-04-17 11:41:23
Vendor-Customer matrix shape: (176158, 70792)
Interested Party matrix shape: (3587, 70792)
Similarity Matrix Shape: (176158, 3587)
End: Vectorisation 2023-04-17 11:41:25
Number of CPU cores available: 4 
Number of CPU cores to use: -2 
Batch Size: 1000
Start: Parallel Processing 2023-04-17 11:41:25


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-2)]: Done 2006 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-2)]: Done 7006 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-2)]: Done 12006 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-2)]: Done 19006 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-2)]: Done 26006 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-2)]: Done 35006 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-2)]: Done 44006 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-2)]: Done 55006 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done 66006 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-2)]: Done 79006 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 92006 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-2)]: Done 107006 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-2)]: Done 122006 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-2)]: Done 13

End: Parallel Processing 2023-04-17 11:44:49
Start: Writing to Excel 2023-04-17 11:44:49
End: Writing to Excel 2023-04-17 11:45:13
