In [1]:
import pandas as pd
import numpy as np
import re
# from geopy.geocoders import Nominatim
 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

In [None]:
# Define function to run tf-idf and cosine similarity algorithm
def string_grouper(df, col_name, thres, bool_zip, ntop=5):
    """Find exact and near-duplicate strings in one column of a DataFrame.

    Each value of ``df[col_name]`` is split into character n-grams, the
    n-grams are TF-IDF vectorized, and the matrix is multiplied by its own
    transpose with ``sparse_dot_topn`` so that only the best ``ntop``
    similarities per row are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Its index does not need to be a default RangeIndex;
        values are looked up by position.
    col_name : str
        Name of the column holding the strings to compare. Values must be
        ``str`` (they are passed to ``re.sub``); missing values are not
        handled here.
    thres : float
        Lower bound on the similarity, forwarded to ``sparse_dot_topn``.
    bool_zip : int or bool
        When equal to 1, every space is stripped from a value before the
        n-grams are built (e.g. for zip/postal codes). Otherwise runs of
        spaces are collapsed to a single space.
    ntop : int, optional
        Maximum number of matches kept per row (default 5, the value that
        used to be hard-coded).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(matches_df_same, matches_df_close)``; both have the columns
        ``'Values for Matching'``, ``'Matched Values'`` and ``'Similarity'``
        and have duplicate rows removed. The first holds pairs whose
        similarity is at least ``0.99999999999999``, the second holds all
        remaining pairs. Pairs of a row with itself are not filtered out.

    Raises
    ------
    AssertionError
        If the similarity matrix does not have one row per row of ``df``.
    """
    # Similarities at or above this value are treated as exact matches; the
    # margin below 1.0 absorbs floating-point rounding.
    exact_match_threshold = 0.99999999999999

    # Built once here instead of on every ngrams() call.
    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'", "#"]
    remove_rx = re.compile('[' + re.escape(''.join(chars_to_remove)) + ']')

    def ngrams(string, n=3):
        """Normalize ``string`` and return its list of character n-grams.

        Special cases that are not covered by the upstream data preparation
        (e.g. not mixing STREET with SAINT) belong at the top of this
        function, before the character clean-up.
        """
        # Brackets, dots, quotes, etc. become spaces ...
        string = remove_rx.sub(' ', string)
        # ... while these separators are dropped outright.
        for char in ('&', ',', '-'):
            string = string.replace(char, '')

        if bool_zip == 1:
            # Zip-like values: remove every space.
            string = re.sub(' +', '', string)
        else:
            # Everything else: collapse runs of spaces into a single space.
            string = re.sub(' +', ' ', string)
        string = string.strip()

        # Pad so that the first and last characters get their own n-grams.
        string = ' ' + string + ' '
        grams = zip(*[string[i:] for i in range(n)])
        return [''.join(gram) for gram in grams]

    def awesome_cossim_top(A, B, ntop, lower_bound=0):
        """Return ``A * B`` as a CSR matrix keeping at most ``ntop`` entries per row.

        The heavy lifting is done by ``ct.sparse_dot_topn``, which fills the
        pre-allocated ``indptr`` / ``indices`` / ``data`` buffers in place.
        """
        # Force A and B to CSR; this is free if they already are CSR.
        A = A.tocsr()
        B = B.tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        # Upper bound on stored entries: ntop per row of A.
        nnz_max = M * ntop

        indptr = np.zeros(M + 1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        return csr_matrix((data, indices, indptr), shape=(M, N))

    def get_matches_df(sparse_matrix, name_vector, top=100):
        """Unpack the non-zero entries of ``sparse_matrix`` into a DataFrame.

        ``name_vector`` is indexed by position, so a pandas Series with any
        index works. ``top`` limits the number of rows returned; a falsy
        value (``None`` or 0) returns every non-zero entry.
        """
        # Take rows, columns and values from the same COO view and mask them
        # together so the three arrays can never get out of step.
        coo = sparse_matrix.tocoo()
        non_zero = coo.data != 0
        rows = coo.row[non_zero]
        cols = coo.col[non_zero]
        similarity = coo.data[non_zero]

        if top:
            # Slicing clamps to the available number of matches.
            rows, cols, similarity = rows[:top], cols[:top], similarity[:top]

        # Positional lookup: matrix row i corresponds to the i-th input value,
        # regardless of the labels in the Series index.
        names = np.asarray(name_vector, dtype=object)

        return pd.DataFrame({'Values for Matching': names[rows],
                             'Matched Values': names[cols],
                             'Similarity': similarity})

    # Get the tf_idf_matrix
    vals = df[col_name]
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(vals)

    # Get the matches using cosine similarity
    matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), ntop, thres)

    # The resulting sparse matrix should have the same number of rows as the input data
    assert matches.shape[0] == df.shape[0], \
        "similarity matrix has a different number of rows than the input data"

    # Unpack every non-zero entry of the sparse matrix into a row of matches
    matches_df = get_matches_df(matches, vals, top=None)

    # Split into exact matches and close (non-exact) matches; the two masks
    # are complements, so every row lands in exactly one of the frames.
    is_exact = matches_df['Similarity'] >= exact_match_threshold
    matches_df_same = matches_df[is_exact]
    matches_df_close = matches_df[~is_exact]

    # Drop duplicate rows
    matches_df_same = matches_df_same.drop_duplicates(keep='first')
    matches_df_close = matches_df_close.drop_duplicates(keep='first')

    # Return results
    return matches_df_same, matches_df_close
