## LSH For Disney Names Matching

### Functions

In [1]:
import logging
import pandas as pd
import argparse
import timeit
from datasketch import MinHash, MinHashLSH
from nltk import ngrams
from datetime import date, timedelta
import numpy as np

In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
logger = logging.getLogger(__name__ + '.vaq')

In [3]:
def load_data():
    """
    Reads data from Excel file
    """
    df = pd.read_excel("disney_characters.xlsx")
    
    return df

In [4]:
def clean_data(df):
    """
    Cleans up string columns. Names should only contain letters.
    :param df: dataframe with Disney character names
    """

    logger = logging.getLogger(__name__ + '.clean_data')

    starttime = timeit.default_timer()

    df['local_name_clean'] = df.local_name.str.replace('[^a-zA-Z]', '', regex=True).str.lower()
   
    logger.info(f"Cleaning done after {round(timeit.default_timer() - starttime, 2)} s")

In [5]:
def minhash_data(df_column, threshold, num_perm, weights, nr_ngrams):
    """
    Performs hashing of columns using MinHash
    :param df_column: column to hash
    :param threshold: Jaccard distance threshold
    :param num_perm: number of permutations
    :param weights: false positive and false negative weights
    :param nr_ngrams: shingle size
    :return: lsh, minhash objects
    """

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm, weights=weights)

    # Create MinHash objects
    minhashes = {}

    for c, i in enumerate(df_column):
        minhash = MinHash(num_perm=num_perm)

        for d in ngrams(i, nr_ngrams):
            minhash.update("".join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash

    return lsh, minhashes

In [6]:
def query_lsh(df, minhashes, lsh):
    """
    Queries records in df and if it finds a match appends to a new df. Giving the MinHash of the query set, 
    retrieve the keys that references sets with Jaccard similarities greater than the threshold.
    :param df: dataframe with Disney characters
    :param minhashes: minhashes of the name column
    :param lsh: lsh of the name column
    :return: df_matches: df with pairs of matched records
    """

    df_matches = pd.DataFrame(columns=['id', 'local_name', 'english_name', 'match_id', 'match_local_name', 
                                       'match_english_name'])

    for i in range(len(df)):
        result = lsh.query(minhashes[i])
        result = [element for element in result if element is not i]

        if len(result) > 0:
            df_i = pd.DataFrame()
            df_i['id'] = df.id[result]
            df_i['local_name'] = df.local_name[result]
            df_i['english_name'] = df.english_name[result]
            df_i['match_id'] = df.id[i]
            df_i['match_local_name'] = df.local_name[i]
            df_i['match_english_name'] = df.english_name[i]

            df_matches = df_matches.append(df_i)

    return df_matches

In [7]:
def lsh_match_name(df, threshold, num_perm, weights, nr_ngrams):
    """
    Creates minhashes on Disney character names. Using LSH returns pairs of matched names. 
    :param df: dataframe with Disney characters
    :param threshold: Jaccard distance threshold
    :param num_perm: number of permuntations
    :param weights: false positive and false negative weights
    :param nr_ngrams: shingle size
    :return:
    """

    logger = logging.getLogger(__name__ + '.lsh_match_name')
    logger.info("Creating minhashes...")

    lsh, minhashes = minhash_data(df.local_name_clean, threshold=threshold, num_perm=num_perm, weights=weights, 
                                  nr_ngrams=nr_ngrams)

    logger.info("Querying minhashes...")
    df_name = query_lsh(df, minhashes, lsh)
    
    df_name = df_name[df_name.id != df_name.match_id]

    return df_name

### Read data

In [8]:
df = load_data()
df.head()

Unnamed: 0,id,language,english_name,local_name
0,1,Danish,"April, May, and June","Kylle, Pylle og Rylle"
1,2,Danish,Beagle Boys,Bjørnebanden
2,3,Danish,Big Bad Wolf,Store stygge ulv
3,4,Danish,Black Pete,Sorteper
4,5,Danish,Chip 'n Dale,Chip og Chap


In [36]:
df.english_name.value_counts().head()

Donald Duck       20
Beagle Boys       20
Gyro Gearloose    20
Mickey Mouse      20
Minnie Mouse      19
Name: english_name, dtype: int64

In [37]:
df.local_name.value_counts().head(5)

 Pluto           14
 Daisy            4
 Mickey Mouse     4
 Goofy            4
 Minnie Mouse     3
Name: local_name, dtype: int64

### Clean data

- delete non alpha characters
- convert to lower case

In [11]:
clean_data(df)
df.head()

2021-06-17 09:28:35,509 - INFO - __main__.clean_data - Cleaning done after 0.0 s


Unnamed: 0,id,language,english_name,local_name,local_name_clean
0,1,Danish,"April, May, and June","Kylle, Pylle og Rylle",kyllepylleogrylle
1,2,Danish,Beagle Boys,Bjørnebanden,bjrnebanden
2,3,Danish,Big Bad Wolf,Store stygge ulv,storestyggeulv
3,4,Danish,Black Pete,Sorteper,sorteper
4,5,Danish,Chip 'n Dale,Chip og Chap,chipogchap


### LSH

In [12]:
# parameters
threshold = 0.7
num_perm = 120
weights = (0.5, 0.5)
nr_ngrams = 2

In [13]:
# run matching
matched_df = lsh_match_name(df, threshold, num_perm, weights, nr_ngrams)
matched_df['is_match'] = np.where(matched_df.english_name == matched_df.match_english_name, True, False)

2021-06-17 09:28:36,703 - INFO - __main__.lsh_match_name - Creating minhashes...
2021-06-17 09:28:37,407 - INFO - __main__.lsh_match_name - Querying minhashes...


In [35]:
# sample of matched characters where they are actually identical
matched_df[matched_df.is_match].head()

Unnamed: 0,id,local_name,english_name,match_id,match_local_name,match_english_name,is_match
373,374,Andeby,Duckburg,12,Andeby,Duckburg,True
255,256,"Ripp, Rapp og Rupp","Huey, Dewey and Louie",19,"Rip, Rap og Rup","Huey, Dewey and Louie",True
384,385,Lille hjelper,Little Helper,21,Lille hjælper,Little Helper,True
387,388,Madam Mim,Mad Madam Mim,24,Madam Mim,Mad Madam Mim,True
269,270,Maddama Mimm,Mad Madam Mim,24,Madam Mim,Mad Madam Mim,True


In [33]:
# sample of matched characters where they are actually NOT identical - false positives
matched_df[~matched_df.is_match].head()

Unnamed: 0,id,local_name,english_name,match_id,match_local_name,match_english_name,is_match
298,299,Gufi,Goofy,17,Fætter Guf,Gus Goose,False
260,261,Guffi,Goofy,17,Fætter Guf,Gus Goose,False
582,583,Gustavo Ganso,Gus Goose,174,Gustav Gans,Gladstone Gander,False
331,332,Pennino,Dugan Duck,188,Minni Maus,Minnie Mouse,False
526,527,Magi,Magica De Spell,248,Maggi,Detective Casey,False


### Comparison to exact matches

In [32]:
# Compare LSH to exact matches
df_english_merge = df.merge(df, on='english_name')
df_english_merge = df_english_merge[df_english_merge.id_x != df_english_merge.id_y]

df_local_merge = df.merge(df, on='local_name')
df_local_merge = df_local_merge[df_local_merge.id_x != df_local_merge.id_y]

print(f"Number of matches using english name: {df_english_merge.shape[0]}")
print(f"Number of matches using cleaned local name: {df_local_merge.shape[0]}")
print(f"Number of matches using LSH: {matched_df.shape[0]}")

Number of matches using english name: 7336
Number of matches using cleaned local name: 270
Number of matches using LSH: 492
