In [2]:
import logging
import pandas as pd
import argparse
import timeit
from datasketch import MinHash, MinHashLSH
from nltk import ngrams
from datetime import date, timedelta
import numpy as np

In [20]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
logger = logging.getLogger(__name__ + '.vaq')

In [21]:
def load_data():
    """
    Reads data from Excel file
    """
    df = pd.read_excel("disney_characters.xlsx")
    
    return df

In [22]:
def clean_data(df):
    """
    Cleans up string columns. Names should only contain letters.
    :param df: dataframe with Disney character names
    """

    logger = logging.getLogger(__name__ + '.clean_data')
    logger.debug(f"Cleaning data...")

    starttime = timeit.default_timer()

    df['local_name_clean'] = df.local_name.str.replace('[^a-zA-Z]', '', regex=True).str.lower()
   
    logger.info(f"Cleaning done after {round(timeit.default_timer() - starttime, 2)} s")

In [23]:
def minhash_data(df_column, threshold, num_perm, weights, nr_ngrams):
    """
    Performs hashing of columns using MinHash
    :param df_column: column to hash
    :param threshold: Jaccard distance threshold,
    :param num_perm: number of permutations
    :param weights: ???
    :param nr_ngrams: shingle size
    :return: lsh, minhash objects
    """

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm, weights=weights)

    # Create MinHash objects
    minhashes = {}

    for c, i in enumerate(df_column):
        minhash = MinHash(num_perm=num_perm)

        for d in ngrams(i, nr_ngrams):
            minhash.update("".join(d).encode('utf-8'))
        lsh.insert(c, minhash)
        minhashes[c] = minhash

    return lsh, minhashes

In [24]:
def query_lsh(df, minhashes, lsh):
    """
    Queries records in df and if it finds a match appends to a new df
    :param df: dataframe with Disney characters
    :param minhashes: minhashes of the name column
    :param lsh_year: lsh of the name column
    :return: df: df with matched records
    """

    df_matches = pd.DataFrame(columns=['id', 'local_name', 'english_name', 'match_id', 'match_local_name', 
                                       'match_english_name'])

    for i in range(len(df)):
        result = lsh.query(minhashes[i])
        result = [element for element in result if element is not i]

        if len(result) > 0:
            df_i = pd.DataFrame()
            df_i['id'] = df.id[result]
            df_i['local_name'] = df.local_name[result]
            df_i['english_name'] = df.english_name[result]
            df_i['match_id'] = df.id[i]
            df_i['match_local_name'] = df.local_name[i]
            df_i['match_english_name'] = df.english_name[i]

            df_matches = df_matches.append(df_i)

    return df_matches

In [25]:
def lsh_match_name(df, threshold, num_perm, weights, nr_ngrams):
    """
    Creates minhashes on names for new and existing customers. Queries new customer minhashes from existing customers.
    Returns pairs of customer_ids.
    :param df: dataframe with Disney characters
    :param threshold: Jaccard distance threshold
    :param num_perm: number of permuntations
    :param weights:
    :param nr_ngrams: shingle size
    :return:
    """

    logger = logging.getLogger(__name__ + '.lsh_match_name')
    logger.info("Creating minhashes...")

    lsh, minhashes = minhash_data(df.local_name_clean, threshold=threshold, num_perm=num_perm, weights=weights, 
                                  nr_ngrams=nr_ngrams)

    logger.info("Querying minhashes...")
    df_name = query_lsh(df, minhashes, lsh)
    
    df_name = df_name[df_name.id != df_name.match_id]

    return df_name

In [26]:
df = load_data()
df.head()

Unnamed: 0,id,language,english_name,local_name
0,1,Danish,"April, May, and June","Kylle, Pylle og Rylle"
1,2,Danish,Beagle Boys,Bjørnebanden
2,3,Danish,Big Bad Wolf,Store stygge ulv
3,4,Danish,Black Pete,Sorteper
4,5,Danish,Chip 'n Dale,Chip og Chap


In [27]:
df.english_name.value_counts().head(10)

Beagle Boys              20
Mickey Mouse             20
Donald Duck              20
Gyro Gearloose           20
Minnie Mouse             19
Huey, Dewey and Louie    19
Gladstone Gander         19
Daisy Duck               18
Clarabelle Cow           18
Pluto                    18
Name: english_name, dtype: int64

In [28]:
df.local_name.value_counts().head(10)

 Pluto           14
 Goofy            4
 Mickey Mouse     4
 Daisy            4
 Clarabella       3
 Klarabella       3
 Minnie Mouse     3
 Pluton           2
 Madam Mim        2
 Rico McPato      2
Name: local_name, dtype: int64

In [29]:
clean_data(df)
df.head()

2021-06-15 18:59:11,010 - INFO - __main__.clean_data - Cleaning done after 0.0 s


Unnamed: 0,id,language,english_name,local_name,local_name_clean
0,1,Danish,"April, May, and June","Kylle, Pylle og Rylle",kyllepylleogrylle
1,2,Danish,Beagle Boys,Bjørnebanden,bjrnebanden
2,3,Danish,Big Bad Wolf,Store stygge ulv,storestyggeulv
3,4,Danish,Black Pete,Sorteper,sorteper
4,5,Danish,Chip 'n Dale,Chip og Chap,chipogchap


In [30]:
threshold = 0.7
num_perm = 1000
weights = (0.5, 0.5)
nr_ngrams = 2

In [31]:
matched_df = lsh_match_name(df, threshold, num_perm, weights, nr_ngrams)

2021-06-15 18:59:14,579 - INFO - __main__.lsh_match_name - Creating minhashes...
2021-06-15 18:59:19,272 - INFO - __main__.lsh_match_name - Querying minhashes...


In [32]:
matched_df['is_match'] = np.where(matched_df.english_name == matched_df.match_english_name, True, False)

In [33]:
matched_df.head(50)

Unnamed: 0,id,local_name,english_name,match_id,match_local_name,match_english_name,is_match
364,365,Storeulv; Den Store Stygge Ulven,Big Bad Wolf,3,Store stygge ulv,Big Bad Wolf,True
366,367,Politimester Fiks,Chief O'Hara,7,Politimester Striks,Chief O'Hara,True
373,374,Andeby,Duckburg,12,Andeby,Duckburg,True
255,256,"Ripp, Rapp og Rupp","Huey, Dewey and Louie",19,"Rip, Rap og Rup","Huey, Dewey and Louie",True
384,385,Lille hjelper,Little Helper,21,Lille hjælper,Little Helper,True
387,388,Madam Mim,Mad Madam Mim,24,Madam Mim,Mad Madam Mim,True
269,270,Maddama Mimm,Mad Madam Mim,24,Madam Mim,Mad Madam Mim,True
183,184,Madam Mim,Mad Madam Mim,24,Madam Mim,Mad Madam Mim,True
546,547,Mickey Mouse,Mickey Mouse,25,Mickey Mouse,Mickey Mouse,True
61,62,Mickey Mouse,Mickey Mouse,25,Mickey Mouse,Mickey Mouse,True


In [34]:
matched_df[~matched_df.is_match]

Unnamed: 0,id,local_name,english_name,match_id,match_local_name,match_english_name,is_match
560,561,Pepe,Goofy,127,Pepi,Scamp,False
249,250,Andrés Önd,Donald Duck,247,Andrésína Önd,Daisy Duck,False
526,527,Magi,Magica De Spell,248,Maggi,Detective Casey,False
246,247,Andrésína Önd,Daisy Duck,250,Andrés Önd,Donald Duck,False
469,470,Gilberto,Gilbert (Goofy's genius nephew),258,Gilbert,Gilbert,False
352,353,Paperinik,Paperinik(PK),331,Paperino,Donald Duck,False
330,331,Paperino,Donald Duck,353,Paperinik,Paperinik(PK),False
257,258,Gilbert,Gilbert,470,Gilberto,Gilbert (Goofy's genius nephew),False
519,520,Paja Patak,Donald Duck,519,Pata Patak,Daisy Duck,False
530,531,Baja Patak,Scrooge McDuck,520,Paja Patak,Donald Duck,False
