In [1]:
import pandas as pd
import numpy as np
import pickle as pk
from collections import OrderedDict
import math
from scipy import spatial


In [2]:
%run "CoreBGgeneration.ipynb"
%run "prepare_4_EM.ipynb"

The following cells define all the functions you will use. Each one has a docstring describing what it does.

In [3]:
def city_wide_dedup(cityparam1,cityparam2,city_name,supplier_name):
    """
    Match the properties of one city from table 1 against the properties of the same city in table 2.

    Args:
        cityparam1 (DataFrame): The subset of properties from table 1 of a particular city
        cityparam2 (DataFrame): The subset of properties from table 2 of a particular city
        city_name (str): Name of the city we are subsetting; used as prefix of the distribution cache file names
        supplier_name (List): Names of the two suppliers, in the order [table 1, table 2]
    Returns:
        DataFrame: A copy of cityparam1 with predicted matches appended in a new column called "final_mapped"

    """
    # Callers in this notebook pass a boolean-mask slice of a larger frame. Adding a
    # column to such a slice raises SettingWithCopyWarning and may or may not touch
    # the parent frame, so the new column is written to an explicit copy instead.
    matched = cityparam1.copy()

    pname1,paddr1 = city_wide_distr(matched,0.8,city_name+supplier_name[0])
    pname2,paddr2 = city_wide_distr(cityparam2,0.8,city_name+supplier_name[1])

    # Shared vocabularies (word -> [prob table 1, prob table 2]) used to build the word vectors
    vector_dict_Pcorename = makecumulativedict(pname1,pname2)
    vector_dict_Pcoreaddr = makecumulativedict(paddr1,paddr2)

    matched["final_mapped"] = matched.apply(matchrecord,
                                            args=(pname1,paddr1,
                                                  cityparam2,
                                                  pname2,paddr2,
                                                  vector_dict_Pcorename,
                                                  vector_dict_Pcoreaddr),
                                            axis=1)
    return matched
def city_wide_distr(params,alpha,filename,save=True):
    """
    Creates required probability distribution of names and addresses

    The distributions are cached in the pickle file ``filename + "distributions.pk"``.
    When that file can be loaded its content is returned as is, which means ``alpha``
    is ignored on a cache hit (the cached values keep whatever alpha they were built with).

    Args:
        params (DataFrame): The dataframe of cleaned and tokenized properties with names and addresses
        alpha (float): The weight of core words. If alpha is 1, all words are regarded as core words and if alpha is 0 all words are regarded as background
        filename (str): Prefix of the pickle file the distributions are cached in
        save (bool): If save is True, newly computed distributions are written to the pickle file.

    Returns:
        pair: (core probability of words in hotel names, core probability of words in hotel addresses)
    """
    names = params.name
    addr = params.address
    cache_path = filename+"distributions.pk"

    try:
        # NOTE: pickle can execute arbitrary code while loading; only point this at trusted cache files.
        with open(cache_path,'rb') as cache_file:
            (pcore_name,pcore_addr) = pk.load(cache_file)
    except Exception:
        # Any failure to read the cache (missing, truncated, incompatible) falls back to
        # recomputing. Unlike a bare except this does not swallow KeyboardInterrupt.

        #Bring out the Core and Background distributions from the EM algorithm
        pC_distr_name,pB_distr_name = core_algo1(names)
        pC_distr_addr, pB_distr_addr = core_algo1(addr)

        #Generate probability of being a core
        pcore_name = coreprob(pC_distr_name,pB_distr_name,alpha)
        pcore_addr = coreprob(pC_distr_addr,pB_distr_addr,alpha)

        if save:
            with open(cache_path,"wb") as cache_file:
                pk.dump((pcore_name,pcore_addr),cache_file)
            print("File: ", cache_path+" created")

    return pcore_name,pcore_addr

In [4]:
def makecumulativedict(p_corename1,p_corename2):
    """
    Merge the word:prob dictionaries of both tables into one ordered dictionary of word:[probs].

    Words of table 1 come first, in sorted order; words that only occur in table 2 follow in
    the iteration order of p_corename2. A word present in both tables maps to [prob1, prob2],
    a word present in only one table maps to a one-element list. The key order defines the
    vector positions used by gen_probvec and genreg_vec.

    Args:
        p_corename1 (dict): Dictionary of word:probability from table 1
        p_corename2 (dict): Dictionary of word:probability from table 2

    Returns:
        OrderedDict: A combined dictionary of word:list(probs) of both tables
    """
    merged = OrderedDict()
    for word in sorted(p_corename1):
        merged[word] = []

    for table_probs in (p_corename1, p_corename2):
        merged = add_vals_to_vectdic(merged, table_probs)

    return merged

def add_vals_to_vectdic(odict,addict):
    """
    Append the probability of every word in addict to that word's list in odict.

    Words that are not yet a key of odict are added at the end with a fresh one-element list.
    odict is modified in place and also returned.

    Args:
        odict (dict): An ordered dictionary with words as keys and lists of probabilities as values
        addict (dict): word:probability pairs to append to the ordered dictionary

    Returns:
        dict: The same ordered dictionary, with the probabilities from addict appended

    """
    for word, prob in addict.items():
        odict.setdefault(word, []).append(prob)

    return odict

def gen_probvec(name,pCdict,i):
    """
    Generates a vector with one position per word of pCdict (in key order). For each word of the
    hotel name/address the position of that word holds its core probability; all other positions are 0.
    (So if our total table was- "Hotel A" and "B" the vector would be of dim 3 and for "Hotel A" the vector would be [p(hotel),p(A),0] )

    Args:
        name (list): A list of tokenized words
        pCdict (dict): Ordered dictionary word:[probabilities] as built by makecumulativedict
        i (int): Position inside the probability list to read, i.e. 0 for table 1 and 1 for table 2.
            Ignored for words whose list has a single entry (word seen in only one table).

    Return:
        numpy.ndarray: A vector of len(pCdict) floats holding the probabilities.

    Raises:
        ValueError: If a word of name is not a key of pCdict.
    """
    vec = np.zeros(len(pCdict))
    # Build the word -> position lookup once instead of re-listing all keys for every word.
    position = {word: idx for idx, word in enumerate(pCdict)}
    for word in name:
        if word not in position:
            raise ValueError("%r is not a key of pCdict" % (word,))
        probs = pCdict[word]
        if len(probs)==1:
            vec[position[word]] = probs[0]
        else:
            vec[position[word]] = probs[i]

    return vec
    
def genreg_vec(name,pCdict):
    """
    Generates a vector with one position per word of pCdict (in key order).
    For each word in a particular name/address, the vector has a 1 and for the rest 0.
    So if our total table was - "hotel A","b" then the vector would be 3 dimensional and for "hotel A" this function would return [1,1,0]

    Args:
        name (list): A list of tokenized words
        pCdict (dict): An ordered dictionary whose keys are the words; only the keys and their order are used
    Returns:
        numpy.ndarray: A binary word-presence vector of len(pCdict) floats

    Raises:
        ValueError: If a word of name is not a key of pCdict.
    """
    vec = np.zeros(len(pCdict))
    # This runs for every candidate pair, so build the word -> position lookup once
    # per call instead of re-listing all keys for every word.
    position = {word: idx for idx, word in enumerate(pCdict)}
    for word in name:
        if word not in position:
            raise ValueError("%r is not a key of pCdict" % (word,))
        vec[position[word]] = 1.0

    return vec
def check_result(inp1,inp2,idcolumn):
    """
    A function used to check accuracy, and extract values that were predicted wrong.

    inp1 is joined to inp2 on inp1[idcolumn] == inp2["mapped_to_ext_id"]. Because both frames
    carry "name", "address" and "mapped_to_ext_id" columns, the merged frame has them with the
    pandas suffixes "_x" (from inp1) and "_y" (from inp2). A prediction counts as correct when
    idcolumn equals "mapped_to_ext_id_x", i.e. the human-matched id stored on the inp1 row.

    Args:
        inp1 (DataFrame): Matched dataframe according to model
        inp2 (DataFrame): The original dataframe to which matching occured with human matched values
        idcolumn (str): The column name where the algorithm outputted the matches
    Returns:
        triplet: Three DataFrames that include the merged dataframe after prediction, the ones that were wrong and the third dataframe with the wrong outputs merged with table 2.

    """
    result = pd.merge(inp1,inp2,left_on=[idcolumn],right_on=["mapped_to_ext_id"])
    falses = result[result[idcolumn]!=result["mapped_to_ext_id_x"]]

    num_total = len(result)
    num_correct = len(result[result[idcolumn]==result["mapped_to_ext_id_x"]])
    # No prediction joined to inp2 (e.g. everything was "none"): accuracy is undefined, not an error.
    accuracy = 1.0*num_correct/num_total if num_total else float("nan")

    print("Acc: ", accuracy)
    print("num falses: ", len(falses))
    print("num total: ", num_total)

    # Attach the table 2 record that should have been chosen to every wrong prediction.
    wrongones = pd.merge(falses[["name_x","address_x","mapped_to_ext_id_x",idcolumn]],
                         inp2[["name","address","mapped_to_ext_id"]],
                         left_on=["mapped_to_ext_id_x"],right_on=["mapped_to_ext_id"])
    return result,falses,wrongones

def vect_matchmaker(row, vect_name,vect_addr, data2):
    """
    Score one row of dataframe1 against every row of dataframe2 with get_cossim.
    Meant to be used through DataFrame.apply on dataframe1.

    Args:
        row (Series): A row of a dataframe1
        vect_name (dict): An ordered dictionary of words of names from both DataFrames
        vect_addr (dict):  An ordered dictionary of words of addresses from both DataFrames
        data2 (DataFrame): The whole second dataframe to match with
    Returns:
        Series: The cosine similarity of row with every single entry in data2, indexed like data2

    """
    # get_cossim receives the data2 row first, followed by these extra arguments.
    extra_args = (row, vect_name, vect_addr)
    return data2.apply(get_cossim, args=extra_args, axis=1)

def get_cossim(row2,row1,vect_name,vect_addr):
    """
    Cosine similarity between two property records.

    Each record is turned into one vector by concatenating the word-presence vector of its
    name (over vect_name) with the word-presence vector of its address (over vect_addr).
    Meant to be used through DataFrame.apply on dataframe2.

    Args:
        row2 (Series): A row of a dataframe2
        row1 (Series): A row of a dataframe1
        vect_name (dict): The cumulative vector dictionary made from makecumulativedict for names
        vect_addr (dict): The cumulative vector dictionary made from makecumulativedict for addresses

    Returns:
        float: The similarity measure; 0.0 when the cosine distance comes out as NaN

    """
    def _presence_vector(record):
        # name part first, address part second
        return np.concatenate([genreg_vec(record.loc["name"],vect_name),
                               genreg_vec(record.loc["address"],vect_addr)])

    vec_table2 = _presence_vector(row2)
    vec_table1 = _presence_vector(row1)

    similarity = 1-spatial.distance.cosine(vec_table1,vec_table2)
    # A NaN result is treated as "no similarity".
    return 0.0 if math.isnan(similarity) else similarity

def haverdist(row2,row1):
    """
    Geographic closeness of two records, based on the haversine formula.
    Used as an apply function.

    The haversine formula yields the central angle c between the two points in radians
    (0 for identical points, pi for antipodal points). The function returns 1 - c, so the
    result lies in [1 - pi, 1]: 1 means the points coincide, and the value turns negative
    once the points are more than one radian apart. It is NOT confined to 0..1.

    Args:
        row2 (Series): A row of a dataframe2 (needs .latitude and .longitude, in degrees)
        row1 (Series): A row of a dataframe1 (needs .latitude and .longitude, in degrees)

    Returns:
        float: 1 - central angle in radians, or 0.0 when any of the four coordinates is missing.

    """
    coordinates = (row1.latitude, row2.latitude, row1.longitude, row2.longitude)
    if any(pd.isnull(value) for value in coordinates):
        return 0.0

    lat1 = math.radians(row1.latitude)
    lon1 = math.radians(row1.longitude)
    lat2 = math.radians(row2.latitude)
    lon2 = math.radians(row2.longitude)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    # Mathematically a is within [0, 1], but rounding can push it marginally outside for
    # (near-)antipodal points, which would make sqrt(1 - a) raise "math domain error".
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return (1-c)




def distprobs(row,data2):
    """
    Score one row of dataframe1 against every row of dataframe2 with haverdist.

    Args:
        row (Series): A row of a dataframe1
        data2 (DataFrame): The dataframe to match dataframe1 with.

    Returns:
        Series: The haverdist closeness of row with every entry in data2, indexed like data2
    """
    # haverdist receives the data2 row first and `row` as its second argument.
    return data2.apply(haverdist, axis=1, args=(row,))



def matching(row2,row1,pCname_self,pCaddr_self, pCsent_name, pCsent_addr):
    """
    Combine the dedup scores of the names and of the addresses of two records into one match score.
    Meant to be used through DataFrame.apply on dataframe2.

    Args:
        row2 (Series): A row of a dataframe2
        row1 (Series): A row of a dataframe1
        pCname_self: The word core probabilities for hotel names in dataframe2
        pCaddr_self: The word core probabilities for hotel addresses in dataframe2
        pCsent_name: The word core probabilities for hotel names in dataframe1
        pCsent_addr: The word core probabilities for hotel addresses in dataframe1

    Returns:
        float: name_score * (addr_score + 0.2 * name_score), where both scores come from dedup.
    """
    name_score = dedup(row1.loc["name"],row2.loc["name"],pCsent_name,pCname_self)
    addr_score = dedup(row1.loc["address"],row2.loc["address"],pCsent_addr,pCaddr_self)

    # The 0.2*name_score term keeps a pair with a matching name in play even when the address score is 0.
    boosted_addr_score = addr_score+0.2*name_score
    return name_score*boosted_addr_score
    

def matchrecord(row,pCname_self,pCaddr_self,data2,pCname2,pCaddr2,vect_name,vect_addr):
    """
    Pick the best match in dataframe2 for one row of dataframe1 by combining the individual models.

    Three scores are computed against every row of data2: the EM/dedup score (matching), the
    cosine similarity of the word vectors (vect_matchmaker) and the geographic closeness
    (distprobs). The winner is the row maximising cosine + geographic score. The EM score is
    only used, together with the cosine score, to decide whether there is any match at all.
    .. note:: This needs to improve in terms of how much weight/how much should we listen to which model. This is a common ensemble method problem, and should be solved with the training set provided.

    Args:
        row (Series): A row of a dataframe1
        pCname_self (dict): The word core probabilities for hotel names in dataframe1
        pCaddr_self (dict): The word core probabilities for hotel addresses in dataframe1
        data2 (DataFrame): The dataframe to match with
        pCname2 (dict): The word core probabilities for hotel names in dataframe2
        pCaddr2 (dict): The word core probabilities for hotel addresses in dataframe2
        vect_name (dict): An ordered dictionary of words of names from both DataFrames
        vect_addr (dict): An ordered dictionary of words of addresses from both DataFrames

    Returns:
        str: The "mapped_to_ext_id" of the best scoring data2 row, or "none" if none found

    """
    # matching() takes dataframe2's distributions first, then dataframe1's.
    em_scores = data2.apply(matching,
                            args=(row,pCname2,pCaddr2,pCname_self,pCaddr_self),
                            axis=1)
    cosine_scores = vect_matchmaker(row,vect_name,vect_addr,data2)
    geo_scores = distprobs(row,data2)

    # Earlier weighting, kept for reference: 0.14*em_scores + 0.86*cosine_scores + geo_scores
    combined_scores = cosine_scores + geo_scores

    # No candidate shares any textual evidence with this row: report no match.
    best_text_score = (em_scores + cosine_scores).max()
    if best_text_score<=1e-04:
        return "none"

    best_index = combined_scores.idxmax()
    return data2.loc[best_index,"mapped_to_ext_id"]


In [5]:
def create_distribution(prop1,prop2,supplier_name1,supplier_name2,city_name):
    """
    This function is the heart. Running this runs the whole deduplication between supplier_name1 and supplier_name2.

    Steps: merge multi-column names/addresses into single "name"/"address" columns (prop1 and
    prop2 are modified in place), load the tokenized features from
    "emfeatures_<supplier>_<city_name>.pk" or build them with prepareem when that file cannot
    be loaded, then run city_wide_dedup for every city present in both feature tables.

    Args:
        prop1 (DataFrame): DataFrame object for property 1 we are using to compare
        prop2 (DataFrame): DataFrame object for property 2 we are using to compare
        supplier_name1 (str): Name of table where we got prop1 properties from
        supplier_name2 (str): Name of table where we got prop2 properties from
        city_name (str): City tag used in the name of the cached emfeatures pickle files

    Returns:
        list: One matched DataFrame (output of city_wide_dedup) per city shared by both tables.

    """
    ccolsaddr1 = get_relevcols(prop1,"address")
    ccolsname1 = get_relevcols(prop1,"name")

    ccolsaddr2 = get_relevcols(prop2,"address")
    ccolsname2 = get_relevcols(prop2,"name")

    def appendercols(prop,ccols,colname):
        # Concatenate the columns ccols into one space separated column colname, in place.
        combined = None
        for c in ccols:
            # Missing parts count as empty text; otherwise one NaN would turn the whole value into NaN.
            piece = prop[c].fillna("")+" "
            combined = piece if combined is None else combined+piece
        for c in ccols:
            del prop[c]
        # Assigned after the deletes so that colname may itself be one of ccols.
        prop[colname] = "" if combined is None else combined

    def load_or_prepare(prop,supplier_name):
        # Return the cached tokenized features of a supplier, building them when the cache is unusable.
        cache_path = "emfeatures_"+supplier_name+"_"+city_name+".pk"
        try:
            # NOTE: pickle can execute arbitrary code while loading; only use trusted cache files.
            with open(cache_path,"rb") as cache_file:
                return pk.load(cache_file)
        except Exception:
            return prepareem(prop,supplier_name)

    if len(ccolsaddr1)>1:
        appendercols(prop1,ccolsaddr1,"address")
    if len(ccolsname1)>1:
        appendercols(prop1,ccolsname1,"name")

    if len(ccolsaddr2)>1:
        appendercols(prop2,ccolsaddr2, "address")
    if len(ccolsname2)>1:
        appendercols(prop2,ccolsname2, "name")

    params1 = load_or_prepare(prop1,supplier_name1)
    params2 = load_or_prepare(prop2,supplier_name2)

    citynames1 = params1.city.unique().tolist()
    citynames2 = params2.city.unique().tolist()
    cities = set(citynames1).intersection(set(citynames2))

    result = []
    supp_names = [supplier_name1,supplier_name2]
    for c in cities:
        # Each table is filtered with its own city column. The masks used to come from
        # unrelated notebook globals (hotelbeds_props / taap_props).
        # p1 is copied because city_wide_dedup adds a column to it.
        p1 = params1[params1.city==c].copy()
        p2 = params2[params2.city==c]
        result.append(city_wide_dedup(p1,p2,c,supp_names))

    return result
def df_gen(dat):
    """
    Normalise a raw property table before matching.

    Lower-cases the column names of dat (in place, on the frame passed in), keeps only the rows
    that have a "mapped_to_ext_id", and turns latitude/longitude values of 0.0 into NaN so they
    are treated as missing.

    Args:
        dat (DataFrame): Raw properties with (case-insensitive) columns mapped_to_ext_id, latitude and longitude

    Returns:
        DataFrame: A new frame holding the filtered rows; the rows of dat itself are left unchanged.
    """
    dat.columns = [str.lower(col) for col in dat.columns]
    # Copy the filtered rows: assigning columns on a boolean-mask slice raises SettingWithCopyWarning.
    cleaned = dat[pd.notnull(dat.mapped_to_ext_id)].copy()
    cleaned[["latitude","longitude"]] = cleaned[["latitude","longitude"]].replace(0.0,np.nan)

    return cleaned
def appendercols(prop,ccols,colname):
    """
    Concatenate several text columns into one column, in place.

    Every value of the new column is the row's values of the columns in ccols, in that order,
    each followed by a single space. Missing values contribute an empty string. The source
    columns are removed from prop afterwards.

    NOTE(review): create_distribution defines a nested helper with the same name; this
    module-level definition does not appear to be called anywhere in this notebook.

    Args:
        prop (DataFrame): Frame to modify in place
        ccols (list): Names of the columns to concatenate and remove
        colname (str): Name of the resulting column; may also be one of ccols

    Returns:
        None
    """
    combined = None
    for c in ccols:
        # fillna must be applied to the values actually concatenated. The earlier
        # prop.fillna(...) call discarded its result, so a single NaN part made the whole value NaN.
        piece = prop[c].fillna("")+" "
        combined = piece if combined is None else combined+piece

    for c in ccols:
        del prop[c]

    # Assigned after the deletes so the result survives when colname is itself in ccols.
    prop[colname] = "" if combined is None else combined


The following shows how to run the code to get the results you want. First we load the data by querying MongoDB for the city we are interested in. The example below uses London; the `prop_*` variables are all DataFrames restricted to London properties.

In [38]:
# NOTE(review): MongoClient is not imported in this notebook's import cell; presumably it is
# brought in by one of the %run notebooks — confirm, otherwise this cell fails on a fresh kernel.
client = MongoClient()

# London properties of both sources, one DataFrame each.
prop_lon_taap = pd.DataFrame(
    list(client['supplier_static_database'].taap_properties.find({"City": "London", "Country": "GBR"}))
)
prop_lon_idb = pd.DataFrame(
    list(client['inventorydb'].property.find({"city_name": "London", "country": 243}))
)

# Lower-case the column names (python3: str.lower replaces the former unicode.lower).
prop_lon_taap.columns = [str.lower(col) for col in prop_lon_taap.columns]
prop_lon_idb.columns = [str.lower(col) for col in prop_lon_idb.columns]

# Keep only the records that carry an external id.
prop_lon_taap = prop_lon_taap[pd.notnull(prop_lon_taap["mapped_to_ext_id"])]
prop_lon_idb = prop_lon_idb[pd.notnull(prop_lon_idb["ext_id"])]

# inventorydb stores the id as ext_id; expose it under the column name the matching code reads.
prop_lon_idb["mapped_to_ext_id"] = prop_lon_idb["ext_id"]

In [39]:
# Treat empty strings as missing values everywhere in the inventorydb frame.
prop_lon_idb.replace("",np.nan,inplace=True)
# A coordinate of exactly 0.0 is treated as "not provided": it becomes NaN here, and
# haverdist returns 0.0 for any pair that has a missing coordinate.
prop_lon_idb[["latitude","longitude"]] = prop_lon_idb[["latitude","longitude"]].replace(0.0,np.nan)


In [40]:
# Step-by-step feature preparation for the inventorydb properties.
# NOTE(review): presumably this mirrors prepareem(prop_lon_idb,"inventorydb_LON") — confirm.
prop_subdf = prop_lon_idb
stop_words,stemingdic = loadfilters()

# For names and for addresses: take the first relevant column, lower-case it, then
#   1. clean and tokenize (gramclean)
#   2. transliterate non ASCII characters (translittunicode)
#   3. stem certain common words (stemmer)
namedat = (
    prop_subdf[get_relevcols(prop_subdf,'name')]
    .iloc[:,0]
    .str.lower()
    .apply(gramclean, args=(stop_words,))
    .apply(translittunicode)
    .apply(stemmer, args=(stemingdic,))
)
addrdat = (
    prop_subdf[get_relevcols(prop_subdf,'address')]
    .iloc[:,0]
    .str.lower()
    .apply(gramclean, args=(stop_words,))
    .apply(translittunicode)
    .apply(stemmer, args=(stemingdic,))
)

The preclean function generates the emfeatures that get stored in the pickle file shown above. As an example, here I have pre-cleaned and generated the emfeatures for the London properties in inventory_database. This process usually takes some time, so there is a benefit to speeding it up.

In [41]:
# Assemble the feature table the matching functions work on: tokenized name and address,
# lower-cased city, coordinates, and the external id used to report matches.
emfeatures = pd.DataFrame(dict(
    name=namedat,
    address=addrdat,
    city=prop_lon_idb["city_name"].str.lower(),
    latitude=prop_lon_idb["latitude"],
    longitude=prop_lon_idb["longitude"],
    mapped_to_ext_id=prop_lon_idb["ext_id"],
))

emfeatures["city"] = emfeatures["city"].apply(translittunicode)

# NOTE(review): the return values of preclean are not assigned; the second call is only
# displayed as the cell output. Confirm whether preclean works in place.
preclean(emfeatures["name"])
preclean(emfeatures["address"])

0                             [66, knightsbridge]
1       [1525, hogarth, rd, earl, ct, kensington]
2                 [50, lancaster, gatehyde, park]
3               [100, queen, gtwy, s, kensington]
4              [60, hyde, park, gtwy, kensington]
                          ...                    
5729                            [12, norwich, st]
5730                  [fl, 4b, 153, cromwell, rd]
5731                          [191, cromwell, rd]
5732                    [fl, 3739, arkwright, rd]
5733                 [fl, gdn, 142, cromwell, rd]
Name: address, Length: 5734, dtype: object

In [42]:
#Dump the tokenized and stemmed data into a pickle.
#The with-block closes (and therefore flushes) the file as soon as the dump is done.
with open("emfeatures_inventorydb_LON_with_get_relev_cols.pk","wb") as emfeatures_file:
    pk.dump(emfeatures,emfeatures_file)

In [6]:
#This is an example of opening these tokenized features from a generated pickle file.
# NOTE: pickle can execute arbitrary code while loading; only open pickle files you created or trust.
# NOTE(review): encoding='latin1' is presumably needed because the pickles were written
# under Python 2 (other comments in this notebook mention a Python 3 port) — confirm.
with open("../pickles/emfeatures_taap_LON.pk",'rb') as f:
    taap_props =  pk.load(f, encoding='latin1')

# Same file name as the one written by the pk.dump cell above, read from the working directory.
with open("./emfeatures_inventorydb_LON_with_get_relev_cols.pk",'rb') as f:
    idb_params =  pk.load(f, encoding='latin1')

Here is the portion of the code that matches and does entity resolution on the two dataframes provided. This could be sped up as well. The city_wide_dedup function takes care of the deduplication and returns a deduplicated dataframe.

In [7]:
#idb_params= emfeatures
# Only cities that occur in both tables can be matched.
citynames1 = idb_params.city.unique().tolist()
citynames2 = taap_props.city.unique().tolist()
cities = set(citynames1).intersection(set(citynames2))

result = []
supp_names = ["inventory_db","taap"]
for c in cities:
    # Copy the inventorydb subset: city_wide_dedup adds the "final_mapped" column to it.
    p1 = idb_params[idb_params.city==c].copy()
    p2 = taap_props[taap_props.city==c]
    result.append(city_wide_dedup(p1,p2,c,supp_names))

#Store the result into CSV
# Write every matched city, not just result[0]: `cities` is a set, so the first element is
# arbitrary, and result is empty when the tables share no city.
if result:
    pd.concat(result).to_csv("taap_inventorydb_LON_dedup.csv",index=False)
else:
    print("No city is present in both tables; nothing was written")

In [None]:
contact sheikh@wwstay.com 