In [35]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings("ignore")

# validate pairs by other attributes

In [36]:
def validate(predictions, address_locu, address_foursquare, attr):
    locu_y = address_locu["id"]
    foursquare_y = address_foursquare["id"]
    count = 0
    matched = {}
    Nmatched = {}
    locu = []
    fq = []
    
    for key, value in predictions.items():
        locu_key = address_locu.index[locu_y == key].tolist()
        fs_key = address_foursquare.index[foursquare_y == value].tolist()
        
        if (address_locu[attr][locu_key].tolist()) == (address_foursquare[attr][fs_key].tolist()):
            matched[key] = value
            count += 1
            locu.append(key)
            fq.append(value)
        else:
            locu_value = address_locu["phone"][locu_key].tolist()
            fs_value = address_foursquare["phone"][fs_key].tolist()
            if locu_value == fs_value:
                matched[key] = value
                count += 1
                locu.append(key)
                fq.append(value)
                
            else:
                locu_zipcode= address_locu["latitude"][locu_key].tolist()
                fq_zipcode= address_foursquare["latitude"][fs_key].tolist()
                if locu_zipcode == fq_zipcode:
                    
                    matched[key] = value
                    count += 1
                    locu.append(key)
                    fq.append(value)
                else:
                    Nmatched[key] = value
            
    print ("validated ", count)
    #print (len(matched.keys()))
    return matched, Nmatched, locu, fq

# parse address

In [37]:
import usaddress
def AddressParse(df, attrs):
    
    df1 = df[df[attrs].apply(lambda x: len(x) != 0)]
    index = df1.index.tolist()
    for ind in index:
        try:
            tags = dict(usaddress.tag(df[attrs][ind])[0])
            key = list(tags.keys())
            if 'AddressNumber' in key:
                df['AddressNumber'][ind] = tags['AddressNumber']
                
            if 'StreetName' in key:
                df['StreetName'][ind] = tags['StreetName']
                
            if 'StreetNamePreDirectional' in key:
                df['StreetNamePreDirectional'][ind] = tags['StreetNamePreDirectional']
            
        except:
            pass
    return df


# address, phone normalization

In [38]:
# normalize two dataframe
def Normalize(df):
    df["phone"].replace(np.nan, '', inplace = True)
    df["phone"] = df["phone"].apply(lambda x: x.replace('-', '').replace(')', '').replace('(', '').replace(' ', ''))
    df['StreetNamePreDirectional'] = df['StreetNamePreDirectional'].apply(lambda x: x.replace('W.', 'West').replace('E.', 'East').replace('N.', 'North').replace('S.','South'))
    
    print ("finish normalization")
    return df


# URL parse

In [39]:
from urllib.parse import urlparse
def UrlParse(df, attrs):
    df1 = df[df[attrs].apply(lambda x: len(x) != 0)]
    index = df1.index.tolist()
    for ind in index:
        o = urlparse(df[attrs][ind])
        df["url"][ind] = o.netloc
    return df


In [40]:
def generateCluster(fit_y, predict_y):
    clusters = {}
    records = fit_y.shape[0]
    print (records)

    for ind in range(records):
        key = fit_y[ind]
        if key in clusters.keys():
            clusters[key].append(predict_y[ind])
            print ("same map")
        else:
            clusters[key] = predict_y[ind]
    
    return clusters

In [41]:
def preprocess(testdf):
    # parse street address to AddressNumber, StreetName, StreetNamePreDirectional
    testdf['AddressNumber'] = ''
    testdf['StreetName'] = ''
    testdf['StreetNamePreDirectional'] = ''
    testdf = AddressParse(testdf, "street_address")
    
    # Normalization
    testdf = Normalize(testdf)
    
    # Urlparse
    testdf["url"] = ''
    testdf = UrlParse(testdf, "website")
    
    print ("finish preprocessing")
    return testdf

# KNN
# No record duplicate in each file

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.preprocessing import StandardScaler
def train_knn(model, address_foursquare, address_locu):
    select_col = ["latitude", "longitude"]#, "postal_code"]
    foursquareTrain = address_foursquare[select_col]
    locuTrain = address_locu[select_col]
    
    foursquare_y = address_foursquare["id"]
    locu_y = address_locu["id"]

    #foursquareTrain["postal_code"] = pd.to_numeric(foursquareTrain["postal_code"])
    foursquareTrain.fillna(0, inplace = True)
    #locuTrain["postal_code"] = pd.to_numeric(locuTrain["postal_code"])
    locuTrain.fillna(0, inplace = True)
    
    if model == "svc":
        scaler = StandardScaler()
        foursquareTrain= scaler.fit_transform(foursquareTrain)
        locuTrain = scaler.transform(locuTrain)
        clf = svm.SVC(decision_function_shape='ovo')
    elif model == "rf":
        #clf = RandomForestClassifier(max_leaf_nodes=None, random_state=0)
        clf = DecisionTreeClassifier(max_leaf_nodes=None, random_state=0)
        #clf.estimators = 20
    elif model == "knn":
        clf = KNeighborsClassifier(n_neighbors=1)
    

    clf.fit(foursquareTrain,foursquare_y)
    locu_predict = clf.predict(locuTrain)

    predictions = dict(zip(locu_y, locu_predict))

    knn_predictions, Nvalidated, locu, fq = validate(predictions, address_locu, address_foursquare, "name")
    
    # delete ones already matched
    train_foursquare = address_foursquare[~address_foursquare["id"].isin(fq)]
    train_locu = address_locu[~address_locu["id"].isin(locu)]
    
    return knn_predictions, Nvalidated, train_foursquare, train_locu


# Compare Address

In [43]:
def StringCompare(address_foursquare, address_locu, attrs):
    foursquare_Address = address_foursquare[address_foursquare[attrs].apply(lambda x: len(x) != 0)]

    foursquareAddress = foursquare_Address[attrs] # 185
    foursquareIndex = foursquareAddress.index.tolist()
    
    locu_Address = address_locu[address_locu[attrs].apply(lambda x: len(x) != 0)]
    locuAddress = locu_Address[attrs] # 260
    locuIndex = locuAddress.index.tolist()
    
    candidate_links = pd.MultiIndex.from_product([locuIndex, foursquareIndex],names=['locu', 'foursquare'])
    
    return locuIndex, foursquareIndex, locu_Address, foursquare_Address, candidate_links

In [59]:
import recordlinkage
def Levenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attrs):
    comp = recordlinkage.Compare()
    for att in attrs:
        comp.string(att, att, method='damerau_levenshtein')
    
        
    levenDist = comp.compute(candidate_links, locu_Address, foursquare_Address)
    
    records = locu_Address.shape[0]
    pairs = {}
    for index in locuIndex:
        level = levenDist.loc[levenDist.index.get_level_values("locu") == index].reset_index()
        level['distance'] = level[0]**2 + level[1]**2 + level[2]**2
        co = np.argmax(level['distance'].tolist())
        if level['distance'].tolist()[co] > 2.5:
            match_index = level['foursquare'][co]
            if locu_Address["name"][index] != "Lizzie's Restaurant":
                pairs[locu_Address["id"][index]] = foursquare_Address["id"][match_index]
                print (locu_Address["street_address"][index], '$$$', foursquare_Address['street_address'][match_index], level['distance'].tolist()[co])
                print (locu_Address["name"][index], '$$$', foursquare_Address['name'][match_index], level['distance'].tolist()[co])
                
    return pairs


In [60]:
def AddressCompare(knn_foursquare, knn_locu, attrs):
    locuIndex, foursquareIndex, locu_Address, foursquare_Address, candidate_links = StringCompare(knn_foursquare, knn_locu, attrs)
    attributes = ['AddressNumber', 'StreetName', 'StreetNamePreDirectional']
    levenshtainpairs = Levenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attributes)

    matched, Nmatched, locu, fq = validate(levenshtainpairs, knn_locu, knn_foursquare, "name")
    left_foursquare = knn_foursquare[~knn_foursquare["id"].isin(fq)]
    left_locu = knn_locu[~knn_locu["id"].isin(locu)]
    return matched, Nmatched, left_foursquare, left_locu, levenshtainpairs


# Compare phone/name

In [61]:
def PhoneCompare(train_foursquare, train_locu, attribute):
    notnull_foursquare = train_foursquare[train_foursquare[attribute] != ''] 
    notnull_locu = train_locu[train_locu[attribute] != '']
    
    columns = ["id", attribute]
    compare_foursquare = notnull_foursquare[columns]
    compare_locu = notnull_locu[columns]
    
    # merge attribute
    compare_result = compare_foursquare.merge(compare_locu, left_on = compare_foursquare[attribute], right_on=compare_locu[attribute], how = "inner")
    
    compare_cols = ["id_x", "id_y"] # x: foursquare, y: locu
    predict = compare_result[compare_cols]
    predict = predict.set_index("id_y")
    predict = predict.to_dict()
    compare_predict = predict["id_x"]
    
    matched, Nmatched, locu, fq = validate(compare_predict, train_locu, train_foursquare, "name")
    
    train_foursquare = train_foursquare[~train_foursquare["id"].isin(fq)]
    train_locu = train_locu[~train_locu["id"].isin(locu)]
    
    return matched, Nmatched, train_foursquare, train_locu


# URL

In [62]:
def urlLevenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attrs):
    comp = recordlinkage.Compare()
    comp.string(attrs, attrs, method='damerau_levenshtein')
    levenDist = comp.compute(candidate_links, locu_Address, foursquare_Address)
    
    records = locu_Address.shape[0]
    pairs = {}
    for index in locuIndex:
        level = levenDist.loc[levenDist.index.get_level_values("locu") == index].reset_index()
        co = np.argmax(level[0].tolist())
        match_index = level['foursquare'][co]
        if level[0].tolist()[co] > 0.8 and (locu_Address["postal_code"][index] == foursquare_Address['postal_code'][match_index]):
            pairs[locu_Address["id"][index]] = foursquare_Address["id"][match_index]
            print (locu_Address["name"][index], '$$$', foursquare_Address['name'][match_index], level[0].tolist()[co])
            print (locu_Address["postal_code"][index], '$$$', foursquare_Address['postal_code'][match_index], level[0].tolist()[co])
    return pairs


In [63]:
def UrlCompare(name_foursquare, name_locu, attrs):
    locuIndex, foursquareIndex, locu_Address, foursquare_Address, candidate_links = StringCompare(name_foursquare, name_locu, attrs)
    urlpairs = urlLevenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attrs)    
    return urlpairs

# LEFT

In [64]:
import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return str(only_ascii)

In [65]:
def nameLevenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attrs):
    comp = recordlinkage.Compare()
    comp.string(attrs, attrs, method='damerau_levenshtein')
    levenDist = comp.compute(candidate_links, locu_Address, foursquare_Address)
    
    records = locu_Address.shape[0]
    pairs = {}
    for index in locuIndex:
        level = levenDist.loc[levenDist.index.get_level_values("locu") == index].reset_index()
        co = np.argmax(level[0].tolist())
        match_index = level['foursquare'][co]
        locu_add = locu_Address["street_address"][index]
        foursquare_add = foursquare_Address["street_address"][match_index]
        locu_zip = locu_Address["postal_code"][index]
        foursquare_zip = foursquare_Address["postal_code"][match_index]
        
        log1 = len(locu_add) == 0 or len(foursquare_add)==0 # one of addresses missing
        log2 = locu_zip == foursquare_zip # same zip code
        log3 = ((len(locu_zip)==0) or len(foursquare_zip) ==0) # one of zipcodes missing
        
        if level[0].tolist()[co] > 0.9:
            pairs[locu_Address["id"][index]] = foursquare_Address["id"][match_index]
            print (locu_Address["name"][index], '$$$', foursquare_Address['name'][match_index], level[0].tolist()[co])
            print (locu_Address["phone"][index], '$$$', foursquare_Address['phone'][match_index], level[0].tolist()[co])
        elif level[0].tolist()[co] > 0.7 and log1 and (log2 or log3) :
            print (locu_Address["name"][index], '$$$', foursquare_Address['name'][match_index], level[0].tolist()[co])
            print (locu_Address["postal_code"][index], '$$$', foursquare_Address['postal_code'][match_index], level[0].tolist()[co])
            pairs[locu_Address["id"][index]] = foursquare_Address["id"][match_index]
    return pairs


In [66]:
def NameCompare(name_foursquare, name_locu, attrs):
    name_foursquare[attrs] = name_foursquare[attrs].apply(lambda x: remove_accents(x))
    name_locu[attrs] = name_locu[attrs].apply(lambda x: remove_accents(x))

    locuIndex, foursquareIndex, locu_Address, foursquare_Address, candidate_links = StringCompare(name_foursquare, name_locu, attrs)
    namepairs = nameLevenshtein(candidate_links, locuIndex, locu_Address, foursquare_Address, attrs)
    
    return namepairs

# Pipeline

In [67]:
# test dataset left untouched
test_foursquare = pd.read_json("online_competition/foursquare_test.json")
test_locu = pd.read_json("online_competition/locu_test.json")

In [68]:
# train dataset
train_foursquare = pd.read_json("train/foursquare_train.json")
train_locu = pd.read_json("train/locu_train.json")
match = pd.read_csv("train/matches_train.csv")

In [69]:
def PipeLine(test_locu, test_foursquare):
    locu = preprocess(test_locu)
    foursquare = preprocess(test_foursquare)
    # rf
    rf_validated, rf_Nvalidated, rf_foursquare, rf_locu= train_knn("rf",foursquare, locu)
    # SVC
    knn_validated, knn_Nvalidated, knn_foursquare, knn_locu= train_knn("svc",rf_foursquare, rf_locu)
    # Compare Address after KNN
    address_matched, address_Nmatched, left_foursquare, left_locu, addresspairs = AddressCompare(knn_foursquare, knn_locu, attrs = "street_address")
    # Compare Phone & Name after Address Compare
    phone_matched, phone_Nmatched, phone_foursquare, phone_locu = PhoneCompare(left_foursquare, left_locu, "phone")
    name_matched, name_Nmatched, name_foursquare, name_locu = PhoneCompare(left_foursquare, left_locu,"name")
    # Compare url after Name Compare
    urlnamepairs = UrlCompare(name_foursquare, name_locu, attrs = "url")
    #urlphonepairs = UrlCompare(phone_foursquare, phone_locu, attrs = "url")
    # Compare Name after Name Compare
    namepairs = NameCompare(name_foursquare, name_locu, attrs = "name")
    #namephonepairs = NameCompare(phone_foursquare, phone_locu, attrs = "name")
    
    return rf_validated, knn_validated, addresspairs, phone_matched, name_matched, urlnamepairs,  namepairs

#knn_validated, addresspairs, phone_matched, name_matched, urlpairs, namepairs = PipeLine(test_locu, test_foursquare)

In [70]:
def final(locu, foursquare):
    rf_validated, knn_validated, addresspairs, phone_matched, name_matched, urlpairs,  namepairs = PipeLine(locu, foursquare)
    
    result = addresspairs.copy()
    print (len(result.keys()))
    result.update(rf_validated)
    print (len(result.keys()))
    result.update(knn_validated)
    print (len(result.keys()))

    result.update(name_matched)
    print (len(result.keys()))

    result.update(phone_matched)
    print (len(result.keys()))

    result.update(namepairs)
    print (len(result.keys()))
    
    #result.update(namephonepairs)
    #print (len(result.keys()))

    result.update(urlpairs)
    print (len(result.keys()))
    
    #result.update(namephonepairs)
    #print (len(result.keys()))
    
    return result

In [26]:
result = final(test_locu, test_foursquare)

finish normalization
finish preprocessing
finish normalization
finish preprocessing
validated  184
validated  40
321 East Houston St. $$$ 321 E. Houston St. 3.0
El Maguey y la Tuna $$$ El Maguey y La Tuna 3.0
356 W. 58th St. $$$ 356 W. 58th St. 3.0
Hudson Common @ the Hudson $$$ The Library at Hudson Hotel 3.0
40 West 8th St. $$$ 40 W. 8th St. 3.0
Curry Kitchen $$$ Curry Kitchen 3.0
424 East 9th St. $$$ 424 E. 9th St. 3.0
Exchange Alley $$$ Exchange Alley 3.0
315 West 36th St. $$$ 315 W. 36th St. 3.0
Staghorn Steakhouse $$$ staghorn steakhouse 3.0
231 E. 50th St. $$$ 231 E. 50th St. 3.0
DEGREZIA RESTAURANT $$$ Ristorante DeGrezia 3.0
validated  3
validated  4
validated  3
Baskin Robbins $$$ Baskin-Robbins 1.0
10007 $$$ 10007 1.0
DEGREZIA RESTAURANT $$$ Ristorante DeGrezia 0.8461538461538461
10022 $$$ 10022 0.8461538461538461
b'El Maguey y la Tuna' $$$ b'El Maguey y La Tuna' 0.9545454545454546
2124733919 $$$ 2124733744 0.9545454545454546
b'Baskin Robbins' $$$ b'Baskin-Robbins' 0.9411764

In [27]:
predictions = pd.DataFrame.from_dict(result, orient = 'index')
predictions.reset_index(inplace = True)
predictions.columns = ["locu_id", "foursquare_id"]
predictions.to_csv("train_predictions.csv", encoding='utf-8', index = False)

# test generalization
###### knn: precision: 0.98, recall: 0.9527777777777777, F1-score: 0.9661971830985915
###### svc: precision: 0.98, recall: 0.9527777777777777, F1-score: 0.9661971830985915

In [71]:
# match dictionary
def match_to_dict(match):
    match = match.set_index("locu_id")
    match = match.to_dict()
    
    compare_match = match["foursquare_id"]
    return compare_match
match_dict = match_to_dict(match)

In [72]:
def matchRecords(match_dict, compare_predict):
    matched = {}
    Nmatched = {}
    match_count = 0
    Nmatch_count = 0
    keylist = match_dict.keys()
    locu = []
    fq = []
    
    for key, value in compare_predict.items():
        if key in keylist and match_dict[key] == value:
            match_count +=1          
            matched[key] = value
            locu.append(key)
            fq.append(value)
        else:
            Nmatch_count += 1
            Nmatched[key] = value
            
    precision = match_count*1.0 /(len(compare_predict.keys()))
    recall = match_count*1.0 /(len(keylist))
    
    F1 = 2*precision*recall /(precision + recall)
    print ("matched records: {}".format(match_count))
    print ("not matched records: {}".format(Nmatch_count))
    
    print ("precision: {}, recall: {}, F1-score: {}".format(precision, recall, F1))
    return locu, fq, matched, Nmatched

In [73]:
train_result = final(train_locu, train_foursquare)

finish normalization
finish preprocessing
finish normalization
finish preprocessing
validated  260
validated  70
214 West 39th St. $$$ 214 W. 39th St. 3.0
Enrico Ferezi Inc $$$ Enrico Ferezi 3.0
127 W. 28th St. $$$ 127 W. 28th St. 3.0
Blu NYC $$$ Blu NYC 3.0
9 E. 18th St. $$$ 9 E. 18th St. 3.0
Rosa Mexicano - Union Square $$$ Rosa Mexicano 3.0
250 East Houston St. $$$ 250 E. Houston St. 3.0
Dunkin' Donuts $$$ Dunkin Donuts 3.0
684 Ave. Of The Americas $$$ 684 Ave. Of The Americas 3.0
Starbucks $$$ Starbucks 3.0
validated  3
validated  9
validated  7
Azul Argentine Bistro $$$ Azul Bistro 1.0
10002 $$$ 10002 1.0
Rosa Mexicano - Union Square $$$ Rosa Mexicano 1.0
10003 $$$ 10003 1.0
Dunkin' Donuts $$$ Dunkin Donuts 1.0
10025 $$$ 10025 1.0
b'Little Town' $$$ b'Littletown' 0.8571428571428572
10036 $$$  0.8571428571428572
5
265
335
342
347
348
350


In [74]:
matched_locu, matched_fq, matched, Nmatched = matchRecords(match_dict, train_result)

matched records: 343
not matched records: 7
precision: 0.98, recall: 0.9527777777777777, F1-score: 0.9661971830985915


In [None]:

# knn: precision: 0.98, recall: 0.9527777777777777, F1-score: 0.9661971830985915