In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
from collections import Counter
import unidecode
from math import radians, cos, sin, asin, sqrt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

from fuzzywuzzy import fuzz
from fuzzywuzzy.fuzz import WRatio, partial_ratio, QRatio, token_set_ratio, token_sort_ratio, partial_token_sort_ratio

In [2]:
## Parameters
is_debug = False  # when True, the data-load cell subsamples 10,000 rows for a quick run
SEED = 2022  # passed to seed_everything() and used as random_state for the debug subsample
num_neighbors = 25  # neighbours requested per record by recall_knn()
num_split = 3  # number of id chunks the pair table is cut into before feature generation

# recall_simple() keeps a candidate only when it shares strictly MORE than
# this many recall_columns values with the query record (comparison is `>`).
threshold = 2

# Columns that receive string-distance features in add_features().
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']
# Columns that are TF-IDF vectorised and get an extra `<col>_sim` feature.
vec_columns = ['name', 'categories', 'address', 'state', 'url', 'country']

# Columns whose lower-cased values are matched exactly in recall_simple().
recall_columns = ['name', 'address', 'categories', 'zip', 'phone']

def seed_everything(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    Note: the environment variable is only written here; whether it affects
    hashing in the already-running interpreter is not established by this code.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
seed_everything(SEED)

In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    # Return the length of the longest common subsequence of S and T,
    # computed with a (len(S)+1) x (len(T)+1) dynamic-programming table.
    cdef int i, j
    # dp[a][b] holds the LCS length of S[:a] and T[:b]; row 0 and column 0 stay 0.
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            # Either extend the diagonal when the characters match, or carry the
            # best value from the cell above / to the left. The trailing
            # dp[i + 1][j + 1] term is still its initial 0 at this point (each
            # cell is written exactly once), so it never changes the maximum.
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [5]:
def recall_simple(df, threshold):
    """Recall candidate pairs that share exact values in several ``recall_columns``.

    For every record, all records holding the same lower-cased value in a
    recall column are collected; a candidate is kept when it shares strictly
    more than ``threshold`` column values with the query record.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``id`` and every column listed in ``recall_columns``.
    threshold : int
        Minimum number of shared values (exclusive) for a pair to be kept.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``match_id``, one row per recalled pair.
    """
    # Per column: lower-cased value -> set of ids carrying that value.
    # The key is built as a separate Series rather than by assigning into a
    # slice of ``df`` (the previous chained assignment is the pattern pandas
    # reports as SettingWithCopyWarning). Rows with a missing value are
    # dropped by groupby, exactly as before.
    val2id_d = {}
    for col in recall_columns:
        lowered = df[col].str.lower()
        val2id_d[col] = df['id'].groupby(lowered).apply(set).to_dict()

    cus_ids = []
    match_ids = []
    n_cols = len(recall_columns)
    # Missing values are replaced by the sentinel 'null'; a cell whose real
    # value is the literal string 'null' is therefore skipped as well.
    for vals in tqdm(df[recall_columns + ['id']].fillna('null').values):
        cus_id = vals[-1]

        # Count, per candidate id, in how many columns it shares a value.
        shared = Counter()
        for col, val in zip(recall_columns, vals[:n_cols]):
            if val != 'null':
                shared.update(val2id_d[col][val.lower()])

        matched = [other for other, n_shared in shared.items() if n_shared > threshold]
        cus_ids += [cus_id] * len(matched)
        match_ids += matched

    train_df = pd.DataFrame({'id': cus_ids, 'match_id': match_ids}).drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    num_ids = train_df['id'].nunique()
    # Guard the ratio: with no recalled pair the old code raised ZeroDivisionError.
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data: %s' % num_data)
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

In [6]:
def recall_knn(df, Neighbors = 10):
    """Recall candidate pairs from coordinate nearest neighbours.

    Two searches are run on ``latitude``/``longitude``: one per ``country``
    group (haversine metric) and one over the whole frame (estimator default
    metric). The two pair lists are outer-merged on ``(id, match_id)``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``id``, ``country``, ``latitude`` and ``longitude``.
    Neighbors : int, default 10
        Neighbours requested per record; capped at the number of available
        rows in both searches.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``match_id``, ``kdist``, ``kneighbors``,
        ``kdist_country``, ``kneighbors_country``. Columns from the search
        that did not produce a given pair are NaN for that pair.
    """
    print('Start knn grouped by country')

    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        # NOTE(review): scikit-learn documents the haversine metric for
        # [latitude, longitude] given in radians, while this function passes
        # the columns through unchanged (the notebook only converts to
        # radians later, on the pair table). Left as is so the generated
        # features keep their current values; confirm the units.
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)

        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']],
                                        return_distance = True)

        # Build each rank's frame directly instead of assigning into a slice
        # of country_df (SettingWithCopy pattern).
        country_ids = country_df['id'].values
        for k in range(neighbors):
            train_df_country.append(pd.DataFrame({
                'id': country_ids,
                'match_id': country_ids[nears[:, k]],
                'kdist_country': dists[:, k],
                'kneighbors_country': k,
            }))
    train_df_country = pd.concat(train_df_country)

    print('Start knn')
    # Cap k at the number of rows: asking for more neighbours than fitted
    # samples makes kneighbors() raise (the per-country search above already
    # had this guard, the global one did not).
    neighbors = min(len(df), Neighbors)
    knn = NearestNeighbors(n_neighbors = neighbors)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])

    all_ids = df['id'].values
    train_df = [
        pd.DataFrame({
            'id': all_ids,
            'match_id': all_ids[nears[:, k]],
            'kdist': dists[:, k],
            'kneighbors': k,
        }, index = df.index)
        for k in range(neighbors)
    ]

    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [7]:
def add_features(df):
    """Add per-column string-similarity features for every (id, match_id) pair.

    For each column in ``feat_columns`` the values of both records are looked
    up in the global ``data`` frame (which must be indexed by ``id``) and the
    following columns are added:

    * ``<col>_sim`` (only for ``vec_columns``): row-wise dot product of the
      two records' rows in the pre-fitted TF-IDF matrix ``tfidf_d[col]``.
    * ``<col>_gesh``, ``<col>_leven``, ``<col>_jaro``, ``<col>_lcs``:
      difflib ratio, Levenshtein distance, Jaro-Winkler and LCS length;
      NaN when either value is missing.
    * for columns other than ``phone``/``zip``: ``<col>_len_diff``,
      ``<col>_nleven``, ``<col>_nlcsk``, ``<col>_nlcs``.

    Parameters
    ----------
    df : pd.DataFrame
        Pair table with ``id`` and ``match_id`` columns.

    Returns
    -------
    pd.DataFrame
        The pair table with the feature columns appended. Columns are written
        onto the passed frame until the first ``drop`` rebinds ``df``, so use
        the return value rather than relying on in-place changes.
    """
    # Row positions inside the TF-IDF matrices do not depend on the column, so
    # they are resolved once (lazily) instead of once per vectorised column.
    indexs = match_indexs = None

    for col in tqdm(feat_columns):
        if col in vec_columns:
            if indexs is None:
                indexs = [id2index_d[i] for i in df['id']]
                match_indexs = [id2index_d[i] for i in df['match_id']]
            tv_fit = tfidf_d[col]
            pair_products = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            # np.asarray works whether sum() returns np.matrix or ndarray;
            # the former `.A` attribute only exists on np.matrix.
            df[f'{col}_sim'] = np.asarray(pair_products).ravel()

        # Select the single column inside .loc instead of materialising every
        # column of the looked-up rows first.
        col_values = data.loc[df['id'], col].values.astype(str)
        matcol_values = data.loc[df['match_id'], col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            # astype(str) turned missing values into the literal 'nan'.
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            # NOTE(review): lengths are taken from the stringified values, so
            # a missing value counts as len('nan') == 3 and `<col>_len_diff`
            # is not NaN for missing entries. Kept unchanged so the feature
            # values stay the same; confirm this is intended.
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            df[f'{col}_nleven'] = df[f'{col}_leven']/df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            # The raw lengths were only needed for the normalised features.
            df = df.drop([f'{col}_len', f'match_{col}_len'], axis = 1)
            gc.collect()

    return df

In [8]:
def add_fuzz_features(df):
    """Add fuzzywuzzy scores for name, categories and address of each pair.

    For every pair the two values are looked up in the global ``data`` frame
    (indexed by ``id``). When both are present, WRatio, partial_ratio,
    token_set_ratio and token_sort_ratio are computed together with their
    mean (``<col>_fuzz_power``); otherwise all five features are NaN.
    Returns ``df`` with the new columns added.
    """
    for col in tqdm(['name', 'categories', 'address']):

        left_values = data.loc[df['id']][col].values.astype(str)
        right_values = data.loc[df['match_id']][col].values.astype(str)

        # Feature suffix -> collected scores, in the order the columns are added.
        scores = {
            'w_ratio': [],
            'partial_ratio': [],
            'tokenset_ratio': [],
            'tokensort_ratio': [],
            'fuzz_power': [],
        }

        for s, match_s in zip(left_values, right_values):
            if s == 'nan' or match_s == 'nan':
                row = (np.nan, np.nan, np.nan, np.nan, np.nan)
            else:
                # NOTE(review): the scorers receive lists of tokens, not
                # strings. Kept exactly as is; confirm this is intended.
                tokens = str(s).lower().split()
                match_tokens = str(match_s).lower().split()
                a = WRatio(tokens, match_tokens)
                b = partial_ratio(tokens, match_tokens)
                c = token_set_ratio(tokens, match_tokens)
                d = token_sort_ratio(tokens, match_tokens)
                row = (a, b, c, d, (a+b+c+d)/4.)

            for bucket, value in zip(scores.values(), row):
                bucket.append(value)

        for suffix, values in scores.items():
            df[f'{col}_{suffix}'] = values

        gc.collect()

    return df

In [9]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every ``id`` to its ``point_of_interest`` (last occurrence wins)."""
    id2poi_map = {}
    for record_id, poi in zip(input_df['id'], input_df['point_of_interest']):
        id2poi_map[record_id] = poi
    return id2poi_map

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` to the set of ``id`` values it groups."""
    poi2ids_map = {}
    for poi, member_ids in input_df.groupby('point_of_interest')['id']:
        poi2ids_map[poi] = set(member_ids)
    return poi2ids_map

def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    ``input_df`` needs an ``id`` column and a ``matches`` column holding
    whitespace-separated ids. The true set of each id is read from the
    module-level ``id2poi`` / ``poi2ids`` lookups.
    """
    def _iou(id_str, matches):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        return len((targets & preds)) / len((targets | preds))

    pairs = zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy())
    scores = np.array([_iou(id_str, matches) for id_str, matches in pairs])
    return scores.mean()

def analysis(df):
    """Print row count, unique ids, unique POIs and the mean ids per POI."""
    n_rows = len(df)
    print('Num of data: %s' % n_rows)
    n_ids = df['id'].nunique()
    print('Num of unique id: %s' % n_ids)
    n_pois = df['point_of_interest'].nunique()
    print('Num of unique poi: %s' % n_pois)

    # Number of ids attached to each point of interest.
    ids_per_poi = df.groupby('point_of_interest')['id'].count()
    print('Mean num of unique poi: %s' % ids_per_poi.mean())

In [10]:
## Data load
# Local project layout; the alternative path kept in the trailing comment is
# the Kaggle input directory.
data_root = '../src/data/raw/'  #'../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

# Debug mode: work on a reproducible 10,000-row sample with a fresh 0..n-1
# index (later cells index `data` with positional indices).
if is_debug:
    data = data.sample(n = 10000, random_state = SEED)
    data = data.reset_index(drop = True)

In [13]:
# Punctuation removed outright by clean_string(): ' , - " . ( ) # !
pat = r"\'|\,|-|\"|\.|\(|\)|#|!"

def clean_string(s):
    """Normalise a text field for matching.

    Steps: stringify, transliterate to ASCII with ``unidecode``, delete the
    punctuation in ``pat``, strip and lower-case, split on ``-``, ``/`` or
    whitespace, strip trailing ``s`` characters from every token and re-join
    the tokens with single spaces.

    Missing values arrive as NaN and come back as the string ``'nan'``;
    callers convert that back to NaN.
    """
    s = str(s)
    s = unidecode.unidecode(s)
    s = re.sub(pat, "", s)
    s = s.strip().lower()
    # Raw string: the former plain literal "-|\/|\s+" relied on the invalid
    # escape sequences `\/` and `\s`, which Python warns about. The compiled
    # pattern is unchanged.
    # Note: rstrip("s") removes *all* trailing 's' characters, so a token made
    # only of 's' becomes empty and still takes part in the join.
    s = " ".join([s1.rstrip("s") for s1 in re.split(r"-|\/|\s+", s) if s1 != ""])
    return s

# Normalise the free-text columns; clean_string() returns 'nan' for missing
# values, which is turned back into a real NaN.
for text_col in ['name', 'address', 'categories', 'city', 'state']:
    data[text_col] = data[text_col].apply(clean_string).replace('nan', np.nan)

In [18]:
def get_addresses(add1, city1, state1):
    """Join street address, city and state into one lower-cased string.

    Parts whose string form is ``'nan'`` are skipped. City and state are only
    appended when they are not already a substring of the text built so far.
    The result is stripped of surrounding whitespace.
    """
    street, city, state = (str(part).lower() for part in (add1, city1, state1))

    combined = "" if street == "nan" else street
    for extra in (city, state):
        if extra != "nan" and extra not in combined:
            combined += " " + extra

    return combined.strip()

In [23]:
# Replace the street address by the combined "address city state" string from
# get_addresses(); rows where nothing is available become NaN again.
data['address'] = data.apply(lambda x: get_addresses(x.address, x.city, x.state), axis=1).replace("", np.nan)

In [24]:
## Data split
# Two folds grouped by point_of_interest, so all records of one POI land in
# the same fold.
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(data, data['point_of_interest'], data['point_of_interest'])):
    # val_idx holds positional indices but is used as labels in .loc;
    # NOTE(review): this assumes `data` still has a default 0..n-1 index.
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64


In [25]:
# Fold 0 is used as the validation half, fold 1 as the training half.
valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

# Leak check: prints the POIs present in both halves (expected: empty set).
print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()

# Leak check: prints the ids present in both halves (expected: empty set).
print(set(train_ids) & set(valid_ids))

Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
set()
set()


In [26]:
train_data.head().T

Unnamed: 0,0,10,11,12,13
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,cafe stad oudenaarde,starbuck,coffee cat,tsuzitian wei cheng nozhang,signature propertie savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,abdijstraat nederename oostvlaanderen,ibi avenue dhahran ash sharqiyah,f torre st davao city davao region,shen tian xiao chuan ting 11 qian dai tian qu ...,100 commercial ct ste c savannah ga
city,nederename,dhahran,davao city,qian dai tian qu,savannah
state,oostvlaanderen,ash sharqiyah,davao region,dong jing du,ga
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [27]:
# data['poi_count'] = data[['point_of_interest', 'id']].groupby('point_of_interest').transform('count')
# country_poi_count_mean = data[['country', 'poi_count']].groupby('country').mean().poi_count.to_dict()

In [28]:
country_poi_count_mean = {'AD': 1.5, 'AE': 2.4052763819095477, 'AF': 1.736842105263158, 'AG': 1.9230769230769231, 'AI': 1.5, 'AL': 2.086206896551724, 'AM': 1.7575757575757576, 'AN': 2.0, 'AO': 1.1333333333333333, 'AQ': 4.0588235294117645, 'AR': 2.8563273073263558, 'AT': 1.5969145569620253, 'AU': 1.6723131400133984, 'AW': 1.7301587301587302, 'AX': 2.772727272727273, 'AZ': 2.315270935960591, 'BA': 1.6779661016949152, 'BB': 1.423529411764706, 'BD': 1.702127659574468, 'BE': 1.954533131946865, 'BF': 2.0, 'BG': 1.8947688564476886, 'BH': 1.7752808988764044, 'BI': 2.0, 'BJ': 2.0, 'BL': 1.8421052631578947, 'BM': 2.769230769230769, 'BN': 2.4435146443514646, 'BO': 1.7311827956989247, 'BQ': 2.0, 'BR': 1.5900743545207938, 'BS': 1.8953488372093024, 'BT': 1.8, 'BV': 22.0, 'BW': 1.5714285714285714, 'BY': 3.0266449157150626, 'BZ': 1.793103448275862, 'CA': 1.6390542466672255, 'CD': 1.7272727272727273, 'CH': 1.7539646579066606, 'CI': 1.2222222222222223, 'CL': 1.41397021058038, 'CM': 1.9230769230769231, 'CN': 1.919543317662861, 'CO': 1.597900113507378, 'CR': 1.6617552850736708, 'CU': 2.6666666666666665, 'CV': 2.0, 'CW': 1.6666666666666667, 'CY': 2.1671232876712327, 'CZ': 1.7573170731707317, 'DE': 1.9297142536349856, 'DJ': 2.0, 'DK': 1.5699361578641904, 'DM': 1.8333333333333333, 'DO': 2.071622846781505, 'DZ': 1.9411764705882353, 'EC': 1.5290322580645161, 'EE': 2.17078410311493, 'EG': 2.094106463878327, 'EH': 2.0, 'ES': 1.7126828135881762, 'ET': 1.9565217391304348, 'EU': 2.0, 'FI': 2.1810370817003317, 'FJ': 1.8, 'FO': 1.0, 'FR': 2.331875264568929, 'GA': 22.0, 'GB': 1.8165518321327905, 'GD': 2.076923076923077, 'GE': 1.937062937062937, 'GF': 3.6470588235294117, 'GG': 2.625, 'GH': 1.75, 'GI': 2.125, 'GL': 4.0, 'GM': 2.0, 'GP': 1.8837209302325582, 'GQ': 2.0, 'GR': 1.9984615384615385, 'GT': 1.2957110609480813, 'GU': 1.806282722513089, 'GW': 2.0, 'GY': 2.0, 'HK': 1.9196217494089836, 'HN': 1.592274678111588, 'HR': 1.5374358974358975, 'HT': 1.411764705882353, 'HU': 1.80519877675841, 'ID': 
6.722706595905989, 'IE': 1.7040816326530612, 'IL': 1.5358649789029535, 'IM': 1.8095238095238095, 'IN': 1.676926388291003, 'IQ': 2.308641975308642, 'IR': 2.0693175711982805, 'IS': 1.78, 'IT': 1.7978938130759106, 'JE': 1.7272727272727273, 'JM': 1.674641148325359, 'JO': 1.8235294117647058, 'JP': 1.8663610920721956, 'KE': 1.690176322418136, 'KG': 1.858695652173913, 'KH': 3.638483965014577, 'KN': 1.75, 'KP': 1.9285714285714286, 'KR': 2.0393985435112905, 'KW': 3.8779330345373055, 'KY': 2.966666666666667, 'KZ': 1.7391304347826086, 'LA': 1.927007299270073, 'LB': 1.4796238244514106, 'LC': 1.8571428571428572, 'LI': 2.6, 'LK': 1.9126984126984128, 'LR': 2.0, 'LT': 1.630801687763713, 'LU': 1.6685714285714286, 'LV': 6.234220532319392, 'LY': 1.6, 'MA': 1.5993788819875776, 'MC': 1.9090909090909092, 'MD': 1.8104089219330854, 'ME': 2.024822695035461, 'MF': 1.8, 'MG': 1.7692307692307692, 'MK': 1.6367076631977293, 'MM': 1.966887417218543, 'MN': 1.6666666666666667, 'MO': 4.197080291970803, 'MP': 2.1724137931034484, 'MQ': 2.1815286624203822, 'MT': 2.1822033898305087, 'MU': 1.631578947368421, 'MV': 2.210062893081761, 'MW': 2.076923076923077, 'MX': 1.8840904842026547, 'MY': 1.9365367180417044, 'MZ': 1.4814814814814814, 'NC': 1.8, 'NE': 2.0, 'NG': 1.9076923076923078, 'NI': 1.5954198473282444, 'NL': 1.9143005315700765, 'NO': 1.8300720906282184, 'NP': 1.5441176470588236, 'NZ': 1.584664536741214, 'OM': 1.75, 'PA': 1.538160469667319, 'PE': 1.7525083612040133, 'PF': 1.7692307692307692, 'PG': 1.0, 'PH': 2.4424706943192067, 'PK': 1.510204081632653, 'PL': 1.5832037325038881, 'PM': 2.0, 'PR': 1.8465473145780051, 'PS': 1.6, 'PT': 1.5239852398523985, 'PW': 2.0, 'PY': 1.6924577373211964, 'QA': 2.331983805668016, 'RE': 1.64, 'RO': 1.728950403690888, 'RS': 1.8838348495451365, 'RU': 3.6284061020515517, 'RW': 2.0, 'SA': 2.181374804453851, 'SC': 1.6875, 'SD': 1.7567567567567568, 'SE': 1.6901325478645066, 'SG': 1.9864794096643656, 'SH': 2.0, 'SI': 1.637223974763407, 'SJ': 2.0, 'SK': 1.7174840085287846, 
'SL': 1.0, 'SM': 1.8461538461538463, 'SN': 1.6, 'SO': 2.0, 'SR': 1.8, 'SS': 1.0, 'ST': 2.0, 'SV': 1.438095238095238, 'SX': 1.1818181818181819, 'SY': 2.0, 'SZ': 1.8571428571428572, 'TC': 1.5454545454545454, 'TG': 2.0, 'TH': 2.088967487416678, 'TJ': 2.0, 'TL': 2.0, 'TM': 2.2285714285714286, 'TN': 1.6764132553606237, 'TO': 3.0, 'TR': 3.821040659159381, 'TT': 1.5870646766169154, 'TW': 1.915327833260964, 'TZ': 2.361111111111111, 'UA': 2.1022911597729115, 'UG': 1.96, 'US': 1.8704848257530047, 'UY': 1.580281690140845, 'UZ': 2.2212389380530975, 'VA': 3.1666666666666665, 'VC': 2.076923076923077, 'VE': 1.478783026421137, 'VG': 3.142857142857143, 'VI': 1.8076923076923077, 'VN': 1.7237534840507898, 'VU': 22.0, 'WS': 1.4705882352941178, 'XK': 1.7, 'XX': 4.5, 'YE': 6.75, 'YT': 1.0, 
'ZA': 1.5082995204721505, 'ZM': 1.9210526315789473, 'ZW': 4.571428571428571}


In [29]:
def shorten(x):
    """Return ``[x]``, preceded by its acronym when ``x`` has more than two words.

    The acronym is built from the first character of every whitespace-separated
    word, e.g. ``"new york city"`` -> ``["nyc", "new york city"]``.
    """
    words = x.split()
    if len(words) <= 2:
        return [x]
    acronym = "".join(word[0] for word in words)
    return [acronym, x]
    
def get_aka(x):
    """List the alternative spellings ("also known as") derived from a name.

    Variants come from, in priority order: parenthesised parts (which are also
    removed from the main name), the pieces around ``-``, or the pieces around
    ``/``. The cleaned full name is always appended last. Every variant is then
    expanded by ``shorten`` (acronym + variant) and the results are flattened.
    """
    x = str(x)

    parenthesised = re.findall(r"\(.+\)", x)
    if len(parenthesised) > 0:
        # Cut the bracketed chunks out of the main name before cleaning it.
        for chunk in parenthesised:
            x = x.replace(chunk, "")
        variants = [clean_string(chunk) for chunk in parenthesised]
    else:
        variants = []
        # First separator that actually splits the name wins.
        for separator in ("-", "/"):
            pieces = x.split(separator)
            if len(pieces) > 1:
                variants = [clean_string(piece) for piece in pieces]
                break

    variants.append(clean_string(x))

    expanded = []
    for variant in variants:
        expanded.extend(shorten(variant))
    return expanded

In [31]:
%%time

class NameMatcher:
    """Character n-gram TF-IDF similarity between pairs of place names.

    ``fit`` learns the n-gram vocabulary, ``predict`` scores aligned pairs.
    Each name is expanded into its aliases (``get_aka``) and the best cosine
    similarity over all alias combinations is returned for the pair.
    """

    def ngrams(self, string):
        """Return the character 2- and 3-grams of the normalised name.

        The name is cleaned, the word 'the' is dropped, '&' becomes 'and' and
        all whitespace is removed before the n-grams are cut.
        """
        string = clean_string(string)
        for word in ['the']:
            # Pad with spaces so the word is also removed at either end.
            string = (" " + string + " ").replace(" " + word + " ", " ").strip()
        string = string.replace("&", "and")
        string = "".join(string.split())
        ngrams = []
        for n in [2,3]:
            _ngrams = zip(*[string[i:] for i in range(n)])
            _ngrams = ["".join(ngram) for ngram in _ngrams]
            ngrams += list(_ngrams)
        return ngrams

    def fit(self, names):
        """Fit the TF-IDF vocabulary on an iterable of names; returns self."""
        self.vectorizer = TfidfVectorizer(min_df = 1, analyzer=self.ngrams)
        self.vectorizer.fit(names)
        return self

    def predict(self, list1, list2, batch_size = 512):
        """Score ``list1[i]`` against ``list2[i]``; returns a 1-D numpy array."""
        assert len(list1) == len(list2)
        results = [self.predict_batch(list1[i : i+batch_size], list2[i : i + batch_size])
                           for i in tqdm(range(0, len(list1), batch_size))]
        results = [r for result in results for r in result]
        return np.array(results)

    @staticmethod
    def _rowwise_cosine(mat1, mat2):
        """Cosine similarity of row i of ``mat1`` with row i of ``mat2``.

        Works on scipy sparse matrices of identical shape. Rows with zero norm
        score 0. Only the aligned pairs are computed, not the full pairwise
        matrix.
        """
        dots = np.asarray(mat1.multiply(mat2).sum(axis = 1), dtype = float).ravel()
        sq1 = np.asarray(mat1.multiply(mat1).sum(axis = 1), dtype = float).ravel()
        sq2 = np.asarray(mat2.multiply(mat2).sum(axis = 1), dtype = float).ravel()
        denom = np.sqrt(sq1 * sq2)
        out = np.zeros_like(dots)
        np.divide(dots, denom, out = out, where = denom > 0)
        return out

    def predict_batch(self, list1, list2):
        """Score one batch of aligned name pairs; returns a list of floats."""
        list1 = [get_aka(x) for x in list1]
        list2 = [get_aka(x) for x in list2]

        # Flatten the alias lists, remembering which pair each alias belongs to.
        id1 = [i for i, x in enumerate(list1) for x1 in x]
        id2 = [i for i, x in enumerate(list2) for x1 in x]
        list1 = [x1 for i, x in enumerate(list1) for x1 in x]
        list2 = [x1 for i, x in enumerate(list2) for x1 in x]

        df1 = pd.DataFrame({"id": id1, "name1" : list1})
        df2 = pd.DataFrame({"id": id2, "name2" : list2})

        # Merging on the pair id yields every alias combination of a pair.
        df = df1.merge(df2, on = 'id')

        mat1 = self.vectorizer.transform(df.name1)
        mat2 = self.vectorizer.transform(df.name2)
        # Only the aligned rows are needed; the former cosine_similarity call
        # built the full N x N matrix just to read its diagonal.
        match = list(self._rowwise_cosine(mat1, mat2))

        df['match'] = match
        # Best alias combination per pair, in pair order.
        return df[['id', 'match']].groupby('id').max().match.to_list()
    
class AddressMatcher:
    """Character n-gram TF-IDF similarity between pairs of addresses.

    ``fit`` learns the n-gram vocabulary, ``predict`` scores aligned pairs by
    the cosine similarity of their TF-IDF vectors.
    """

    def ngrams(self, string):
        """Return the character 1-, 2- and 3-grams of the normalised address.

        Common street-type words are removed first. Only a trailing space is
        padded here, so a street-type word at the very start of the string is
        not removed.
        """
        string = clean_string(string)
        for word in ['ave', 'avenue', 'st', 'street', 'rd', 'road', 'blvd', 'boulevard', 'bulevar', 'bulevardul']:
            string = (string + " ").replace(" " + word + " ", " ").strip()
        ngrams = []
        for n in [1,2,3]:
            _ngrams = zip(*[string[i:] for i in range(n)])
            _ngrams = ["".join(ngram) for ngram in _ngrams]
            ngrams += list(_ngrams)
        return ngrams

    def fit(self, names):
        """Fit the TF-IDF vocabulary on an iterable of addresses; returns self."""
        self.vectorizer = TfidfVectorizer(min_df = 1, analyzer=self.ngrams)
        self.vectorizer.fit(names)
        return self

    def predict(self, list1, list2, batch_size = 256):
        """Score ``list1[i]`` against ``list2[i]``; returns a 1-D numpy array."""
        assert len(list1) == len(list2)
        results = [self.predict_batch(list1[i : i+batch_size], list2[i : i + batch_size])
                           for i in tqdm(range(0, len(list1), batch_size))]
        results = [r for result in results for r in result]
        return np.array(results)

    @staticmethod
    def _rowwise_cosine(mat1, mat2):
        """Cosine similarity of row i of ``mat1`` with row i of ``mat2``.

        Works on scipy sparse matrices of identical shape. Rows with zero norm
        score 0. Only the aligned pairs are computed, not the full pairwise
        matrix.
        """
        dots = np.asarray(mat1.multiply(mat2).sum(axis = 1), dtype = float).ravel()
        sq1 = np.asarray(mat1.multiply(mat1).sum(axis = 1), dtype = float).ravel()
        sq2 = np.asarray(mat2.multiply(mat2).sum(axis = 1), dtype = float).ravel()
        denom = np.sqrt(sq1 * sq2)
        out = np.zeros_like(dots)
        np.divide(dots, denom, out = out, where = denom > 0)
        return out

    def predict_batch(self, list1, list2):
        """Score one batch of aligned address pairs; returns a list of floats."""
        mat1 = self.vectorizer.transform(list1)
        mat2 = self.vectorizer.transform(list2)
        # Only the aligned rows are needed; the former cosine_similarity call
        # built the full N x N matrix just to read its diagonal.
        match = list(self._rowwise_cosine(mat1, mat2))
        return match
        
# Fit the name matcher on every name; astype(str) turns missing names into
# the string 'nan', so they take part in the vocabulary too.
name_matcher = NameMatcher().fit(data.name.astype(str).tolist())

# Fit the address matcher on the non-empty addresses only. (The former
# `address_matcher = NameMatcher()` line was dead code: it was overwritten
# by the AddressMatcher fitted below.)
addresses = data["address"].fillna("").astype(str).to_list()
address_matcher = AddressMatcher().fit([address for address in addresses if address.strip() != ""])

CPU times: user 1min 22s, sys: 1.82 s, total: 1min 24s
Wall time: 1min 24s
Compiler : 240 ms


In [32]:
# Persist the id lists of both halves so other notebooks can reuse the split.
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

# A dict is saved here, so np.save stores it as a pickled object array.
np.save('tv_ids_d.npy', tv_ids_d)

del train_data, valid_data
gc.collect()

# From here on `data` only holds the 'valid_ids' half, in the order of that
# id list, with a fresh 0..n-1 index.
data = data.set_index('id')
data = data.loc[tv_ids_d['valid_ids']]
data = data.reset_index()

In [33]:
data.shape

(569406, 14)

In [34]:
data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,cafe stad oudenaarde,starbuck,coffee cat,tsuzitian wei cheng nozhang,signature propertie savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,abdijstraat nederename oostvlaanderen,ibi avenue dhahran ash sharqiyah,f torre st davao city davao region,shen tian xiao chuan ting 11 qian dai tian qu ...,100 commercial ct ste c savannah ga
city,nederename,dhahran,davao city,qian dai tian qu,savannah
state,oostvlaanderen,ash sharqiyah,davao region,dong jing du,ga
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [35]:
# id -> row label of `data`; add_features() uses these values as row
# positions into the TF-IDF matrices built below.
id2index_d = dict(zip(data['id'].values, data.index))

# One TF-IDF matrix per vectorised column, rows aligned with `data`.
# NOTE(review): missing values are replaced by the literal string 'nan'
# before vectorising, so missing entries are treated as ordinary text;
# confirm this is intended.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

In [36]:
## Train data generated by Simple recall & knn recall
# Candidate pairs from exact value matches and from coordinate neighbours.
train_data_simple = recall_simple(data, threshold)
train_data = recall_knn(data, num_neighbors)

print('train data by knn: %s' % len(train_data))
# Union of both candidate sets; pairs found only by the simple recall get NaN
# in the knn distance/rank columns.
train_data = train_data.merge(train_data_simple,
                             on = ['id', 'match_id'],
                             how = 'outer')
del train_data_simple
gc.collect()

  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 1835046
Num of data per id: 4.257963468285348
Start knn grouped by country


  0%|          | 0/210 [00:00<?, ?it/s]

Start knn
train data by knn: 17517870


0

In [37]:
# Index by id so both sides of every pair can be looked up by label.
data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()

poi = data.loc[ids]['point_of_interest'].values
match_poi = data.loc[match_ids]['point_of_interest'].values

# label = 1 when both records of the pair belong to the same point of interest.
train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, ids, match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

Num of unique id: 569406
Num of train data: 18844701
Pos rate: 0.0549923291433491


In [38]:
train_data[train_data.id=='E_1d029a2db783ca']

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
64791,E_1d029a2db783ca,E_1d029a2db783ca,0.0,0.0,0.0,0.0,1
634197,E_1d029a2db783ca,E_a234ddf642795c,0.000599,1.0,0.000514,1.0,1
1203603,E_1d029a2db783ca,E_44917cedfbd5ec,0.00091,2.0,0.000838,3.0,1
1773009,E_1d029a2db783ca,E_57ade5df7f4692,0.001057,3.0,0.000817,2.0,1
2342415,E_1d029a2db783ca,E_e0e9af0727107a,0.003404,4.0,0.002632,4.0,0
2911821,E_1d029a2db783ca,E_fb3ec221a5f80c,0.003678,5.0,0.002939,5.0,0
3481227,E_1d029a2db783ca,E_98a0cd6756099e,0.005317,6.0,0.004119,6.0,0
4050633,E_1d029a2db783ca,E_8f68217136805f,0.005907,7.0,0.005546,7.0,0
4620039,E_1d029a2db783ca,E_24a99e772366b1,0.006802,8.0,0.005554,8.0,0
5189445,E_1d029a2db783ca,E_35fc9db907dbb7,0.007284,9.0,0.005871,9.0,0


In [39]:
## Eval
# Score the candidate set itself: the IoU obtained if exactly the label == 1
# candidates (plus the record itself) were predicted as matches.
data = data.reset_index()

# get_score() reads these two module-level lookups.
id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

# Every id matches itself, so each id appears at least once.
eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

# Add all recalled pairs that are true matches.
eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

# Collapse to one row per id with a space-separated, de-duplicated match list.
eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 569406
Unique id: 569406
IoU score: 0.9337913779602858


In [46]:
# Attach the attributes of the query record (suffix _x) via `id` ...
train_data = train_data.merge(data[['id','country','name','address','latitude','longitude']].rename(
    columns={'name':'name_x','address':'address_x','latitude':'latitude_x','longitude':'longitude_x'}), on='id')

# ... and those of the candidate record (suffix _y) via `match_id`. Both
# frames carry an `id` column here, so pandas suffixes them to id_x / id_y;
# the candidate's copy (id_y) duplicates match_id and is dropped.
train_data = train_data.merge(data[['id','name','address','latitude','longitude']].rename(
    columns={'name':'name_y','address':'address_y','latitude':'latitude_y','longitude':'longitude_y'}), 
                              left_on='match_id', right_on='id').drop('id_y', axis=1)

# Restore the original name of the query id column.
train_data = train_data.rename(columns={'id_x':'id'})

In [54]:
# Convert all four coordinate columns to radians, stored as float32.
for coord_col in ['latitude_x', 'longitude_x', 'latitude_y', 'longitude_y']:
    train_data[coord_col] = train_data[coord_col].apply(radians).astype(np.float32)

# Absolute coordinate differences of each pair (no wrap-around handling).
lat_gap = train_data['latitude_x'] - train_data['latitude_y']
lon_gap = train_data['longitude_x'] - train_data['longitude_y']
train_data['dlat'] = lat_gap.abs().astype(np.float32)
train_data['dlon'] = lon_gap.abs().astype(np.float32)
# Pre-computed per-country lookup value from country_poi_count_mean.
train_data['country_x_poi_count_mean'] = train_data['country'].map(country_poi_count_mean).astype(np.float32)

In [55]:
# The raw country and coordinate columns were only needed to derive
# dlat / dlon / country_x_poi_count_mean.
train_data = train_data.drop(['country','latitude_x','latitude_y','longitude_x','longitude_y'], axis=1)

In [None]:
# Name similarity of every pair. Fixes over the previous version:
#  * it read from an undefined `df` (NameError) instead of `train_data`;
#  * it assigned through chained indexing (`frame[col][mask] = ...`);
#  * missing values are real NaN in these columns (the cleaning cells replace
#    'nan' / '' by np.nan), so comparing to the string "nan" alone never
#    matched. Both spellings are treated as missing now.
name_missing = (
    train_data['name_x'].isna() | (train_data['name_x'] == "nan")
    | train_data['name_y'].isna() | (train_data['name_y'] == "nan")
)
name_scores = name_matcher.predict(train_data.name_x.tolist(), train_data.name_y.tolist())
train_data['name_similarity'] = np.where(name_missing, np.nan, name_scores).astype(np.float32)

# Same treatment for the combined address strings.
address_missing = (
    train_data['address_x'].isna() | (train_data['address_x'] == "nan")
    | train_data['address_y'].isna() | (train_data['address_y'] == "nan")
)
address_scores = address_matcher.predict(train_data.address_x.tolist(), train_data.address_y.tolist())
train_data['addrs_similarity'] = np.where(address_missing, np.nan, address_scores).astype(np.float32)

In [None]:
# The raw pair texts were only needed for the two similarity features above.
train_data = train_data.drop(['name_x','name_y','address_x','address_y'], axis=1)

In [None]:
# Running totals for the chunked feature generation below.
count = 0  # pair rows written so far
start_row = 0  # offset into unique_id where the next chunk starts

# add_features() / add_fuzz_features() look records up by id label.
data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
# Ids per chunk; the last chunk also takes the remainder of the division.
num_split_id = len(unique_id) // num_split

In [None]:
num_split_id, len(unique_id), num_split

In [None]:
## Add features
# Process the pair table in `num_split` chunks of query ids and write each
# chunk's features to its own CSV file.
for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # Every chunk but the last takes num_split_id ids; the last takes the rest.
    cur_id = unique_id[start_row : end_row] if k < num_split else unique_id[start_row: ]
    cur_data = train_data[train_data['id'].isin(cur_id)]

    cur_data = add_features(cur_data)
    print(cur_data.shape)
    cur_data = add_fuzz_features(cur_data)
    print(cur_data.shape)
    print(cur_data.sample(1))

    cur_data.to_csv('../src/data/processed/valid_all_25n_%s.csv' % k, index = False)
    count += len(cur_data)
    start_row = end_row

    # Free the chunk before building the next one.
    del cur_data
    gc.collect()

print(count)