In [32]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from fuzzywuzzy import fuzz
from fuzzywuzzy.fuzz import WRatio, partial_ratio, QRatio, token_set_ratio, token_sort_ratio, partial_token_sort_ratio

In [2]:
## Parameters
is_debug = False
SEED = 2022
num_neighbors = 25
num_split = 3
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']
vec_columns = ['name', 'categories', 'address', 'state', 'url', 'country']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(SEED)

In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    cdef int i, j
    cdef list dp = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for i in range(len(S)):
        for j in range(len(T)):
            dp[i + 1][j + 1] = max(dp[i][j] + (S[i] == T[j]), dp[i + 1][j], dp[i][j + 1], dp[i + 1][j + 1])
    return dp[len(S)][len(T)]

In [50]:
def recall_knn(df, Neighbors = 10):
    print('Start knn grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                        return_distance = True)

        for k in range(neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            train_df_country.append(cur_df)
    train_df_country = pd.concat(train_df_country)
    
    print('Start knn')
    train_df = []
    knn = NearestNeighbors(n_neighbors = Neighbors)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(Neighbors):            
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country
    
    return train_df

In [94]:
def add_features(df):    
    for col in tqdm(['name', 'categories', 'address','url']):       
#         if col in vec_columns:
#             tv_fit = tfidf_d[col]
#             indexs = [id2index_d[i] for i in df['id']]
#             match_indexs = [id2index_d[i] for i in df['match_id']]                    
#             df[f'{col}_sim'] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
        
        col_values = data.loc[df['id']][col].values.astype(str)
        matcol_values = data.loc[df['match_id']][col].values.astype(str)
        
#         geshs = []
#         levens = []
#         jaros = []
#         lcss = []
        w_ratio = []
        partialratio = []
        tokenset_ratio = []
        tokensort_ratio = []
        fuzz_power = []
        
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':                    
#                 geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
#                 levens.append(Levenshtein.distance(s, match_s))
#                 jaros.append(Levenshtein.jaro_winkler(s, match_s))
#                 lcss.append(LCS(str(s), str(match_s)))
                
                if col in ['name', 'categories', 'address','url']:
                
                    s = str(s).lower().split()
                    match_s = str(match_s).lower().split()
                    a, b = WRatio(s, match_s), partial_ratio(s, match_s)
                    c, d = token_set_ratio(s, match_s), token_sort_ratio(s, match_s)
                    w_ratio.append(a)
                    partialratio.append(b)
                    tokenset_ratio.append(c)
                    tokensort_ratio.append(d)
                    fuzz_power.append((a+b+c+d)/4.)
            else:
#                 geshs.append(np.nan)
#                 levens.append(np.nan)
#                 jaros.append(np.nan)
#                 lcss.append(np.nan)
                
                if col in ['name', 'categories', 'address','url']:
                    w_ratio.append(np.nan)
                    partialratio.append(np.nan)
                    tokenset_ratio.append(np.nan)
                    tokensort_ratio.append(np.nan)
                    fuzz_power.append(np.nan)
        
#         df[f'{col}_gesh'] = geshs
#         df[f'{col}_leven'] = levens
#         df[f'{col}_jaro'] = jaros
#         df[f'{col}_lcs'] = lcss
        
        if col in ['name', 'categories', 'address','url']:
            df[f'{col}_w_ratio'] = w_ratio
            df[f'{col}_partial_ratio'] = partialratio
            df[f'{col}_tokenset_ratio'] = tokenset_ratio
            df[f'{col}_tokensort_ratio'] = tokensort_ratio
            df[f'{col}_fuzz_power'] = fuzz_power
        
#         if col not in ['phone', 'zip']:
#             df[f'{col}_len'] = list(map(len, col_values))
#             df[f'match_{col}_len'] = list(map(len, matcol_values)) 
#             df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
#             df[f'{col}_nleven'] = df[f'{col}_leven']/df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)
            
#             df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
#             df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']
            
#             df = df.drop(f'{col}_len', axis = 1)
#             df = df.drop(f'match_{col}_len', axis = 1)
            gc.collect()
            
    return df

In [10]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

def analysis(df):
    print('Num of data: %s' % len(df))
    print('Num of unique id: %s' % df['id'].nunique())
    print('Num of unique poi: %s' % df['point_of_interest'].nunique())
    
    poi_grouped = df.groupby('point_of_interest')['id'].count().reset_index()
    print('Mean num of unique poi: %s' % poi_grouped['id'].mean())

In [58]:
## Data load
data_root = '../src/data/raw/'  #'../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

if is_debug:
    data = data.sample(n = 10000, random_state = SEED)
    data = data.reset_index(drop = True)

In [59]:
## Data split
kf = GroupKFold(n_splits=2)
for i, (trn_idx, val_idx) in enumerate(kf.split(data, data['point_of_interest'], data['point_of_interest'])):
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64


In [60]:
valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()
      
print(set(train_ids) & set(valid_ids))

Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
set()
set()


In [61]:
train_data.head().T

Unnamed: 0,0,10,11,12,13
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,Café Stad Oudenaarde,Starbucks,Coffee Cat,つじ田 味噌の章,Signature Properties Savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,Abdijstraat,Ibis Avenue,F. Torres St.,神田小川町1-1,100 Commercial Ct Ste C
city,Nederename,Dhahran,Davao City,千代田区,Savannah
state,Oost-Vlaanderen,Ash Sharqiyah,Davao Region,東京都,GA
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [62]:
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

np.save('tv_ids_d.npy', tv_ids_d)

del train_data, valid_data
gc.collect()

data = data.set_index('id')
data = data.loc[tv_ids_d['train_ids']]
data = data.reset_index()

In [63]:
data.shape

(569406, 14)

In [64]:
data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,Café Stad Oudenaarde,Starbucks,Coffee Cat,つじ田 味噌の章,Signature Properties Savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,Abdijstraat,Ibis Avenue,F. Torres St.,神田小川町1-1,100 Commercial Ct Ste C
city,Nederename,Dhahran,Davao City,千代田区,Savannah
state,Oost-Vlaanderen,Ash Sharqiyah,Davao Region,東京都,GA
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [65]:
id2index_d = dict(zip(data['id'].values, data.index))

tfidf_d = {}
for col in vec_columns:
    data[col] = data[col].apply(lambda x: str(x).lower())
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

In [66]:
tfidf_d['name']

<569406x235452 sparse matrix of type '<class 'numpy.float64'>'
	with 1519083 stored elements in Compressed Sparse Row format>

In [67]:
## Train data generated by knn
train_data = recall_knn(data, num_neighbors)

data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()

poi = data.loc[ids]['point_of_interest'].values
match_poi = data.loc[match_ids]['point_of_interest'].values

train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, ids, match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

Start knn grouped by country


  0%|          | 0/211 [00:00<?, ?it/s]

Start knn
Num of unique id: 569406
Num of train data: 17517870
Pos rate: 0.05871781215410321


In [68]:
train_data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
match_id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
kdist,0,0,0,0,0
kneighbors,0,0,0,0,0
kdist_country,0,0,0,0,0
kneighbors_country,0,0,0,0,0
label,1,1,1,1,1


In [69]:
train_data[train_data.label==0].head(5)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
261,E_0020ff8f3b06f9,E_241b3370406f67,0.0,0.0,,,0
351,E_002c767fde5d45,E_15d9faca67fdad,0.0,0.0,0.0,4.0,0
1571,E_00bbd9375a59eb,E_241b3370406f67,0.0,0.0,,,0
1680,E_00c720383fcdb5,E_cf5434e5494ebf,0.0,0.0,0.0,1.0,0
3635,E_01a3bbea9cd02f,E_caf447b497ce4e,0.0,0.0,0.0,1.0,0


In [70]:
train_data[train_data.label==1].head(5)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
0,E_000001272c6c5d,E_000001272c6c5d,0.0,0.0,0.0,0.0,1
1,E_00009ab517afac,E_00009ab517afac,0.0,0.0,0.0,0.0,1
2,E_0000c362229d93,E_0000c362229d93,0.0,0.0,0.0,0.0,1
3,E_0000c566a81ea1,E_0000c566a81ea1,0.0,0.0,0.0,0.0,1
4,E_0000d9e584ed9f,E_0000d9e584ed9f,0.0,0.0,0.0,0.0,1


In [71]:
train_data[(train_data.label==1) & (train_data.kneighbors_country!=0)].sort_values('kneighbors_country', 
                                                                                   ascending=False)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
14069913,E_b5718f72673423,E_e689b48b1bcd06,0.001637,24.0,0.001621,24.0,1
13972412,E_89bfaf6070a2c9,E_b25a1e3dfe530d,0.001979,24.0,0.001978,24.0,1
13967267,E_87639bd6272f5f,E_7d1728f9a87f19,0.002221,24.0,0.002015,24.0,1
13968798,E_8815d9e0e6ae8d,E_c6bc3d5d4e5d4c,0.013413,24.0,0.013238,24.0,1
13180252,E_259aaf0834c347,E_3bd346cb41335e,0.011254,23.0,0.011158,24.0,1
...,...,...,...,...,...,...,...
14232246,E_feadd6466cea48,E_00a9d8dc5f1346,0.000697,24.0,,,1
14234133,E_ff88521befa92f,E_874e0d1caae1a9,0.007058,24.0,,,1
14234727,E_ffcf513c14b1be,E_e29e56db7686c2,0.015216,24.0,,,1
14234737,E_ffd0811db4def9,E_c627769e240422,0.001931,24.0,,,1


In [72]:
train_data[train_data.id=='E_1d029a2db783ca']

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
64791,E_1d029a2db783ca,E_1d029a2db783ca,0.0,0.0,0.0,0.0,1
634197,E_1d029a2db783ca,E_a234ddf642795c,0.000599,1.0,0.000514,1.0,1
1203603,E_1d029a2db783ca,E_44917cedfbd5ec,0.00091,2.0,0.000838,3.0,1
1773009,E_1d029a2db783ca,E_57ade5df7f4692,0.001057,3.0,0.000817,2.0,1
2342415,E_1d029a2db783ca,E_e0e9af0727107a,0.003404,4.0,0.002632,4.0,0
2911821,E_1d029a2db783ca,E_fb3ec221a5f80c,0.003678,5.0,0.002939,5.0,0
3481227,E_1d029a2db783ca,E_98a0cd6756099e,0.005317,6.0,0.004119,6.0,0
4050633,E_1d029a2db783ca,E_8f68217136805f,0.005907,7.0,0.005546,7.0,0
4620039,E_1d029a2db783ca,E_24a99e772366b1,0.006802,8.0,0.005554,8.0,0
5189445,E_1d029a2db783ca,E_35fc9db907dbb7,0.007284,9.0,0.005871,9.0,0


In [73]:
## Eval
data = data.reset_index()

id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 569406
Unique id: 569406
IoU score: 0.9291118121717638


In [76]:
count = 0
start_row = 0

data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
num_split_id = len(unique_id) // num_split

In [77]:
num_split_id, len(unique_id), num_split

(189802, 569406, 3)

In [100]:
## Add features
for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    if k < num_split:
        cur_id = unique_id[start_row : end_row]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    else:
        cur_id = unique_id[start_row: ]
        cur_data = train_data[train_data['id'].isin(cur_id)]
    
    cur_data = add_features(cur_data)
    print(cur_data.shape)
    print(cur_data.sample(1))
    
    cur_data.to_csv('../src/data/processed/train_fuzz_data_25n_%s.csv' % k, index = False)    
    start_row = end_row
    count += len(cur_data)
    
    del cur_data
    gc.collect()
    
print(count)

Current split: 2


  0%|          | 0/4 [00:00<?, ?it/s]

(5848974, 27)
                       id          match_id     kdist  kneighbors  \
2073966  E_a438f2d2fc51d8  E_fccab7b0aa8ace  0.001111         3.0   

         kdist_country  kneighbors_country  label  name_w_ratio  \
2073966       0.001111                 3.0      0          45.0   

         name_partial_ratio  name_tokenset_ratio  name_tokensort_ratio  \
2073966                42.0                 24.0                  24.0   

         name_fuzz_power  categories_w_ratio  categories_partial_ratio  \
2073966            33.75                56.0                      64.0   

         categories_tokenset_ratio  categories_tokensort_ratio  \
2073966                       47.0                        47.0   

         categories_fuzz_power  address_w_ratio  address_partial_ratio  \
2073966                   53.5              NaN                    NaN   

         address_tokenset_ratio  address_tokensort_ratio  address_fuzz_power  \
2073966                     NaN                     

  0%|          | 0/4 [00:00<?, ?it/s]

(5831684, 27)
                       id          match_id     kdist  kneighbors  \
3963439  E_f5e6966812dd21  E_3d1830b79beab7  0.005119         6.0   

         kdist_country  kneighbors_country  label  name_w_ratio  \
3963439       0.003185                 6.0      0          41.0   

         name_partial_ratio  name_tokenset_ratio  name_tokensort_ratio  \
3963439                43.0                 43.0                  43.0   

         name_fuzz_power  categories_w_ratio  categories_partial_ratio  \
3963439             42.5                 NaN                       NaN   

         categories_tokenset_ratio  categories_tokensort_ratio  \
3963439                        NaN                         NaN   

         categories_fuzz_power  address_w_ratio  address_partial_ratio  \
3963439                    NaN              NaN                    NaN   

         address_tokenset_ratio  address_tokensort_ratio  address_fuzz_power  \
3963439                     NaN                     