In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from collections import Counter

import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
## Parameters
# When True, the data-load cell works on a 10,000-row random sample instead of the full file.
is_debug = False
# Seed passed to seed_everything() and to the debug sample.
SEED = 2022
# Number of nearest neighbours requested from recall_knn() per row.
num_neighbors = 25
# Number of id chunks the feature-generation loop processes and writes separately.
num_split = 3

# Passed to recall_simple(), which keeps a candidate only when its match count
# is strictly greater than this value.
threshold = 2

# Columns add_features() computes string-similarity features for.
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']
# Columns that get a TF-IDF matrix and therefore a `<col>_sim` feature.
vec_columns = ['name', 'categories', 'address', 'state', 'url', 'country']

# Columns recall_simple() matches on (case-insensitive exact match).
# NOTE(review): 'address' appears twice, so a single address match contributes
# 2 to the count compared against `threshold`. Confirm whether this weighting
# is intentional or whether another column was meant.
recall_columns = ['name', 'address', 'categories', 'address', 'phone']

def seed_everything(seed):
    """Seed the `random` and NumPy global generators and export PYTHONHASHSEED.

    The environment variable is only read at interpreter start-up, so setting
    it here affects child processes, not hashing in the running kernel.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Seed the global RNGs once, before any sampling happens in later cells.
seed_everything(SEED)

In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Uses the standard dynamic programme but keeps only the previous and the
    current row of the table, so each call allocates O(len(T)) cells per row
    instead of a full (len(S) + 1) x (len(T) + 1) list of lists.
    """
    cdef Py_ssize_t i, j
    cdef Py_ssize_t n = len(S)
    cdef Py_ssize_t m = len(T)
    cdef list prev = [0] * (m + 1)
    cdef list cur
    for i in range(n):
        cur = [0] * (m + 1)
        for j in range(m):
            if S[i] == T[j]:
                # Extending the diagonal is never worse than the two neighbours.
                cur[j + 1] = prev[j] + 1
            else:
                cur[j + 1] = max(prev[j + 1], cur[j])
        prev = cur
    return prev[m]

In [5]:
def recall_simple(df, threshold):
    """Generate candidate pairs from case-insensitive exact attribute matches.

    For every row, each column listed in the module-level `recall_columns`
    contributes the set of ids sharing that row's (lower-cased) value. A
    column listed more than once contributes once per listing. An id becomes a
    candidate when its total count is strictly greater than `threshold`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'id' and every column named in `recall_columns`.
    threshold : int
        Candidates need a match count > threshold.

    Returns
    -------
    pd.DataFrame
        Columns 'id' and 'match_id', duplicates removed. May be empty.

    Notes
    -----
    Missing values are replaced by the sentinel string 'null' and skipped, so
    a genuine value equal to 'null' is treated as missing as well.
    """
    # Inverted index per column: lower-cased value -> set of ids carrying it.
    # A column repeated in `recall_columns` only needs its index built once.
    val2id_d = {}
    for col in dict.fromkeys(recall_columns):
        # Explicit copy so the lower-casing never writes into a slice of `df`.
        temp_df = df[['id', col]].copy()
        temp_df[col] = temp_df[col].str.lower()
        # groupby drops missing keys, so NaN never becomes an index entry.
        val2id_d[col] = temp_df.groupby(col)['id'].apply(set).to_dict()

    cus_ids = []
    match_ids = []
    for vals in tqdm(df[recall_columns + ['id']].fillna('null').values):
        cus_id = vals[-1]

        # How many recall columns (with repeats) each other id agrees on.
        rec_match_count = Counter()
        for col, val in zip(recall_columns, vals):
            if val != 'null':
                rec_match_count.update(val2id_d[col][val.lower()])

        match_id = [k for k, v in rec_match_count.items() if v > threshold]
        cus_ids.extend([cus_id] * len(match_id))
        match_ids.extend(match_id)

    train_df = pd.DataFrame({'id': cus_ids, 'match_id': match_ids})
    train_df = train_df.drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    num_ids = train_df['id'].nunique()
    # No surviving pair means no ids either; report 0.0 instead of dividing by zero.
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data: %s' % num_data)
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

In [6]:
def _neighbors_to_pairs(ids, dists, nears, dist_col, rank_col):
    """Flatten `kneighbors` output into one long (id, match_id, distance, rank) frame.

    Rows are ordered rank-major: all rank-0 pairs first, then all rank-1 pairs, ...
    """
    num_rows, num_ranks = nears.shape
    return pd.DataFrame({
        'id': np.tile(ids, num_ranks),
        'match_id': ids[nears.T.ravel()],
        dist_col: dists.T.ravel(),
        rank_col: np.repeat(np.arange(num_ranks), num_rows),
    })


def recall_knn(df, Neighbors = 10):
    """Generate candidate pairs from coordinate nearest neighbours.

    Two searches are run on ('latitude', 'longitude'): one inside each
    'country' group and one over the whole frame. Their results are
    outer-merged on ('id', 'match_id').

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'id', 'country', 'latitude' and 'longitude'.
    Neighbors : int, default 10
        Neighbours requested per row; capped at the number of available rows.

    Returns
    -------
    pd.DataFrame
        Columns 'id', 'match_id', 'kdist', 'kneighbors', 'kdist_country',
        'kneighbors_country'. Columns from a search that did not produce the
        pair are NaN.
    """
    print('Start knn grouped by country')
    train_df_country = []
    # groupby skips rows whose country is missing, so those rows only get
    # candidates (and features) from the global search below.
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        neighbors = min(len(country_df), Neighbors)
        # NOTE(review): scikit-learn documents the haversine metric as taking
        # [latitude, longitude] in radians, while these columns look like
        # degrees. Left unchanged so the features stay comparable with any
        # other pipeline built the same way. Confirm and convert everywhere.
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)
        # The regressor needs a target to fit; only its neighbour search is used.
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']],
                                        return_distance = True)

        train_df_country.append(
            _neighbors_to_pairs(country_df['id'].to_numpy(), dists, nears,
                                'kdist_country', 'kneighbors_country'))
    train_df_country = pd.concat(train_df_country)

    print('Start knn')
    # Same cap as the per-country search: asking for more neighbours than
    # there are rows makes kneighbors() raise.
    neighbors = min(len(df), Neighbors)
    knn = NearestNeighbors(n_neighbors = neighbors)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])

    # `nears` holds row positions, so index the id array positionally.
    train_df = _neighbors_to_pairs(df['id'].to_numpy(), dists, nears,
                                   'kdist', 'kneighbors')
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [7]:
def add_features(df):
    """Attach pairwise text-similarity features to a frame of candidate pairs.

    Relies on module-level state: `data` (indexed by id), `tfidf_d`,
    `id2index_d`, `feat_columns` and `vec_columns`.

    For every column in `feat_columns` the function adds `<col>_gesh`
    (difflib ratio), `<col>_leven`, `<col>_jaro` and `<col>_lcs`; columns in
    `vec_columns` also get `<col>_sim`; columns other than 'phone' and 'zip'
    also get `<col>_len_diff`, `<col>_nleven`, `<col>_nlcsk` and `<col>_nlcs`.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs with 'id' and 'match_id' columns. Feature columns are
        assigned onto this frame, so the caller's object is modified.

    Returns
    -------
    pd.DataFrame
        The frame with all feature columns added.

    Notes
    -----
    Missing values are stringified to 'nan'. The four string metrics are NaN
    when either side is missing, but the length features still see the
    3-character string 'nan' (NOTE(review): confirm that is intended).
    """
    # TF-IDF row numbers depend only on the ids, not on the column, so resolve
    # them once. An unknown id still raises KeyError.
    indexs = np.array([id2index_d[i] for i in df['id']], dtype = np.intp)
    match_indexs = np.array([id2index_d[i] for i in df['match_id']], dtype = np.intp)

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            # Row-wise dot product of the two TF-IDF rows of each pair.
            # np.asarray copes with both np.matrix and plain ndarray results.
            pair_dot = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            df[f'{col}_sim'] = np.asarray(pair_dot).ravel()

        # Select the single column inside .loc so the other columns are not
        # materialised for every pair.
        col_values = data.loc[df['id'], col].values.astype(str)
        matcol_values = data.loc[df['match_id'], col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s != 'nan' and match_s != 'nan':
                geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
                levens.append(Levenshtein.distance(s, match_s))
                jaros.append(Levenshtein.jaro_winkler(s, match_s))
                lcss.append(LCS(str(s), str(match_s)))
            else:
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])
            # Edit distance normalised by the longer of the two strings.
            df[f'{col}_nleven'] = df[f'{col}_leven'] / \
                                    df[[f'{col}_len', f'match_{col}_len']].max(axis = 1)

            # LCS length normalised by each side's length.
            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / df[f'match_{col}_len']
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / df[f'{col}_len']

            # The raw lengths were only needed for the normalisations above.
            df = df.drop(columns = [f'{col}_len', f'match_{col}_len'])
            gc.collect()

    return df

In [8]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the 'id' column to its 'point_of_interest' value."""
    id_column = input_df['id']
    poi_column = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(id_column, poi_column)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every 'point_of_interest' value to the set of ids that carry it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    `input_df` needs an 'id' column and a 'matches' column holding
    whitespace-separated ids. The true set of each id is looked up through the
    module-level `id2poi` and `poi2ids` dictionaries.
    """
    def _iou(id_str, matches):
        truth = poi2ids[id2poi[id_str]]
        predicted = set(matches.split())
        return len(truth & predicted) / len(truth | predicted)

    id_values = input_df['id'].to_numpy()
    match_values = input_df['matches'].to_numpy()
    per_row = np.array([_iou(i, m) for i, m in zip(id_values, match_values)])
    return per_row.mean()

def analysis(df):
    """Print row count, unique ids, unique POIs and the mean number of rows per POI."""
    num_rows = len(df)
    num_ids = df['id'].nunique()
    num_pois = df['point_of_interest'].nunique()
    print('Num of data: %s' % num_rows)
    print('Num of unique id: %s' % num_ids)
    print('Num of unique poi: %s' % num_pois)

    # Despite the label, this is the average number of rows sharing one POI.
    rows_per_poi = df.groupby('point_of_interest')['id'].count()
    print('Mean num of unique poi: %s' % rows_per_poi.mean())

In [9]:
## Data load
# Directory holding train.csv; the trailing comment keeps the Kaggle input
# location for reference.
data_root = '../src/data/raw/'  #'../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

# Debug mode: keep a seeded 10,000-row random sample and rebuild a 0..n-1
# index so row labels equal row positions again.
if is_debug:
    data = data.sample(n = 10000, random_state = SEED)
    data = data.reset_index(drop = True)

In [10]:
## Data split
# Two folds grouped by point_of_interest: every POI lands entirely in one
# fold, so no true match pair straddles the train/valid boundary.
kf = GroupKFold(n_splits=2)
# kf.split yields *positional* row indices. Collect the fold number in a plain
# array and attach it once, instead of writing through the label-based
# data.loc[...], which only agrees with positions while the index is 0..n-1.
# This also keeps 'set' an integer column rather than a float one.
fold_of_row = np.full(len(data), -1, dtype = np.int8)
for i, (trn_idx, val_idx) in enumerate(kf.split(data, data['point_of_interest'], data['point_of_interest'])):
    fold_of_row[val_idx] = i
data['set'] = fold_of_row

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64


In [11]:
# Fold 0 is used as the validation set, fold 1 as the training set.
valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

# Leakage check: the two folds must not share any POI (expected output: set()).
print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()
      
# Leakage check: the two folds must not share any id (expected output: set()).
print(set(train_ids) & set(valid_ids))

Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
set()
set()


In [12]:
# First rows of the training fold, transposed so each attribute reads as a row.
train_data.head().T

Unnamed: 0,0,10,11,12,13
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,Café Stad Oudenaarde,Starbucks,Coffee Cat,つじ田 味噌の章,Signature Properties Savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,Abdijstraat,Ibis Avenue,F. Torres St.,神田小川町1-1,100 Commercial Ct Ste C
city,Nederename,Dhahran,Davao City,千代田区,Savannah
state,Oost-Vlaanderen,Ash Sharqiyah,Davao Region,東京都,GA
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [13]:
# Keep the id lists of both folds together so they can be saved and reused.
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

# np.save stores a dict as a pickled object array; reading it back requires
# np.load(..., allow_pickle=True).
np.save('tv_ids_d.npy', tv_ids_d)

# Note: this reuses the name `train_data`; a later cell rebinds it to the
# candidate-pair frame.
del train_data, valid_data
gc.collect()

# From here on `data` holds only the validation-fold rows, in valid_ids order,
# with a fresh 0..n-1 index and 'id' back as a regular column.
data = data.set_index('id')

data = data.loc[tv_ids_d['valid_ids']]

data = data.reset_index()

In [14]:
# Sanity check: the row count should equal the validation-fold size.
data.shape

(569406, 14)

In [15]:
# First rows of the restricted frame, transposed for readability.
data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000002eae2a589,E_000007f24ebc95,E_000008a8ba4f48,E_00001d92066153,E_000023d8f4be44
name,Carioca Manero,ร้านตัดผมการาเกด,Turkcell,Restaurante Casa Cofiño,Island Spa
latitude,-22.9072,13.7808,37.8445,43.3382,14.519
longitude,-43.1782,100.485,27.8442,-4.32682,121.019
address,,,Adnan Menderes Bulvarı,,"5th Flr, Newport Mall, Resorts World Manila"
city,,,,Caviedes,Pasay City
state,,,,Cantabria,Metro Manila
zip,,,,,
country,BR,TH,TR,ES,PH
url,,,,,


In [16]:
# Row label of every id in `data`. add_features() uses these values as row
# numbers into the TF-IDF matrices built below, which is valid because `data`
# currently has a 0..n-1 index and fit_transform keeps the input row order.
id2index_d = dict(zip(data['id'].values, data.index))

# One TF-IDF matrix per column in vec_columns; only the matrix is kept, the
# fitted vectorizer is discarded.
# NOTE(review): missing values are replaced by the literal text 'nan', so two
# rows that both lack a value share a token and get a non-zero `<col>_sim`.
# Confirm this is intended.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

In [17]:
# Inspect the shape and sparsity of the 'name' TF-IDF matrix.
tfidf_d['name']

<569406x236887 sparse matrix of type '<class 'numpy.float64'>'
	with 1514513 stored elements in Compressed Sparse Row format>

In [18]:
## Train data generated by Simple recall & knn recall
# Candidates from exact attribute matches and from coordinate neighbours.
train_data_simple = recall_simple(data, threshold)
train_data = recall_knn(data, num_neighbors)

print('train data by knn: %s' % len(train_data))
# Outer merge keeps pairs found by either generator; pairs that only come from
# recall_simple have NaN in all kdist*/kneighbors* columns.
train_data = train_data.merge(train_data_simple,
                             on = ['id', 'match_id'],
                             how = 'outer')
del train_data_simple
gc.collect()

  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 1629646
Num of data per id: 4.371133522879674
Start knn grouped by country


  0%|          | 0/209 [00:00<?, ?it/s]

Start knn
train data by knn: 17512775


0

In [19]:
# Label every candidate pair: 1 when both ids share a point_of_interest.
# `data` is indexed by id from here until the eval cell resets it.
data = data.set_index('id')
ids = train_data['id'].tolist()
match_ids = train_data['match_id'].tolist()

# Select the single column inside .loc; data.loc[ids]['point_of_interest']
# would first materialise every column for each candidate row.
poi = data.loc[ids, 'point_of_interest'].values
match_poi = data.loc[match_ids, 'point_of_interest'].values

train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, ids, match_ids
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

Num of unique id: 569406
Num of train data: 18699888
Pos rate: 0.05506578435122178


In [20]:
# First candidate pairs, transposed; the rank-0 neighbour of a row is usually the row itself.
train_data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000002eae2a589,E_000007f24ebc95,E_000008a8ba4f48,E_00001d92066153,E_000023d8f4be44
match_id,E_000002eae2a589,E_000007f24ebc95,E_000008a8ba4f48,E_00001d92066153,E_000023d8f4be44
kdist,0,0,0,0,0
kneighbors,0,0,0,0,0
kdist_country,0,0,0,0,0
kneighbors_country,0,0,0,0,0
label,1,1,1,1,1


In [21]:
# A few negative pairs (different point_of_interest).
train_data[train_data.label==0].head(5)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
140,E_000fa08832fbd7,E_e5bd519bd1b61d,0.0,0.0,,,0
778,E_005c564d80c2d1,E_e5bd519bd1b61d,0.0,0.0,,,0
1029,E_007a34ca5fb6d3,E_5b4396318ac53b,0.0,0.0,0.0,0.0,0
1582,E_00bd2a9a2d54c8,E_e5bd519bd1b61d,0.0,0.0,,,0
1583,E_00bd5998b9ed72,E_0ebc85d1d48dfe,0.0,0.0,0.0,0.0,0


In [22]:
# A few positive pairs (same point_of_interest).
train_data[train_data.label==1].head(5)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
0,E_000002eae2a589,E_000002eae2a589,0.0,0.0,0.0,0.0,1
1,E_000007f24ebc95,E_000007f24ebc95,0.0,0.0,0.0,0.0,1
2,E_000008a8ba4f48,E_000008a8ba4f48,0.0,0.0,0.0,0.0,1
3,E_00001d92066153,E_00001d92066153,0.0,0.0,0.0,0.0,1
4,E_000023d8f4be44,E_000023d8f4be44,0.0,0.0,0.0,0.0,1


In [23]:
# Positive pairs that are not the rank-0 country neighbour, worst rank first.
# NaN != 0 evaluates to True, so pairs without a country rank pass this filter
# too and sort to the end.
train_data[(train_data.label==1) & (train_data.kneighbors_country!=0)].sort_values('kneighbors_country', 
                                                                                   ascending=False)

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
13690756,E_0b47aaa3227130,E_4356d020434f24,0.001035,24.0,0.001029,24.0,1
3297823,E_ca7eb5bb8b5174,E_cc93e1538bd4b6,0.002723,5.0,0.002717,24.0,1
2403184,E_38678f9c207393,E_86edbe0b37748c,0.026454,4.0,0.015331,24.0,1
13179282,E_2535c5e81006e4,E_1ce84e047a0629,0.003206,23.0,0.003180,24.0,1
6029771,E_96fb30aa63694d,E_1407649d730060,0.001954,10.0,0.001319,24.0,1
...,...,...,...,...,...,...,...
18699348,E_ffd5b7e47fa771,E_f648eb732a9b68,,,,,1
18699349,E_ffd731c74d2e55,E_67945774ea95f6,,,,,1
18699353,E_ffe3613bd707e2,E_28c1bf1b71f8bd,,,,,1
18699411,E_ffeab9916a8282,E_e9856dd6894336,,,,,1


In [24]:
# Spot-check all candidates generated for one id.
train_data[train_data.id=='E_aedb9fd786db78']

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
388994,E_aedb9fd786db78,E_aedb9fd786db78,0.0,0.0,0.0,0.0,1
958400,E_aedb9fd786db78,E_8896b2a4d2cc56,0.002241,1.0,,,1
1527806,E_aedb9fd786db78,E_bb7f3126ac9fbb,0.002709,2.0,0.000208,1.0,0
2097212,E_aedb9fd786db78,E_e246dd47d28212,0.00275,3.0,,,0
2666618,E_aedb9fd786db78,E_f8ef07a183e92d,0.002785,4.0,,,0
3236024,E_aedb9fd786db78,E_f41cf6fccd3214,0.002926,5.0,0.000852,19.0,1
3805430,E_aedb9fd786db78,E_b74439bb8bcc07,0.002939,6.0,,,0
4374836,E_aedb9fd786db78,E_d86e913c582b9b,0.002942,7.0,,,1
4944242,E_aedb9fd786db78,E_6d23b038e5e55f,0.002986,8.0,,,0
5513648,E_aedb9fd786db78,E_10038f0ddc3728,0.003061,9.0,,,0


In [25]:
## Eval
# Score the candidate set itself: assume every candidate with label 1 is
# predicted as a match and measure the resulting IoU.
data = data.reset_index()

# Lookups that get_score() reads as module-level globals.
id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

# Every id always matches itself, whether or not the generators produced that pair.
eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

# Add the true-positive candidate pairs.
eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

# Collapse to one row per id with a space-separated, de-duplicated match list.
eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 569406
Unique id: 569406
IoU score: 0.9326851184966598


In [26]:
# Running totals for the feature loop below. Re-run this cell before re-running
# that loop, otherwise `count` and `start_row` carry over from the previous run.
count = 0
start_row = 0

# add_features() looks rows up by id, so `data` must be indexed by id again.
data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
# Ids per split (floor division); the last split also takes the remainder.
num_split_id = len(unique_id) // num_split

In [27]:
# Ids per split, total ids, number of splits.
num_split_id, len(unique_id), num_split

(189802, 569406, 3)

In [28]:
## Add features
# Process the candidate pairs in `num_split` id chunks and write each chunk's
# features to its own CSV.
for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # Every split covers num_split_id ids; the last one runs to the end of the list.
    id_slice = slice(start_row, end_row) if k < num_split else slice(start_row, None)
    cur_id = unique_id[id_slice]

    cur_data = add_features(train_data[train_data['id'].isin(cur_id)])
    print(cur_data.shape)
    print(cur_data.sample(1))

    out_path = '../src/data/processed/valid_data_sr_25n_%s.csv' % k
    cur_data.to_csv(out_path, index = False)
    count += len(cur_data)
    start_row = end_row

    del cur_data
    gc.collect()

# Total number of rows written across all splits.
print(count)

Current split: 1


  0%|          | 0/9 [00:00<?, ?it/s]

(6218388, 77)
                       id          match_id     kdist  kneighbors  \
6430031  E_4ae0be2aee4cac  E_4caa5f20770a2a  1.333412        11.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
6430031            NaN                 NaN      0       0.0   0.210526   

         name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
6430031          14   0.465278         2             13        0.875   

         name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
6430031       0.125   0.666667          1.0           NaN            NaN   

         address_jaro  address_lcs  address_len_diff  address_nleven  \
6430031           NaN          NaN                 0             NaN   

         address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
6430031            NaN           NaN        NaN         NaN        NaN   

         city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
6430031       NaN              0  

  0%|          | 0/9 [00:00<?, ?it/s]

(6249423, 77)
                        id          match_id     kdist  kneighbors  \
12738253  E_5f144e604e8f96  E_8a94dea72690c3  0.005785        22.0   

          kdist_country  kneighbors_country  label  name_sim  name_gesh  \
12738253       0.005779                21.0      0       0.0   0.074074   

          name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
12738253          14   0.454545         3              5        0.875   

          name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
12738253    0.272727     0.1875          0.0      0.105263           20.0   

          address_jaro  address_lcs  address_len_diff  address_nleven  \
12738253      0.403704          2.0                 2             1.0   

          address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
12738253            0.1      0.111111   0.272727        10.0   0.505556   

          city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
12738253       3.0   

  0%|          | 0/9 [00:00<?, ?it/s]

(6232077, 77)
                       id          match_id     kdist  kneighbors  \
2204402  E_deddbc3347071e  E_5b26e9ccbd9a92  0.025768         3.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
2204402       0.025435                 3.0      0  0.494134   0.415094   

         name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
2204402          22   0.532289        11              1     0.814815   

         name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
2204402    0.407407   0.423077          0.0       0.12766           27.0   

         address_jaro  address_lcs  address_len_diff  address_nleven  \
2204402      0.453128          4.0                13             0.9   

         address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
2204402       0.235294      0.133333        1.0         0.0        1.0   

         city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
2204402      11.0              0  