In [1]:
## Imports
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import time
import random
import Levenshtein
import difflib
import multiprocessing
import pandas as pd
from collections import Counter

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import numpy as np
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

from fuzzywuzzy import fuzz
from fuzzywuzzy.fuzz import WRatio, partial_ratio, QRatio, token_set_ratio, token_sort_ratio, partial_token_sort_ratio

In [2]:
## Parameters
# When True, the data-load cell works on a 10,000-row random sample instead of the full file.
is_debug = False
# Seed handed to seed_everything() and to the debug sampling.
SEED = 2022
# Number of nearest neighbours requested from recall_knn().
num_neighbors = 25
# Number of id chunks the feature-generation loop is split into.
num_split = 3

# recall_simple() keeps a candidate only if its hit count is strictly greater than this.
threshold = 2

# Columns that receive pairwise string-similarity features in add_features().
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone', 'categories', 'country']
# Columns that are TF-IDF vectorised; add_features() derives the `<col>_sim` feature from them.
vec_columns = ['name', 'categories', 'address', 'state', 'url', 'country']

# Columns used for exact-value recall in recall_simple().
# NOTE(review): 'address' is listed twice, so an address match counts as two hits
# against `threshold` -- confirm this weighting is intentional and not a typo.
recall_columns = ['name', 'address', 'categories', 'address', 'phone']

def seed_everything(seed):
    """Seed the global `random` and NumPy generators and export PYTHONHASHSEED.

    seed: value passed to both generators; its string form is written to the
    ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
# Seed the global generators once, before any sampling or splitting below.
seed_everything(SEED)

In [3]:
%load_ext Cython

In [4]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Only two rows of the dynamic-programming table are kept alive at a time,
    so memory is O(len(T)) instead of O(len(S) * len(T)).  This function is
    called once per candidate pair and per column, so the per-call allocation
    matters.

    S, T: the two strings to compare (either may be empty, giving 0).
    Returns a Python int.
    """
    cdef int i, j
    cdef int n = len(S)
    cdef int m = len(T)
    # prev[j] = LCS length of S[:i] and T[:j]; cur is the row for S[:i + 1].
    cdef list prev = [0] * (m + 1)
    cdef list cur
    for i in range(n):
        cur = [0] * (m + 1)
        for j in range(m):
            cur[j + 1] = max(prev[j] + (S[i] == T[j]), cur[j], prev[j + 1])
        prev = cur
    return prev[m]

In [5]:
def recall_simple(df, threshold):
    """Generate candidate pairs from exact, case-insensitive value matches.

    For every column named in the module-level ``recall_columns`` an inverted
    index ``lower-cased value -> set of ids`` is built.  Each row then collects
    the ids that share its value in each of those columns; an id that is hit
    strictly more than ``threshold`` times becomes a candidate for that row.
    A column that appears more than once in ``recall_columns`` contributes one
    hit per appearance.

    Parameters
    ----------
    df : pd.DataFrame
        Needs an ``id`` column plus every column in ``recall_columns``; the
        recall columns are accessed through ``.str``.
    threshold : int
        Minimum hit count (exclusive) for a pair to be kept.

    Returns
    -------
    pd.DataFrame
        De-duplicated pairs with columns ``id`` and ``match_id``.
    """
    # Inverted index per distinct column.  Grouping the id Series by the
    # lower-cased values avoids assigning into a slice of `df`; missing values
    # are dropped by groupby and therefore never become keys.
    val2id_d = {}
    for col in dict.fromkeys(recall_columns):
        val2id_d[col] = df['id'].groupby(df[col].str.lower()).apply(set).to_dict()

    cus_ids = []
    match_ids = []
    # 'null' marks a missing value; the last position of every row is the id.
    for vals in tqdm(df[recall_columns + ['id']].fillna('null').values):
        cus_id = vals[-1]

        hit_count = Counter()
        for col, val in zip(recall_columns, vals):
            if val != 'null':
                hit_count.update(val2id_d[col][val.lower()])

        match_id = [k for k, v in hit_count.items() if v > threshold]
        cus_ids.extend([cus_id] * len(match_id))
        match_ids.extend(match_id)

    train_df = pd.DataFrame({'id': cus_ids, 'match_id': match_ids}).drop_duplicates()
    del cus_ids, match_ids

    num_data = len(train_df)
    num_ids = train_df['id'].nunique()
    # Guard the ratio: with no surviving pair the original divided 0 by 0.
    num_data_per_id = num_data / num_ids if num_ids else 0.0
    print('Num of data: %s' % num_data)
    print('Num of data per id: %s' % num_data_per_id)

    return train_df

In [6]:
def _neighbors_to_pairs(ids, dists, nears, dist_col, rank_col):
    """Flatten ``kneighbors`` output into one row per (query id, neighbour rank).

    Rows are ordered rank-major: every query's 0-th neighbour first, then every
    query's 1-st neighbour, and so on.
    """
    n_rows, n_ranks = nears.shape
    return pd.DataFrame({
        'id': np.tile(ids, n_ranks),
        'match_id': ids[nears.T.ravel()],
        dist_col: dists.T.ravel(),
        rank_col: np.repeat(np.arange(n_ranks, dtype = np.int64), n_rows),
    })


def recall_knn(df, Neighbors = 10):
    """Generate candidate pairs from coordinate nearest neighbours.

    Two searches are run on min-max scaled latitude/longitude: one inside each
    country group (haversine metric) and one over all rows (the estimator's
    default metric).  Their results are outer-merged on ``id``/``match_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Needs ``id``, ``country``, ``latitude`` and ``longitude``.  The frame
        is not modified; scaling happens on a private copy.
    Neighbors : int
        Neighbours requested per row; capped at the number of available rows
        for each search.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``match_id``, ``kdist``, ``kneighbors``,
        ``kdist_country``, ``kneighbors_country``.  A pair found by only one
        of the two searches has NaN in the other search's columns.
    """
    print('Start knn grouped by country')

    coord_cols = ['latitude', 'longitude']
    # Copy the needed columns so the caller's coordinates are not overwritten
    # with their scaled values.
    geo = df[['id', 'country'] + coord_cols].copy()
    geo[coord_cols] = MinMaxScaler().fit_transform(geo[coord_cols])

    # NOTE(review): scikit-learn documents the 'haversine' metric as taking
    # [lat, lon] in radians, while the coordinates here are min-max scaled to
    # [0, 1] -- confirm this is intended.
    train_df_country = []
    for country, country_df in tqdm(geo.groupby('country')):
        neighbors = min(len(country_df), Neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                    metric = 'haversine',
                                    n_jobs = -1)

        # The regressor is only used for its neighbour search; the target
        # values passed to fit() are never read back.
        knn.fit(country_df[coord_cols], np.arange(len(country_df)))
        dists, nears = knn.kneighbors(country_df[coord_cols],
                                        return_distance = True)

        train_df_country.append(_neighbors_to_pairs(country_df['id'].values,
                                                    dists, nears,
                                                    'kdist_country',
                                                    'kneighbors_country'))
    train_df_country = pd.concat(train_df_country)

    print('Start knn')
    # Cap the request like the per-country search does; asking for more
    # neighbours than there are rows makes kneighbors() raise.
    neighbors = min(len(geo), Neighbors)
    knn = NearestNeighbors(n_neighbors = neighbors)
    knn.fit(geo[coord_cols])
    dists, nears = knn.kneighbors(geo[coord_cols])

    train_df = _neighbors_to_pairs(geo['id'].values, dists, nears,
                                   'kdist', 'kneighbors')
    train_df = train_df.merge(train_df_country,
                                 on = ['id', 'match_id'],
                                 how = 'outer')
    del train_df_country

    return train_df

In [7]:
def add_features(df):
    """Add pairwise text-similarity features for every column in ``feat_columns``.

    Relies on notebook globals: ``data`` (indexed by id), ``tfidf_d`` (TF-IDF
    matrix per column in ``vec_columns``), ``id2index_d`` (id -> row position
    in those matrices) and the Cython ``LCS`` helper.

    For each column the following are appended, in this order:
    ``<col>_sim`` (only for ``vec_columns``), ``<col>_gesh``, ``<col>_leven``,
    ``<col>_jaro``, ``<col>_lcs`` and, except for 'phone' and 'zip',
    ``<col>_len_diff``, ``<col>_nleven``, ``<col>_nlcsk``, ``<col>_nlcs``.
    The four string metrics are NaN when either side is missing.

    Parameters
    ----------
    df : pd.DataFrame
        Candidate pairs with ``id`` and ``match_id`` columns; columns are
        added to this frame.

    Returns
    -------
    pd.DataFrame
        ``df`` with the feature columns added.
    """
    ids = df['id'].values
    match_ids = df['match_id'].values

    # Row positions in the TF-IDF matrices are the same for every column, so
    # look them up once instead of once per vectorised column.
    indexs = [id2index_d[i] for i in ids]
    match_indexs = [id2index_d[i] for i in match_ids]

    for col in tqdm(feat_columns):
        if col in vec_columns:
            tv_fit = tfidf_d[col]
            # Row-wise sum of the element-wise product of the two row sets.
            pair_sim = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1)
            df[f'{col}_sim'] = np.asarray(pair_sim).ravel()

        # Select the single column while indexing, so the other columns of
        # `data` are not copied for every pair.  astype(str) turns missing
        # values into the literal 'nan'.
        col_values = data.loc[ids, col].values.astype(str)
        matcol_values = data.loc[match_ids, col].values.astype(str)

        geshs = []
        levens = []
        jaros = []
        lcss = []
        for s, match_s in zip(col_values, matcol_values):
            if s == 'nan' or match_s == 'nan':
                geshs.append(np.nan)
                levens.append(np.nan)
                jaros.append(np.nan)
                lcss.append(np.nan)
                continue
            geshs.append(difflib.SequenceMatcher(None, s, match_s).ratio())
            levens.append(Levenshtein.distance(s, match_s))
            jaros.append(Levenshtein.jaro_winkler(s, match_s))
            lcss.append(LCS(str(s), str(match_s)))

        df[f'{col}_gesh'] = geshs
        df[f'{col}_leven'] = levens
        df[f'{col}_jaro'] = jaros
        df[f'{col}_lcs'] = lcss

        if col not in ['phone', 'zip']:
            # Lengths are taken from the stringified values, so a missing
            # value counts as len('nan') == 3, exactly as before.  They are
            # kept as local Series rather than temporary columns.
            col_len = pd.Series(np.char.str_len(col_values), index = df.index)
            matcol_len = pd.Series(np.char.str_len(matcol_values), index = df.index)

            df[f'{col}_len_diff'] = (col_len - matcol_len).abs()
            df[f'{col}_nleven'] = df[f'{col}_leven'] / np.maximum(col_len, matcol_len)

            df[f'{col}_nlcsk'] = df[f'{col}_lcs'] / matcol_len
            df[f'{col}_nlcs'] = df[f'{col}_lcs'] / col_len

            del col_len, matcol_len
            gc.collect()

    return df

In [8]:
def add_fuzz_features(df):
    """Add fuzzywuzzy similarity scores for 'name', 'categories' and 'address'.

    Reads the notebook global ``data`` (indexed by id).  For each of the three
    columns, five columns are appended to ``df``: ``<col>_w_ratio``,
    ``<col>_partial_ratio``, ``<col>_tokenset_ratio``,
    ``<col>_tokensort_ratio`` and ``<col>_fuzz_power`` (mean of the other
    four).  All five are NaN when either side of the pair is missing.

    df: candidate pairs with ``id`` and ``match_id`` columns.
    Returns ``df`` with the new columns.
    """
    score_names = ('w_ratio', 'partial_ratio', 'tokenset_ratio',
                   'tokensort_ratio', 'fuzz_power')

    for col in tqdm(['name', 'categories', 'address']):
        # astype(str) turns missing values into the literal 'nan'.
        own_values = data.loc[df['id'], col].values.astype(str)
        other_values = data.loc[df['match_id'], col].values.astype(str)

        scores = {score: [] for score in score_names}
        for own, other in zip(own_values, other_values):
            if own == 'nan' or other == 'nan':
                for score in score_names:
                    scores[score].append(np.nan)
                continue

            # NOTE(review): the scorers below receive lists of lower-cased
            # tokens rather than strings -- confirm that is what is intended.
            own_tokens = str(own).lower().split()
            other_tokens = str(other).lower().split()

            w = WRatio(own_tokens, other_tokens)
            partial = partial_ratio(own_tokens, other_tokens)
            token_set = token_set_ratio(own_tokens, other_tokens)
            token_sort = token_sort_ratio(own_tokens, other_tokens)
            power = (w + partial + token_set + token_sort) / 4.

            for score, value in zip(score_names, (w, partial, token_set, token_sort, power)):
                scores[score].append(value)

        for score in score_names:
            df[f'{col}_{score}'] = scores[score]

        gc.collect()

    return df

In [9]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the ``id`` column to the ``point_of_interest`` on the same row."""
    return {
        entry_id: poi
        for entry_id, poi in zip(input_df['id'], input_df['point_of_interest'])
    }

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` value to the set of ``id`` values sharing it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    Reads the notebook globals ``id2poi`` and ``poi2ids`` for the ground
    truth.  ``input_df`` needs an ``id`` column and a ``matches`` column of
    whitespace-separated ids.
    """
    id_values = input_df['id'].to_numpy()
    match_strings = input_df['matches'].to_numpy()

    ious = []
    for id_str, matches in zip(id_values, match_strings):
        truth = poi2ids[id2poi[id_str]]
        predicted = set(matches.split())
        ious.append(len(truth & predicted) / len(truth | predicted))

    return np.array(ious).mean()

def analysis(df):
    """Print row count, distinct ids, distinct POIs and the mean number of ids per POI."""
    n_rows = len(df)
    print('Num of data: %s' % n_rows)

    n_ids = df['id'].nunique()
    print('Num of unique id: %s' % n_ids)

    n_pois = df['point_of_interest'].nunique()
    print('Num of unique poi: %s' % n_pois)

    # Number of ids attached to each POI, averaged over POIs.
    ids_per_poi = df.groupby('point_of_interest')['id'].count()
    print('Mean num of unique poi: %s' % ids_per_poi.mean())

In [10]:
## Data load
# Directory holding train.csv; the trailing comment keeps an alternative input path.
data_root = '../src/data/raw/'  #'../input/foursquare-location-matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))

# Debug mode: continue with a reproducible 10,000-row sample and a fresh 0..n-1 index.
if is_debug:
    data = data.sample(n = 10000, random_state = SEED)
    data = data.reset_index(drop = True)

In [11]:
## Data split
# Two folds grouped by point_of_interest, so all rows of one POI land in the same fold.
kf = GroupKFold(n_splits=2)
# NOTE(review): split() yields positional row indices while .loc is label-based,
# so this relies on `data` still having its default 0..n-1 index at this point.
for i, (trn_idx, val_idx) in enumerate(kf.split(data, data['point_of_interest'], data['point_of_interest'])):
    data.loc[val_idx, 'set'] = i

print('Num of train data: %s' % len(data))
print(data['set'].value_counts())

Num of train data: 1138812
1.0    569406
0.0    569406
Name: set, dtype: int64


In [12]:
# Fold 0 becomes the validation half, fold 1 the training half.
valid_data = data[data['set'] == 0]
train_data = data[data['set'] == 1]

print('Train data: ')
analysis(train_data)
print('Valid data: ')
analysis(valid_data)

train_poi = train_data['point_of_interest'].unique().tolist()
valid_poi = valid_data['point_of_interest'].unique().tolist()

# POIs present in both halves; an empty set is printed when there is no overlap.
print(set(train_poi) & set(valid_poi))

train_ids = train_data['id'].unique().tolist()
valid_ids = valid_data['id'].unique().tolist()
      
# Ids present in both halves; an empty set is printed when there is no overlap.
print(set(train_ids) & set(valid_ids))

Train data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369987
Mean num of unique poi: 1.5389892077289202
Valid data: 
Num of data: 569406
Num of unique id: 569406
Num of unique poi: 369985
Mean num of unique poi: 1.5389975269267673
set()
set()


In [13]:
# First training rows, transposed so that every field is shown as a row.
train_data.head().T

Unnamed: 0,0,10,11,12,13
id,E_000001272c6c5d,E_00009ab517afac,E_0000c362229d93,E_0000c566a81ea1,E_0000d9e584ed9f
name,Café Stad Oudenaarde,Starbucks,Coffee Cat,つじ田 味噌の章,Signature Properties Savannah
latitude,50.86,26.3052,7.08222,35.6948,32.0126
longitude,3.6342,50.1294,125.61,139.767,-81.1132
address,Abdijstraat,Ibis Avenue,F. Torres St.,神田小川町1-1,100 Commercial Ct Ste C
city,Nederename,Dhahran,Davao City,千代田区,Savannah
state,Oost-Vlaanderen,Ash Sharqiyah,Davao Region,東京都,GA
zip,9700,34465,8000,101-0052,31406
country,BE,SA,PH,JP,US
url,,,,https://tsukemen-tsujita.com,http://www.oursignatureproperties.com


In [14]:
# Collect the id lists of both halves in one dict and save it to disk.
tv_ids_d = {}
tv_ids_d['train_ids'] = train_ids
tv_ids_d['valid_ids'] = valid_ids

np.save('tv_ids_d.npy', tv_ids_d)

# The two half frames are no longer needed; only their id lists are used below.
del train_data, valid_data
gc.collect()

# Restrict `data` to the validation ids (in valid_ids order) and restore a 0..n-1 index.
data = data.set_index('id')
data = data.loc[tv_ids_d['valid_ids']]
data = data.reset_index()

In [15]:
# Rows and columns left after restricting `data` to the validation ids.
data.shape

(569406, 14)

In [16]:
# First rows of the restricted frame, transposed so that every field is shown as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
id,E_000002eae2a589,E_000007f24ebc95,E_000008a8ba4f48,E_00001d92066153,E_000023d8f4be44
name,Carioca Manero,ร้านตัดผมการาเกด,Turkcell,Restaurante Casa Cofiño,Island Spa
latitude,-22.9072,13.7808,37.8445,43.3382,14.519
longitude,-43.1782,100.485,27.8442,-4.32682,121.019
address,,,Adnan Menderes Bulvarı,,"5th Flr, Newport Mall, Resorts World Manila"
city,,,,Caviedes,Pasay City
state,,,,Cantabria,Metro Manila
zip,,,,,
country,BR,TH,TR,ES,PH
url,,,,,


In [17]:
# Map each id to its index label in `data`; add_features() uses it to pick rows
# of the TF-IDF matrices, which are built from `data` in the same row order.
id2index_d = dict(zip(data['id'].values, data.index))

# One TF-IDF matrix per text column; missing values are replaced by the literal 'nan'.
tfidf_d = {}
for col in vec_columns:
    tfidf = TfidfVectorizer()
    tv_fit = tfidf.fit_transform(data[col].fillna('nan'))
    tfidf_d[col] = tv_fit

In [18]:
## Train data generated by Simple recall & knn recall
# Candidate pairs from exact value matches, then from coordinate nearest neighbours.
train_data_simple = recall_simple(data, threshold)
train_data = recall_knn(data, num_neighbors)

print('train data by knn: %s' % len(train_data))
# Union of both candidate sets; recall_simple() only returns id/match_id, so
# pairs found by it alone get NaN in the knn distance and rank columns.
train_data = train_data.merge(train_data_simple,
                             on = ['id', 'match_id'],
                             how = 'outer')
del train_data_simple
gc.collect()

  0%|          | 0/569406 [00:00<?, ?it/s]

Num of data: 1629646
Num of data per id: 4.371133522879674
Start knn grouped by country


  0%|          | 0/209 [00:00<?, ?it/s]

Start knn
train data by knn: 15335108


0

In [19]:
data = data.set_index('id')

# Look the POI up through the single column: indexing the whole frame first
# (data.loc[ids]['point_of_interest']) copies every column for every pair.
poi_by_id = data['point_of_interest']
poi = poi_by_id.loc[train_data['id'].values].values
match_poi = poi_by_id.loc[train_data['match_id'].values].values

# A pair is positive when both ids belong to the same point of interest.
train_data['label'] = np.array(poi == match_poi, dtype = np.int8)
del poi, match_poi, poi_by_id
gc.collect()

print('Num of unique id: %s' % train_data['id'].nunique())
print('Num of train data: %s' % len(train_data))
print('Pos rate: %s' % train_data['label'].mean())

Num of unique id: 569406
Num of train data: 16523029
Pos rate: 0.06196739108791736


In [20]:
# Spot-check the candidates and labels generated for a single id.
train_data[train_data.id=='E_aedb9fd786db78']

Unnamed: 0,id,match_id,kdist,kneighbors,kdist_country,kneighbors_country,label
388994,E_aedb9fd786db78,E_aedb9fd786db78,0.0,0.0,0.0,0.0,1
958400,E_aedb9fd786db78,E_bb7f3126ac9fbb,8e-06,1.0,5e-06,1.0,0
1527806,E_aedb9fd786db78,E_9fe25a8d6808f2,1e-05,2.0,7e-06,2.0,0
2097212,E_aedb9fd786db78,E_f41cf6fccd3214,1e-05,3.0,8e-06,4.0,1
2666618,E_aedb9fd786db78,E_439232714b3cab,1.1e-05,4.0,7e-06,3.0,0
3236024,E_aedb9fd786db78,E_4ea2d39771a6b3,1.3e-05,5.0,1.1e-05,9.0,0
3805430,E_aedb9fd786db78,E_9a9089b7fc56e3,1.3e-05,6.0,1.1e-05,8.0,0
4374836,E_aedb9fd786db78,E_3187838ce1f611,1.3e-05,7.0,1e-05,5.0,0
4944242,E_aedb9fd786db78,E_72d4c7a8a79d07,1.4e-05,8.0,1.1e-05,11.0,0
5513648,E_aedb9fd786db78,E_2fb2b30d0fec65,1.4e-05,9.0,1.1e-05,12.0,0


In [21]:
## Eval
# Score the candidate set itself: the IoU reached if exactly the candidate
# pairs labelled 1 (plus each id itself) were returned as matches.
data = data.reset_index()

# Ground-truth lookups read by get_score().
id2poi = get_id2poi(data)
poi2ids = get_poi2ids(data)

# Start with every id matching itself ...
eval_df = pd.DataFrame()
eval_df['id'] = data['id'].unique().tolist()
eval_df['match_id'] = eval_df['id']
print('Unique id: %s' % len(eval_df))

# ... then add every candidate pair whose label is 1.
eval_df_ = train_data[train_data['label'] == 1][['id', 'match_id']]
eval_df = pd.concat([eval_df, eval_df_])

# One row per id with its de-duplicated matches joined by spaces.
eval_df = eval_df.groupby('id')['match_id'].\
                        apply(list).reset_index()
eval_df['matches'] = eval_df['match_id'].apply(lambda x: ' '.join(set(x)))
print('Unique id: %s' % len(eval_df))

iou_score = get_score(eval_df)
print('IoU score: %s' % iou_score)

Unique id: 569406
Unique id: 569406
IoU score: 0.9301209613791936


In [22]:
# Initial values for the processed-row counter and the chunk cursor used by the feature loop.
count = 0
start_row = 0

# add_features() / add_fuzz_features() look rows up by id, so index `data` by it.
data = data.set_index('id')
unique_id = train_data['id'].unique().tolist()
# Ids per chunk (integer division).
num_split_id = len(unique_id) // num_split

In [23]:
# Ids per chunk, number of distinct ids, and number of chunks.
num_split_id, len(unique_id), num_split

(189802, 569406, 3)

In [24]:
## Add features
# The candidate pairs are processed in `num_split` chunks of ids; each chunk is
# written to its own CSV.
# Reset the cursor and the row counter here so that re-running this cell starts
# from the first id again instead of slicing past the end of `unique_id` and
# overwriting the CSVs with empty or incorrect chunks.
start_row = 0
count = 0

for k in range(1, num_split + 1):
    print('Current split: %s' % k)
    end_row = start_row + num_split_id
    # The last chunk also takes the ids left over by the integer division.
    if k < num_split:
        cur_id = unique_id[start_row : end_row]
    else:
        cur_id = unique_id[start_row: ]
    cur_data = train_data[train_data['id'].isin(cur_id)]

    cur_data = add_features(cur_data)
    print(cur_data.shape)
    cur_data = add_fuzz_features(cur_data)
    print(cur_data.shape)
    print(cur_data.sample(1))

    # NOTE(review): the file name says '15n' while num_neighbors is 25 in the
    # parameters cell -- confirm the name is not stale.
    cur_data.to_csv('../src/data/processed/valid_data_sr_scaled_15n_%s.csv' % k, index = False)
    start_row = end_row
    count += len(cur_data)

    del cur_data
    gc.collect()

print(count)

Current split: 1


  0%|          | 0/9 [00:00<?, ?it/s]

(5493835, 77)


  0%|          | 0/3 [00:00<?, ?it/s]

(5493835, 92)
                       id          match_id     kdist  kneighbors  \
5738072  E_13d3a95ba261cf  E_83739522d5bfec  0.000006        10.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
5738072       0.000005                 8.0      0       0.0        0.0   

         name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
5738072           9        0.0         0              6          1.0   

         name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
5738072         0.0        0.0          0.0      0.323529           32.0   

         address_jaro  address_lcs  address_len_diff  address_nleven  \
5738072      0.540126         11.0                12             0.8   

         address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
5738072       0.392857         0.275        1.0         0.0        1.0   

         city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
5738072       5.0              0  

  0%|          | 0/9 [00:00<?, ?it/s]

(5523338, 77)


  0%|          | 0/3 [00:00<?, ?it/s]

(5523338, 92)
                       id          match_id     kdist  kneighbors  \
7169889  E_9791b895395614  E_5887bd54169e71  0.000027        12.0   

         kdist_country  kneighbors_country  label  name_sim  name_gesh  \
7169889        0.00002                10.0      0       0.0   0.121212   

         name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
7169889          20   0.441304         3             13     0.869565   

         name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
7169889         0.3   0.130435          0.0           NaN            NaN   

         address_jaro  address_lcs  address_len_diff  address_nleven  \
7169889           NaN          NaN                 4             NaN   

         address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
7169889            NaN           NaN        NaN         NaN        NaN   

         city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
7169889       NaN              4  

  0%|          | 0/9 [00:00<?, ?it/s]

(5505856, 77)


  0%|          | 0/3 [00:00<?, ?it/s]

(5505856, 92)
                        id          match_id  kdist  kneighbors  \
14306560  E_f8562bb26dbb97  E_9885c86b506819    NaN         NaN   

          kdist_country  kneighbors_country  label  name_sim  name_gesh  \
14306560       0.000186                12.0      0       0.0   0.064516   

          name_leven  name_jaro  name_lcs  name_len_diff  name_nleven  \
14306560          25   0.358025         2             23     0.925926   

          name_nlcsk  name_nlcs  address_sim  address_gesh  address_leven  \
14306560         0.5   0.074074          0.0      0.344828           13.0   

          address_jaro  address_lcs  address_len_diff  address_nleven  \
14306560      0.570261          5.0                 5        0.764706   

          address_nlcsk  address_nlcs  city_gesh  city_leven  city_jaro  \
14306560       0.294118      0.416667        1.0         0.0        1.0   

          city_lcs  city_len_diff  city_nleven  city_nlcsk  city_nlcs  \
14306560       6.0         